def create_stations(file):
    feat_names, data = create_data_set.load_csv(
        file, True, dtype='str', delim=',', num_rows=1000000000)
    names = data[:, array_functions.find_first_element(feat_names, 'station_id')]
    locs = data[:, array_functions.find_set(feat_names, ['long', 'lat'])]
    return names, locs.astype(np.float)

def get_zipcode_wages():
    income_fields, income_data = create_data_set.load_csv(
        file_name_income, dtype='string', return_data_frame=True)
    zipcode = income_data.ZipCode.values.astype(np.float)
    agi = income_data.AdjustedGrossIncome.values.astype('string')
    num_returns = income_data.NumberOfReturns.values.astype('string')
    # The stride of 8 suggests the file repeats each zipcode once per AGI
    # bracket; keep one row per zipcode, starting at the first real
    # zipcode (90001).
    i = find_first_element(zipcode, 90001)
    I = np.arange(i, zipcode.shape[0], 8)
    zipcode = zipcode[I].astype(np.int)
    agi = agi[I].astype(np.float)
    num_returns = num_returns[I].astype(np.float)
    '''
    I = agi < 5000000
    zipcode = zipcode[I]
    agi = agi[I]
    num_returns = num_returns[I]
    '''
    mean_income = agi / num_returns
    # Require a minimum number of returns and drop extreme-outlier means.
    I = (num_returns > 50) & (mean_income < np.percentile(mean_income, 99.6))
    d = dict(zip(zipcode[I], mean_income[I]))
    return d

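# A minimal usage sketch for get_zipcode_wages (hypothetical: assumes
# file_name_income points at the IRS income CSV loaded above):
if __name__ == '__main__':
    wages_by_zip = get_zipcode_wages()
    print 'zipcodes with usable wage data: ' + str(len(wages_by_zip))
    print 'mean income for 90001: ' + str(wages_by_zip.get(90001))
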
# Fragment: the enclosing loop header is missing from this snippet; it is
# assumed to iterate over the input weather files, e.g.
# `for i, file in enumerate(file_names):`, based on the loop index `i` and
# the `continue` statements below.
for i, file in enumerate(file_names):
    feat_names_curr, data_curr = create_data_set.load_csv(
        file, True, dtype='str', delim=',', num_rows=1000000000)
    inds_to_use = np.asarray([j for j in range(feat_names_curr.size)
                              if feat_names_curr[j] in feats_to_keep])
    assert inds_to_use.size == len(feats_to_keep)
    data_curr = data_curr[:, inds_to_use]
    feat_names_curr = feat_names_curr[inds_to_use]
    if i == 0:
        feat_names = feat_names_curr
        data = data_curr
        continue
    # Drop stations that already appeared in previously loaded files.
    unique_stations = np.unique(
        data[:, find_first_element(feat_names, 'STATION')].astype(np.str))
    curr_stations = data_curr[:, find_first_element(feat_names, 'STATION')].astype(np.str)
    to_remove = array_functions.false(data_curr.shape[0])
    for s in np.unique(curr_stations):
        if s not in unique_stations:
            continue
        print 'Found repeated station, removing: ' + s
        to_remove = to_remove | (curr_stations == s)
    data = np.vstack((data, data_curr[~to_remove, :]))

y_names = ['TAVG', 'TMIN', 'TMAX', 'PRCP']
y_inds = []
for name in y_names:
    y_inds.append(array_functions.find_first_element(feat_names, name))
date_strs = data[:, find_first_element(feat_names, 'DATE')]
prev = ''
date_str_to_idx = dict()

def create_stations(file):
    # Reconstructed header: this snippet began mid-function; the first two
    # lines match the create_stations definition earlier in this section.
    feat_names, data = create_data_set.load_csv(
        file, True, dtype='str', delim=',', num_rows=1000000000)
    names = data[:, array_functions.find_first_element(feat_names, 'station_id')]
    locs = data[:, array_functions.find_set(feat_names, ['long', 'lat'])]
    return names, locs.astype(np.float)

station_names, station_locs = create_stations(station_file_name)
feat_names, data = create_data_set.load_csv(
    file_name, True, dtype='str', delim=',', num_rows=1000000000)
y_names = ['tripduration']
y_inds = array_functions.find_set(feat_names, y_names).nonzero()[0]
date_strs = data[:, find_first_element(feat_names, 'starttime')]
date_str_to_idx = dict()
date_ids = np.zeros(data.shape[0])
for i, date_str in enumerate(date_strs):
    date_obj = to_date(date_str)
    date_str_to_idx[date_str] = date_obj.toordinal()
    date_ids[i] = date_obj.toordinal()
date_ids = date_ids.astype(np.int)
y = data[:, y_inds].astype(np.float)
#y_sub = y[I, :]
#series_id = data[:, find_first_element(feat_names, 'Site Num')].astype(np.int)
a1 = data[:, find_first_element(feat_names, 'from_station_id')].astype(np.str)
a2 = data[:, find_first_element(feat_names, 'to_station_id')].astype(np.str)
#series_id = np.asarray([a + '-' + b for a,b in zip(a1,a2)])

def to_date(date_str):
    # Assumed reconstruction: the function header and the split into `a`
    # are missing from this snippet; 'Date Local' values are assumed to be
    # 'YYYY-MM-DD' strings.
    a = date_str.split('-')
    year, month, day = [int(s) for s in a]
    d = datetime.date(year, month, day)
    return d

feat_names, data = create_data_set.load_csv(
    file_name, True, dtype='str', delim=',',
    #num_rows=40000
    num_rows=100000000000)
y_names = [s + ' Mean' for s in ['NO2', 'O3', 'SO2', 'CO']]
y_inds = []
for name in y_names:
    y_inds.append(array_functions.find_first_element(feat_names, name))
to_keep = array_functions.false(data.shape[0])
date_strs = data[:, find_first_element(feat_names, 'Date Local')]
prev = ''
date_str_to_idx = dict()
date_ids = np.zeros(data.shape[0])
# Keep only the first row of each run of repeated dates.
for i, date_str in enumerate(date_strs):
    date_obj = to_date(date_str)
    date_str_to_idx[date_str] = date_obj.toordinal()
    date_ids[i] = date_obj.toordinal()
    if prev != date_str:
        to_keep[i] = True
    prev = date_str
data = data[to_keep, :]
date_strs = date_strs[to_keep]
# Filter date_ids too, so it stays aligned with data (the original applied
# only the astype here, leaving the arrays mismatched in length).
date_ids = date_ids[to_keep].astype(np.int)

def to_date(date_str):
    # Assumed reconstruction: the function header and the year/month/day
    # extraction are missing from this snippet; kc_house dates look like
    # '20141013T000000', so slicing is a plausible guess.
    year, month, day = date_str[:4], date_str[4:6], date_str[6:8]
    d = date(int(year), int(month), int(day))
    return d

create_geospatial_data = True
split_date = False
file_name = 'kc_house_data.csv'
save_data = True
sampled_size = 1000
feat_names, data = create_data_set.load_csv(file_name, True, dtype='str', delim=',')
y_name = 'price'
y_ind = array_functions.find_first_element(feat_names, y_name)
y = data[:, y_ind].astype(np.float)
y /= 100000
suffix = ''
if create_geospatial_data:
    x_feats = ['long', 'lat']
    x_feat_inds = array_functions.find_set(feat_names, x_feats)
    x = data[:, x_feat_inds]
    x = array_functions.remove_quotes(x)
    x = x.astype(np.float)
    x[:, 0] = array_functions.normalize(x[:, 0])
    x[:, 1] = array_functions.normalize(x[:, 1])
    # Trim the most extreme 1% of locations on each axis.
    I = array_functions.is_in_percentile(x[:, 0], .01, .99)
    I &= array_functions.is_in_percentile(x[:, 1], .01, .99)
    x = x[I, :]

def load_trip_data(file_names, y_names, time_name, loc_names,
                   resolution=np.asarray([20, 20]), plot_data=True):
    resolution = np.asarray(resolution)
    feat_names = None
    data = None
    # Stack all files into one array; every file must share the same header.
    for file_name in file_names:
        curr_feat_names, curr_data = load_csv(
            file_name, True, dtype='str', delim=',', num_rows=1000000000)
        if feat_names is None:
            feat_names = curr_feat_names
            data = curr_data
            continue
        assert (feat_names == curr_feat_names).all()
        data = np.vstack((data, curr_data))
    locs = data[:, array_functions.find_set(feat_names, loc_names)]
    y_inds = None
    if y_names is not None:
        y_inds = array_functions.find_set(feat_names, y_names).nonzero()[0]
        y = data[:, y_inds].astype(np.float)
    else:
        y = np.ones(data.shape[0])
    # Convert timestamps to ordinal day indices starting at zero.
    date_strs = data[:, find_first_element(feat_names, time_name)]
    date_str_to_idx = dict()
    date_ids = np.zeros(data.shape[0])
    for i, date_str in enumerate(date_strs):
        date_obj = to_date(date_str)
        date_str_to_idx[date_str] = date_obj.toordinal()
        date_ids[i] = date_obj.toordinal()
    date_ids = date_ids.astype(np.int)
    min_date_id = date_ids.min()
    max_date_id = date_ids.max()
    num_days = max_date_id - min_date_id + 1
    dates_idx = date_ids - min_date_id
    num_locations = np.prod(resolution)
    trip_counts = np.zeros((num_days, num_locations))
    locs = locs.astype(np.float)
    # Keep the central 40% of locations on each axis, then quantize them
    # onto a resolution[0] x resolution[1] grid.
    p_min = .3
    p_max = .7
    is_in_range = (array_functions.is_in_percentile(locs[:, 0], p_min, p_max) &
                   array_functions.is_in_percentile(locs[:, 1], p_min, p_max))
    locs = locs[is_in_range, :]
    dates_idx = dates_idx[is_in_range]
    x_bins = quantize_loc(locs[:, 0], resolution[0])
    y_bins = quantize_loc(locs[:, 1], resolution[1])
    #array_functions.plot_2d(locs[I,0],locs[I,1])
    xy_bins = list(itertools.product(range(resolution[0]), range(resolution[1])))
    # Count trips per (day, grid cell).
    for x_idx, y_idx in xy_bins:
        is_in_cell = (x_bins == x_idx) & (y_bins == y_idx)
        trips_in_cell = dates_idx[is_in_cell]
        trip_dates, trips_per_date = np.unique(trips_in_cell, return_counts=True)
        bin_idx = bin_to_idx([x_idx, y_idx], resolution)
        trip_counts[trip_dates, bin_idx] = trips_per_date
    #y = trip_counts[[0, 3], :].T
    tuesday_saturday_idx = np.asarray([0, 4])
    first_tuesday_idx = np.asarray([0, 154])
    #y = trip_counts[first_tuesday_idx + 0, :].T
    '''
    y1 = trip_counts[:30,:].sum(0)
    y2 = trip_counts[154:, :].sum(0)
    '''
    # Average the counts for two days of the week within the first 30 days.
    y1 = trip_counts[3:30:7, :].mean(0)
    y2 = trip_counts[4:30:7, :].mean(0)
    y = np.stack((y1, y2), 1)
    #y[y > 100] = 0
    #y[y > 5000] = 0
    #y[y == y.max()] == 0
    y = np.log(y)
    if plot_data:
        array_functions.plot_heatmap(np.asarray(xy_bins), y, sizes=50)
    return np.asarray(xy_bins, dtype=np.float), y, np.asarray(
        [str(xy) for xy in xy_bins])

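# quantize_loc and bin_to_idx are used by load_trip_data but not defined in
# this section. A minimal sketch of plausible implementations, assuming
# uniform bins over each coordinate's range and row-major cell indexing
# (hypothetical, not necessarily the project's actual helpers):
def quantize_loc(v, num_bins):
    # Scale values to [0, 1], then map to integer bins 0..num_bins-1.
    scaled = (v - v.min()) / (v.max() - v.min() + 1e-12)
    return np.minimum((scaled * num_bins).astype(np.int), num_bins - 1)

def bin_to_idx(xy, resolution):
    # Flatten an (x_bin, y_bin) pair into a single column index.
    return xy[0] * resolution[1] + xy[1]
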
def combine_data_sets(x1, y1, x2, y2):
    # Assumed wrapper: this snippet is the tail of a function whose header
    # is missing; the name and parameter list are guesses from the
    # variables used below.
    x = np.vstack((x1, x2))
    y = np.concatenate((y1, y2))
    data_set_ids = np.concatenate((np.zeros(y1.size), np.ones(y2.size)))
    data = data_lib.Data(x, y)
    data.data_set_ids = data_set_ids
    # The original line was a bare `data.is_regression` (no effect);
    # assumed the intent was an assignment.
    data.is_regression = True
    return data

if use_zipcode_data:
    file = 'Zip_Zhvi_AllHomes.csv'
    data_fields, string_data = create_data_set.load_csv(
        file, has_field_names=True, dtype='string')
    zip_code = vec_remove_quotations(string_data[:, 1]).astype(np.int)
    state = vec_remove_quotations(string_data[:, 3])
    #year1_idx = array_functions.find_first_element(data_fields, '1996-04')
    year1_idx = array_functions.find_first_element(data_fields, '2001-01')
    #year1_idx = array_functions.find_first_element(data_fields, '2016-02')
    year2_idx = array_functions.find_first_element(data_fields, '2017-02')
    pricing_data = string_data[:, [year1_idx, year2_idx]]
    pricing_data = vec_replace(pricing_data).astype(np.float)
    zipcode_location_map = get_zipcode_locations()
    locations = np.zeros((zip_code.size, 2))
    for i, z in enumerate(zip_code):
        if z not in zipcode_location_map:
            print 'missing zipcode: ' + str(z)
            locations[i, :] = np.nan
            continue
        locations[i, :] = zipcode_location_map[z]
    all_states = np.unique(state)
else:
    pass  # Fragment: the else-branch is truncated in this snippet.

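# get_zipcode_locations is used above but not defined in this section. A
# hypothetical sketch, assuming a CSV of zipcode centroids; the file name
# and column names here are assumptions, not the project's actual data:
def get_zipcode_locations():
    fields, rows = create_data_set.load_csv(
        'zipcode_locations.csv', True, dtype='str', delim=',')
    zips = rows[:, array_functions.find_first_element(fields, 'zipcode')].astype(np.int)
    lat = rows[:, array_functions.find_first_element(fields, 'lat')].astype(np.float)
    lon = rows[:, array_functions.find_first_element(fields, 'long')].astype(np.float)
    # Map each zipcode to its (lat, long) centroid.
    return dict(zip(zips, np.stack((lat, lon), 1)))
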
def to_date(date_str):
    # Assumed reconstruction: the header and year/month parsing are missing
    # from this snippet; date strings are 'YYYYMM' (see the field name
    # below), so slice and use the first of the month.
    year = int(date_str[:4])
    month = int(date_str[4:6])
    day = 1
    d = datetime.date(year, month, day)
    return d

feat_names, data = create_data_set.load_csv(
    file_name, True, dtype='str', delim=',',
    #num_rows=40000
    num_rows=100000000000)
y_names = ['Value']
y_inds = []
for name in y_names:
    y_inds.append(array_functions.find_first_element(feat_names, name))
date_strs = data[:, find_first_element(feat_names, 'YYYYMM')]
prev = ''
date_str_to_idx = dict()
date_ids = np.zeros(data.shape[0])
to_keep = array_functions.true(date_strs.shape[0])
for i, date_str in enumerate(date_strs):
    # Month '13' rows appear to be annual totals; skip them along with
    # missing values.
    if date_str[4:] == '13' or data[i, y_inds] == 'Not Available':
        to_keep[i] = False
        continue
    date_obj = to_date(date_str)
    date_str_to_idx[date_str] = date_obj.toordinal()
    date_ids[i] = date_obj.toordinal()
date_ids = date_ids[to_keep]
data = data[to_keep, :]
date_ids = date_ids.astype(np.int)

def to_date(date_str):
    # Assumed reconstruction: the function header and the split into `a`
    # are missing from this snippet; the 'validStart' field is assumed to
    # be a 'YYYY-MM-DD' string.
    a = date_str.split('-')
    year, month, day = [int(s) for s in a]
    d = datetime.date(year, month, day)
    return d

feat_names, data = create_data_set.load_csv(
    file_name, True, dtype='str', delim=',',
    #num_rows=40000
    num_rows=100000000000)
# Drought-severity categories: NONE plus D0-D4.
y_names = ['NONE'] + ['D%d' % i for i in range(5)]
y_inds = []
for name in y_names:
    y_inds.append(array_functions.find_first_element(feat_names, name))
date_strs = data[:, find_first_element(feat_names, 'validStart')]
prev = ''
date_str_to_idx = dict()
date_ids = np.zeros(data.shape[0])
for i, date_str in enumerate(date_strs):
    date_obj = to_date(date_str)
    date_str_to_idx[date_str] = date_obj.toordinal()
    date_ids[i] = date_obj.toordinal()
date_ids = date_ids.astype(np.int)
y = data[:, y_inds].astype(np.float)
#y_sub = y[I, :]
#series_id = data[:, find_first_element(feat_names, 'Site Num')].astype(np.int)
# Fragment: the right-hand side of this concatenation is truncated in the
# snippet.
series_id = data[:, find_first_element(feat_names, 'state')] + '-' + \

import numpy as np

from data_sets import create_data_set
from utility import array_functions
from utility import helper_functions

file_name = 'kc_house_data.csv'
feat_names, data = create_data_set.load_csv(file_name, True, dtype='str', delim=',')
feats_to_clear = ['id', 'date', 'yr_renovated', 'zipcode', 'lat', 'long']
y_name = 'price'
y_ind = array_functions.find_first_element(feat_names, y_name)
y = data[:, y_ind].astype(np.float)
y /= 100000
# Drop the ID/date/location features and the label column itself.
clear_idx = array_functions.find_set(feat_names, feats_to_clear + [y_name])
x = data[:, ~clear_idx]
x = array_functions.remove_quotes(x)
x = x.astype(np.float)
data = (x, y)
helper_functions.save_object('processed_data.pkl', data)
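
# A hedged sketch of reading the saved tuple back, assuming
# helper_functions.save_object is a thin pickle wrapper (an assumption; the
# project may provide its own load_object counterpart):
import pickle

with open('processed_data.pkl', 'rb') as f:
    x, y = pickle.load(f)
print x.shape, y.shape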