def print_cluster_purity(k_means, X, Y, allow_recurse=True):
    """Fit `k_means` on X and report, per cluster, its size fraction and the
    mean/std of the labels Y that fall inside it.

    When a cluster's label mean is ambiguous (in (.2, .8)) and the cluster
    holds more than 20% of the data, the cluster is re-clustered once into 4
    sub-clusters and reported recursively (one level deep only).

    Args:
        k_means: clustering estimator exposing fit_predict/fit/n_clusters
            (e.g. sklearn KMeans).
        X: (n, d) feature matrix.
        Y: (n,) label vector.
        allow_recurse: when True, ambiguous large clusters are split once.
    """
    cluster_inds = k_means.fit_predict(X)
    avg_variance = 0
    for i in range(k_means.n_clusters):
        I = cluster_inds == i
        yi = Y[I]
        # Fraction of all points in cluster i; reuse it below instead of
        # recomputing I.mean() (the original evaluated it three times).
        perc = I.mean()
        print('Cluster ' + str(i) + ': perc=' + str(perc))
        print('Mean: ' + str(yi.mean()))
        print('STD: ' + str(yi.std()))
        # Ambiguous mean label on a large cluster: inspect with a finer split.
        if array_functions.in_range(yi.mean(), .2, .8) and perc > .2 and allow_recurse:
            print('Splitting cluster')
            k_means_sub = KMeans(n_clusters=4)
            Xi = X[I, :]
            Yi = Y[I]
            array_functions.plot_heatmap(Xi, Yi + .5, subtract_min=False)
            k_means_sub.fit(Xi)
            # allow_recurse=False limits the recursion to a single level.
            print_cluster_purity(k_means_sub, Xi, Yi, allow_recurse=False)
        # Accumulate a cluster-size-weighted average of the label std.
        avg_variance += perc * yi.std()
    print('Average STD: ' + str(avg_variance))
sampled_boolean = array_functions.false(I.size)
sampled_boolean[sampled] = True
I &= sampled_boolean
I1 &= I
I2 &= I
'''
# NOTE(review): the ''' above appears to close a triple-quoted (commented-out)
# block whose opening quotes precede this excerpt, which would make the
# subsampling lines above inert - confirm against the original file. The
# flattened source lost the newlines; indentation below is reconstructed.
print 'n1: ' + str(I1.sum())
print 'n2: ' + str(I2.sum())
# Side-by-side heatmaps of the two masked value sets.
fig1 = pl.figure(3)
dot_size = 30
array_functions.plot_heatmap(x[I1, :], y[I1], sizes=dot_size, alpha=1, subtract_min=False, fig=fig1)
pl.title('Values 1')
fig2 = pl.figure(4)
array_functions.plot_heatmap(x[I2, :], y[I2], sizes=dot_size, alpha=1, subtract_min=False, fig=fig2)
pl.title('Values 2')
# Place the two figure windows next to each other on screen.
array_functions.move_fig(fig1, 500, 500, 2000, 100)
array_functions.move_fig(fig2, 500, 500, 2600, 100)
pl.show(block=True)
def load_trip_data(file_names, y_names, time_name, loc_names, resolution=None, plot_data=True):
    """Load trip records from CSV files and aggregate them into daily trip
    counts on a 2-D spatial grid.

    Args:
        file_names: iterable of CSV paths; all files must share one header.
        y_names: label column names, or None to use all-ones labels.
            NOTE(review): the column-derived y is overwritten by the weekly
            means below, so these labels are effectively unused - confirm.
        time_name: column holding the trip date string.
        loc_names: columns holding the (x, y) trip coordinates.
        resolution: grid resolution per axis; defaults to [20, 20].
        plot_data: when True, plot a heatmap of the aggregated values.

    Returns:
        (grid_cells, y, cell_labels): float array of (x, y) bin pairs, the
        (num_cells, 2) matrix of log weekly-mean trip counts, and one string
        label per grid cell.
    """
    # Avoid a mutable (ndarray) default argument; None means "use [20, 20]".
    if resolution is None:
        resolution = np.asarray([20, 20])
    resolution = np.asarray(resolution)
    feat_names = None
    data = None
    for file_name in file_names:
        curr_feat_names, curr_data = load_csv(file_name, True, dtype="str", delim=",", num_rows=1000000000)
        if feat_names is None:
            feat_names = curr_feat_names
            data = curr_data
            continue
        # Every file must agree on the header before rows are stacked.
        assert (feat_names == curr_feat_names).all()
        data = np.vstack((data, curr_data))
    locs = data[:, array_functions.find_set(feat_names, loc_names)]
    if y_names is not None:
        y_inds = array_functions.find_set(feat_names, y_names).nonzero()[0]
        # np.float was removed in NumPy 1.24; the builtin float is equivalent.
        y = data[:, y_inds].astype(float)
    else:
        y = np.ones(data.shape[0])
    date_strs = data[:, find_first_element(feat_names, time_name)]
    date_ids = np.zeros(data.shape[0])
    for i, date_str in enumerate(date_strs):
        # Proleptic-Gregorian ordinal of each trip's date.
        date_ids[i] = to_date(date_str).toordinal()
    date_ids = date_ids.astype(int)
    min_date_id = date_ids.min()
    max_date_id = date_ids.max()
    num_days = max_date_id - min_date_id + 1
    dates_idx = date_ids - min_date_id  # 0-based day index per trip
    num_locations = np.prod(resolution)
    trip_counts = np.zeros((num_days, num_locations))
    locs = locs.astype(float)
    # Keep only trips inside the central (30th-70th percentile) region.
    p_min = 0.3
    p_max = 0.7
    is_in_range = array_functions.is_in_percentile(locs[:, 0], p_min, p_max) & array_functions.is_in_percentile(
        locs[:, 1], p_min, p_max
    )
    locs = locs[is_in_range, :]
    dates_idx = dates_idx[is_in_range]
    x_bins = quantize_loc(locs[:, 0], resolution[0])
    y_bins = quantize_loc(locs[:, 1], resolution[1])
    xy_bins = list(itertools.product(range(resolution[0]), range(resolution[1])))
    for x_idx, y_idx in xy_bins:
        is_in_cell = (x_bins == x_idx) & (y_bins == y_idx)
        trips_in_cell = dates_idx[is_in_cell]
        # Number of trips on each distinct day within this grid cell.
        trip_dates, trips_per_date = np.unique(trips_in_cell, return_counts=True)
        bin_idx = bin_to_idx([x_idx, y_idx], resolution)
        trip_counts[trip_dates, bin_idx] = trips_per_date
    # Weekly means over the first 30 days: every 7th day from day 0 vs day 4.
    # NOTE(review): the other copy of this loader uses trip_counts[3:30:7] for
    # y1 - confirm which start offset is intended.
    y1 = trip_counts[:30:7, :].mean(0)
    y2 = trip_counts[4:30:7, :].mean(0)
    y = np.stack((y1, y2), 1)
    y = np.log(y)  # log-scale counts; cells with zero trips become -inf here
    if plot_data:
        array_functions.plot_heatmap(np.asarray(xy_bins), y, sizes=50)
    return np.asarray(xy_bins, dtype=float), y, np.asarray([str(xy) for xy in xy_bins])
# NOTE(review): fragment of a larger plotting routine; times_series_vals,
# num_days, use_monthly, unique_locs, unique_series_ids and plot_data are
# defined above this excerpt. The flattened source lost its newlines, so the
# indentation below is reconstructed - confirm against the original file.
times_series_vals[times_series_vals < 0] = np.nan  # treat negatives as missing
plot_2d = True
plot_multiple_stations = True
y_to_plot = 3  # index of the value channel to visualize
if plot_data:
    if plot_2d:
        for i in range(num_days):
            if use_monthly:
                y_val = times_series_vals[[i, i+4], :, y_to_plot]
            else:
                y_val = times_series_vals[[i,60+i],:,y_to_plot]
                # 30-day window means starting at day i vs. day i+120; this
                # overwrites the assignment just above.
                y_val1 = times_series_vals[range(i,i+30),:,y_to_plot].mean(0)
                y_val2 = times_series_vals[range(i+120, i + 150), :, y_to_plot].mean(0)
                y_val = np.stack((y_val1, y_val2), 1).T
            array_functions.plot_heatmap(unique_locs,y_val.T,alpha=1,title=None,sizes=None,share_axis=True)
    elif plot_multiple_stations:
        # Plot stations in groups of 10.
        for i in range(0,400, 10):
            is_in_state = np.arange(i,i+10)
            #y_val = times_series_vals[is_in_state, :800, 1].T
            y_val = times_series_vals[:,is_in_state[:], y_to_plot]
            x_val = range(y_val.shape[0])
            #print unique_series_ids[to_use]
            # NOTE(review): this inner loop shadows the outer loop variable i.
            for i, s in enumerate(unique_series_ids[is_in_state]):
                print str(i) + ': ' + s
            array_functions.plot_2d_sub_multiple_y(np.asarray(x_val), y_val, title=None, sizes=10)
    else:
        for i in range(times_series_vals.shape[1]):
            y_val = times_series_vals[:, i, :]
            x_val = np.arange(y_val.shape[0])
            # NOTE(review): the excerpt is cut off mid-statement here.
            if not np.isfinite(y_val).sum(0).all():
# NOTE(review): fragment; unique_series_ids, station_names, station_locs,
# num_days and times_series_vals come from above this excerpt. The three
# triple-quoted regions below are commented-out debug code; indentation is
# reconstructed from the flattened source.
'''
for d in unique_dates:
    times_series_vals[d,i] = y[I[dates_idx == d]].mean()
pass
'''
'''
for j in I:
    print date_strs[j]
'''
'''
print 'num_items: ' + str(I.size)
print 'start: ' + date_strs[I[0]]
print 'end: ' + date_strs[I[-1]]
'''
# Keep only series whose id has a known station location.
# NOTE(review): the loop variable `id` shadows the builtin id().
has_loc = array_functions.false(unique_series_ids.size)
for i, id in enumerate(unique_series_ids):
    has_loc[i] = id in station_names
times_series_vals = times_series_vals[:, has_loc]
unique_series_ids = unique_series_ids[has_loc]
date_idx = 0  # NOTE(review): assigned but never read in this fragment
for i in range(0, num_days, 5):
    x = station_locs
    # Every 28th time step within [i, 120); plotted per station.
    y = times_series_vals[i:120:28, :]
    array_functions.plot_heatmap(x, y.T, title=None, sizes=30)
data = (times_series_vals, unique_series_ids)
helper_functions.save_object('processed_data.pkl', data)
pass
def vis_data():
    """Visualize a saved raw_data.pkl data set, either per-feature scatter
    plots or per-data-set-id heatmaps / gradient images.

    Relies on module-level flags defined outside this excerpt: data_file_dir,
    plot_climate, plot_features, plot_gradients, plot_values, dot_sizes.
    NOTE(review): the flattened source lost its newlines; the nesting below
    (in particular the placement of plt.show) is reconstructed - confirm
    against the original file.
    """
    s = 'data_sets/' + data_file_dir + '/raw_data.pkl'
    data = helper_functions.load_object(s)
    x = data.x
    y = data.y
    titles = ['', '']
    label_idx = [0, 1]  # which data_set_ids to plot
    if plot_climate:
        img_path = 'C:/PythonFramework/far_transfer/figures/climate-terrain.png'
        image = imread(img_path)
        label_idx = [0, 4]
    # Per-data-set titles and label indices.
    if data_file_dir == 'climate-month':
        titles = [
            'Max Temperature Gradient: January',
            'Max Temperature Gradient: April'
        ]
        label_idx = [0, 4]
    elif data_file_dir == 'irs-income':
        titles = ['Income', 'Household Size']
    elif data_file_dir == 'zillow-traffic':
        titles = ['Morning Taxi Pickups', 'Housing Prices']
    elif data_file_dir == 'kc-housing-spatial-floors':
        titles = ['House Prices: 1 Floor', 'House Prices: 2 or More Floors']
    if plot_features:
        # One 2-D scatter per feature column.
        for i in range(data.p):
            xi = x[:, i]
            title = 'Feature Names Missing'
            if data.feature_names is not None:
                title = data.feature_names[i]
            array_functions.plot_2d(xi, y, data_set_ids=data.data_set_ids, title=title)
    else:
        for i, title in zip(label_idx, titles):
            #plt.close()
            I = data.data_set_ids == i
            if plot_gradients or plot_values:
                g, v = estimate_gradients(x, y, I)
                if plot_values:
                    g = v
                    #g = np.log(g)
                    #g -= g.min()
                    #g += g.max()/10.0
                    #g /= g.max()
                    # Min-max normalize the value image (both branches do the
                    # same work; the `pass` statements are leftovers).
                    if data_file_dir == 'zillow-traffic':
                        if i == 0:
                            pass
                        g -= g.min()
                        g /= g.max()
                        #g **= .5
                    else:
                        pass
                        g -= g.min()
                        g /= g.max()
                        #g **= .5
                else:
                    # Gradient image: sqrt-compress only the first data set.
                    if i == 0:
                        g -= g.min()
                        g /= g.max()
                        g = np.sqrt(g)
                    else:
                        g -= g.min()
                        g /= g.max()
                        g **= 1  # NOTE(review): no-op exponent
                #array_functions.plot_heatmap(g, sizes=dot_sizes, fig=fig, title=title)
                fig = plt.figure(i)
                plt.title(title)
                plt.axis('off')
                plt.imshow(g)
                array_functions.move_fig(fig, 750, 400)
                #plt.show(block=False)
            else:
                fig = plt.figure(4)
                array_functions.plot_heatmap(x[I, :], y[I], sizes=dot_sizes, fig=fig, title=title)
                if plot_climate:
                    # Terrain image behind the scatter, in lon/lat extent.
                    plt.imshow(image, zorder=0, extent=[-90, -78, 33.5, 38])
                array_functions.move_fig(fig, 1400, 600)
            # NOTE(review): blocking show per iteration - confirm placement.
            plt.show(block=True)
    pass
# NOTE(review): fragment of a zipcode-processing script; zipcodes,
# zipcode_income, zipcode_locs and zipcode_housing are defined above this
# excerpt. Indentation reconstructed from the flattened source.
zipcode_array = np.zeros(len(zipcodes))
income_array = np.zeros(len(zipcodes))
locs = np.zeros((len(zipcodes), 2))
households = np.zeros(len(zipcodes))
# Flatten the per-zipcode dicts into aligned arrays.
for i, key in enumerate(zipcodes):
    zipcode_array[i] = key
    income_array[i] = zipcode_income[key]
    locs[i] = zipcode_locs[key]
    households[i] = zipcode_housing[key]
income_array = np.log(income_array)  # log-scale income before normalizing
income_array = array_functions.normalize(income_array)
households = array_functions.normalize(households)
locs[:, 0] = array_functions.normalize(locs[:, 0])
locs[:, 1] = array_functions.normalize(locs[:, 1])
#array_functions.plot_heatmap(locs, 10*income_array, sizes=50)
#array_functions.plot_heatmap(locs, households, sizes=50)
y = np.stack((income_array, households), 1)
print 'Num Used: ' + str(y.shape[0])
array_functions.plot_heatmap(locs, y, sizes=100, share_axis=True)
# Subsample 400 zipcodes without replacement before saving.
I = np.random.choice(y.shape[0], 400, replace=False)
data = (locs[I, :], y[I], zipcode_array[I])
helper_functions.save_object('processed_data.pkl', data)
pass
for d in unique_dates:
    times_series_vals[d,i] = y[I[dates_idx == d]].mean()
pass
'''
# NOTE(review): this fragment has an odd number of triple-quotes, so the '''
# above most plausibly CLOSES a commented-out block opened before this
# excerpt (making the loop above inert). The quoted regions below are further
# commented-out debug code. Indentation reconstructed from the flattened
# source - confirm against the original file.
'''
for j in I:
    print date_strs[j]
'''
'''
print 'num_items: ' + str(I.size)
print 'start: ' + date_strs[I[0]]
print 'end: ' + date_strs[I[-1]]
'''
# Keep only series whose id has a known station location.
# NOTE(review): the loop variable `id` shadows the builtin id().
has_loc = array_functions.false(unique_series_ids.size)
for i, id in enumerate(unique_series_ids):
    has_loc[i] = id in station_names
times_series_vals = times_series_vals[:, has_loc]
unique_series_ids = unique_series_ids[has_loc]
date_idx = 0  # NOTE(review): assigned but never read in this fragment
for i in range(0,num_days, 5):
    x = station_locs
    # Every 28th time step within [i, 120); plotted per station.
    y = times_series_vals[i:120:28,:]
    array_functions.plot_heatmap(x, y.T, title=None, sizes=30)
data = (times_series_vals,unique_series_ids)
helper_functions.save_object('processed_data.pkl', data)
pass
def load_trip_data(file_names, y_names, time_name, loc_names, resolution=None, plot_data=True):
    '''Load trip records from CSV files and aggregate them into daily trip
    counts on a 2-D spatial grid.

    Args:
        file_names: iterable of CSV paths; all files must share one header.
        y_names: label column names, or None to use all-ones labels.
            NOTE(review): the column-derived y is overwritten by the weekly
            means below, so these labels are effectively unused - confirm.
        time_name: column holding the trip date string.
        loc_names: columns holding the (x, y) trip coordinates.
        resolution: grid resolution per axis; defaults to [20, 20].
        plot_data: when True, plot a heatmap of the aggregated values.

    Returns:
        (grid_cells, y, cell_labels): float array of (x, y) bin pairs, the
        (num_cells, 2) matrix of log weekly-mean trip counts, and one string
        label per grid cell.
    '''
    # Avoid a mutable (ndarray) default argument; None means 'use [20, 20]'.
    if resolution is None:
        resolution = np.asarray([20, 20])
    resolution = np.asarray(resolution)
    feat_names = None
    data = None
    for file_name in file_names:
        curr_feat_names, curr_data = load_csv(file_name, True, dtype='str', delim=',', num_rows=1000000000)
        if feat_names is None:
            feat_names = curr_feat_names
            data = curr_data
            continue
        # Every file must agree on the header before rows are stacked.
        assert (feat_names == curr_feat_names).all()
        data = np.vstack((data, curr_data))
    locs = data[:, array_functions.find_set(feat_names, loc_names)]
    if y_names is not None:
        y_inds = array_functions.find_set(feat_names, y_names).nonzero()[0]
        # np.float was removed in NumPy 1.24; the builtin float is equivalent.
        y = data[:, y_inds].astype(float)
    else:
        y = np.ones(data.shape[0])
    date_strs = data[:, find_first_element(feat_names, time_name)]
    date_ids = np.zeros(data.shape[0])
    for i, date_str in enumerate(date_strs):
        # Proleptic-Gregorian ordinal of each trip's date.
        date_ids[i] = to_date(date_str).toordinal()
    date_ids = date_ids.astype(int)
    min_date_id = date_ids.min()
    max_date_id = date_ids.max()
    num_days = max_date_id - min_date_id + 1
    dates_idx = date_ids - min_date_id  # 0-based day index per trip
    num_locations = np.prod(resolution)
    trip_counts = np.zeros((num_days, num_locations))
    locs = locs.astype(float)
    # Keep only trips inside the central (30th-70th percentile) region.
    p_min = .3
    p_max = .7
    is_in_range = array_functions.is_in_percentile(
        locs[:, 0], p_min, p_max) & array_functions.is_in_percentile(
        locs[:, 1], p_min, p_max)
    locs = locs[is_in_range, :]
    dates_idx = dates_idx[is_in_range]
    x_bins = quantize_loc(locs[:, 0], resolution[0])
    y_bins = quantize_loc(locs[:, 1], resolution[1])
    xy_bins = list(
        itertools.product(range(resolution[0]), range(resolution[1])))
    for x_idx, y_idx in xy_bins:
        is_in_cell = (x_bins == x_idx) & (y_bins == y_idx)
        trips_in_cell = dates_idx[is_in_cell]
        # Number of trips on each distinct day within this grid cell.
        trip_dates, trips_per_date = np.unique(trips_in_cell, return_counts=True)
        bin_idx = bin_to_idx([x_idx, y_idx], resolution)
        trip_counts[trip_dates, bin_idx] = trips_per_date
    # Weekly means over the first 30 days: every 7th day from day 3 vs day 4.
    # NOTE(review): another copy of this loader uses trip_counts[:30:7] for
    # y1 - confirm which start offset is intended.
    y1 = trip_counts[3:30:7, :].mean(0)
    y2 = trip_counts[4:30:7, :].mean(0)
    y = np.stack((y1, y2), 1)
    y = np.log(y)  # log-scale counts; cells with zero trips become -inf here
    if plot_data:
        array_functions.plot_heatmap(np.asarray(xy_bins), y, sizes=50)
    return np.asarray(xy_bins, dtype=float), y, np.asarray(
        [str(xy) for xy in xy_bins])
# NOTE(review): fragment; the statements before `elif` sit inside an `if`
# branch whose condition appears above this excerpt, and all_vals,
# is_all_finite, unique_locs, num_time_intervals, use_monthly,
# times_series_vals and y_to_plot are defined there too. Indentation is
# reconstructed from the flattened source - confirm against the original.
    vals_to_use = all_vals[:, is_all_finite]
    site_variance = vals_to_use.var(0)
    normalized_vals = vals_to_use.copy()
    # Center each row (time step) and scale it.
    # NOTE(review): v /= v.var() divides by the VARIANCE, not the std -
    # confirm this is intended rather than v.std().
    for i in range(normalized_vals.shape[0]):
        v = normalized_vals[i, :]
        v -= v.mean()
        v /= v.var()
        normalized_vals[i, :] = v
    normalized_site_variance = normalized_vals.var(0)
    # Clamp high-variance sites to 1 before plotting.
    normalized_site_variance[normalized_site_variance > .35] = 1
    #array_functions.plot_heatmap(unique_locs[is_all_finite], site_variance / site_variance.max(), alpha=1, title=None, sizes=None, share_axis=True)
    #normalized_site_variance += .5
    array_functions.plot_heatmap(unique_locs[is_all_finite], normalized_site_variance / normalized_site_variance.max(), alpha=1, title=None, sizes=None, share_axis=True)
    print ''
elif plot_2d:
    for i in range(num_time_intervals):
        if use_monthly:
            y_val = times_series_vals[[i, i + 1], :, y_to_plot]
        else:
            y_val = times_series_vals[[i, 60 + i], :, y_to_plot]
            # 30-day window means starting at day i vs. day i+120; this
            # overwrites the assignment just above.
            y_val1 = times_series_vals[range(i, i + 30), :, y_to_plot].mean(0)
            y_val2 = times_series_vals[range(i + 120, i + 150), :, y_to_plot].mean(0)
            y_val = np.stack((y_val1, y_val2), 1).T
# NOTE(review): fragment of a day/night visualization routine; use_log, viz,
# suffix, day_values, night_values, day_locs, night_locs, min_value,
# max_value, dot_size and just_center_data come from above this excerpt.
# Indentation reconstructed from the flattened source.
if use_log:
    day_values = np.log(day_values)
    night_values = np.log(night_values)
else:
    suffix += '-noLog'  # tag output name when values stay in linear scale
if viz:
    fig1 = pl.figure(3)
    # Keep finite day values inside [min_value, max_value].
    I = np.isfinite(day_values)
    I &= array_functions.in_range(day_values, min_value, max_value)
    if just_center_data:
        # NOTE(review): this REPLACES the finite/range mask with a pure
        # location mask instead of AND-ing - confirm intended.
        I = in_range(day_locs[:, 0], .2, .8) & in_range(
            day_locs[:, 1], .2, .8)
    array_functions.plot_heatmap(day_locs[I, :],
                                 day_values[I],
                                 sizes=dot_size,
                                 alpha=1,
                                 subtract_min=False,
                                 fig=fig1)
    pl.title('day values')
    fig2 = pl.figure(4)
    # Same masking for the night values.
    I = np.isfinite(night_values)
    I &= array_functions.in_range(night_values, min_value, max_value)
    if just_center_data:
        I = in_range(night_locs[:, 0], .2, .8) & in_range(
            night_locs[:, 1], .2, .8)
    # NOTE(review): the excerpt is cut off inside this call's argument list.
    array_functions.plot_heatmap(night_locs[I, :],
                                 night_values[I],
                                 sizes=dot_size,
                                 alpha=1,
                                 subtract_min=False,
# NOTE(review): fragment; zipcodes, zipcode_income, zipcode_locs,
# zipcode_housing and the pre-allocated arrays (zipcode_array, income_array,
# locs, households) are defined above this excerpt. Indentation reconstructed
# from the flattened source.
# Flatten the per-zipcode dicts into aligned arrays.
for i, key in enumerate(zipcodes):
    zipcode_array[i] = key
    income_array[i] = zipcode_income[key]
    locs[i] = zipcode_locs[key]
    households[i] = zipcode_housing[key]
income_array = np.log(income_array)  # log-scale income before normalizing
income_array = array_functions.normalize(income_array)
households = array_functions.normalize(households)
locs[:,0] = array_functions.normalize(locs[:,0])
locs[:,1] = array_functions.normalize(locs[:,1])
#array_functions.plot_heatmap(locs, 10*income_array, sizes=50)
#array_functions.plot_heatmap(locs, households, sizes=50)
y = np.stack((income_array, households), 1)
print 'Num Used: ' + str(y.shape[0])
array_functions.plot_heatmap(
    locs,
    y,
    sizes=100,
    share_axis=True
)
# Subsample 400 zipcodes without replacement before saving.
I = np.random.choice(y.shape[0], 400, replace=False)
data = (locs[I,:], y[I], zipcode_array[I])
helper_functions.save_object('processed_data.pkl', data)
pass
# NOTE(review): fragment; source_errors, target_errors, stacking_errors, s,
# viz and data are defined above this excerpt. Indentation reconstructed from
# the flattened source - confirm placement of the statements after `if not
# viz:` against the original file.
# Relative improvement of the target learner over source / stacking baselines.
target_relative = (source_errors.mean() - target_errors.mean())/target_errors.mean()
source_relative = (stacking_errors.mean() - target_errors.mean()) / stacking_errors.mean()
print 'Target ' + s + ': ' + str(target_relative)
print 'Stacking ' + s + ': ' + str(source_relative)
if source_relative > target_relative:
    # Stacking beat the target-only baseline; flag loudly and stop.
    print '!!!'
    exit()
if not viz:
    # Normalize the first two feature columns in place.
    for i in range(2):
        data.x[:,i] = array_functions.normalize(data.x[:,i])
print 'n: ' + str(data.n)
if viz:
    I1 = data.data_set_ids == 0
    I2 = data.data_set_ids == 1
    fig1 = pl.figure(1)
    array_functions.plot_heatmap(data.x[I1, :], data.y[I1], sizes=30, alpha=1, subtract_min=True, fig=fig1)
    pl.xlabel('Longitude')
    pl.ylabel('Latitude')
    pl.title('Taxi Pickups')
    #pl.xticks([], [])
    #pl.yticks([], [])
    fig2 = pl.figure(2)
    array_functions.plot_heatmap(data.x[I2, :], data.y[I2], sizes=30, alpha=1, subtract_min=True, fig=fig2)
    pl.xlabel('Longitude')
    pl.ylabel('Latitude')
    pl.title('Housing Prices')
    # Match figure 1's axis limits to figure 2's.
    pl.figure(1)
    pl.xlim(fig2.axes[0].get_xlim())
    pl.ylim(fig2.axes[0].get_ylim())
    #pl.xticks([], [])
    #pl.yticks([], [])