def create_pollution(labels_to_use=np.arange(2), series_to_use=0, num_instances=None, normalize_xy=True, save_data=True):
    file = "pollution/processed_data.pkl"
    y, ids = helper_functions.load_object(file)
    y_to_use = y[:, series_to_use, :]
    print str(series_to_use) + ": " + ids[series_to_use]
    data = data_class.TimeSeriesData(y_to_use, np.asarray([ids[series_to_use]]))
    data.is_regression = True
    data.keep_series(labels_to_use)
    data = data.get_min_range()
    data.smooth_missing()
    data.x = data.x.astype(np.float)
    if num_instances is not None:
        data = data.get_range([0, num_instances])
    if normalize_xy:
        data.reset_x()
        data.normalize_y()
    data = data.create_data_instance()
    # perc_used = data.get_perc_used()
    if num_instances is not None:
        s = "pollution-%d-%d" % (series_to_use, num_instances)
    else:
        s = "pollution-%d" % series_to_use
    if normalize_xy:
        s += "-norm"
    s += "/raw_data.pkl"
    # array_functions.plot_2d_sub_multiple_y(data.x, data.y, title=None, sizes=10)
    array_functions.plot_2d_sub(data.x, data.y, data_set_ids=data.data_set_ids, title=None, sizes=10)
    if save_data:
        helper_functions.save_object(s, data)
def create_synthetic_cross_transfer():
    slope = 5
    target_fun = lambda x: slope * x
    source_fun = lambda x: -slope * x + 5
    data = create_synthetic_regression_transfer(target_fun, source_fun)
    s = synthetic_cross_file
    helper_functions.save_object(s, data)
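# The helper create_synthetic_regression_transfer is used by several creators in
# this file but is not defined in this section. Below is a minimal sketch of its
# assumed behavior; the name suffix, signature, sample sizes, and noise level are
# assumptions, not the original implementation.
def create_synthetic_regression_transfer_sketch(target_funs, source_fun, n_per_task=100, sigma=0.5):
    # Accept one target function or a list of them (see the multitask creator below).
    if not isinstance(target_funs, list):
        target_funs = [target_funs]
    funcs = target_funs + [source_fun]
    xs, ys, ids = [], [], []
    for task_id, f in enumerate(funcs):
        # Target tasks get the low data_set_ids; the source task comes last.
        x = np.random.uniform(0, 1, n_per_task)
        xs.append(x)
        ys.append(f(x) + np.random.normal(0, sigma, n_per_task))
        ids.append(task_id * np.ones(n_per_task))
    data = data_class.Data()
    data.x = np.concatenate(xs).reshape(-1, 1)
    data.y = np.concatenate(ys)
    data.data_set_ids = np.concatenate(ids)
    data.set_defaults()
    data.is_regression = True
    return data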
def create_synthetic_flip_transfer(file_dir="", dim=1):
    n_target = 100
    n_source = 100
    n = n_target + n_source
    sigma = 0.2
    data = data_class.Data()
    data.x = np.random.uniform(0, 1, (n, dim))
    data.data_set_ids = np.zeros(n)
    data.data_set_ids[n_target:] = 1
    data.y = np.zeros(n)
    data.y[(data.data_set_ids == 0) & (data.x[:, 0] >= 0.5)] = 2
    data.y[(data.data_set_ids == 1) & (data.x[:, 0] >= 0.5)] = 1
    data.y[(data.data_set_ids == 0) & (data.x[:, 0] <= 0.5)] = 1
    data.y[(data.data_set_ids == 1) & (data.x[:, 0] <= 0.5)] = 2
    data.y += np.random.normal(0, sigma, n)
    data.set_train()
    data.set_true_y()
    data.is_regression = True
    if dim == 1:
        array_functions.plot_2d(data.x, data.y, data.data_set_ids)
    s = synthetic_flip_file
    if dim > 1:
        s = synthetic_step_kd_transfer_file % dim
    if file_dir != "":
        s = file_dir + "/" + s
    helper_functions.save_object(s, data)
def create_synthetic_hypothesis_transfer(n=500, p=50, kt=1, ks=1, sigma=1.0, sigma_s=0.3):
    wt = np.random.normal(0, sigma, p)
    all_data, w_eff = create_synthetic_linear_classification(n=n, p=p, sigma=sigma, w=wt)
    x = all_data.x
    all_data.data_set_ids = np.zeros(n)
    wt = w_eff
    data_set_counter = 1
    diffs = []
    is_target = array_functions.false(kt + ks)
    is_target[:kt] = True
    all_data.true_w = np.zeros((ks + kt + 1, p))
    all_data.true_w[0, :] = wt
    for i, val in enumerate(is_target):
        data_set_id = data_set_counter
        data_set_counter += 1
        if val:
            ws = wt + np.random.normal(0, sigma_s, p)
            ws = wt  # note: this overwrites the perturbed vector, so target-like tasks reuse wt exactly
        else:
            ws = np.random.normal(0, sigma, p)
        source_data, ws = create_synthetic_linear_classification(w=ws, x=x)
        source_data.data_set_ids = data_set_id * np.ones(n)
        # source_data.true_y *= (i+2)
        source_data.y = source_data.true_y
        all_data.combine(source_data)
        diff = norm(wt / norm(wt) - ws / norm(ws))
        diffs.append(diff)
        all_data.true_w[data_set_id, :] = ws
    all_data.true_w = all_data.true_w.T
    all_data.metadata = dict()
    all_data.metadata["true_w"] = all_data.true_w
    s = synthetic_hypothesis_transfer_class_file % (
        str(n) + "-" + str(p) + "-" + str(sigma) + "-" + str(sigma_s) + "-" + str(kt) + "-" + str(ks))
    helper_functions.save_object(s, all_data)
def _run_experiment_args(self, results_file, data_and_splits, method_results, i_labels, split):
    num_labels = self.configs.num_labels[i_labels]
    s = str(num_labels) + '-' + str(split)
    curr_results = _load_temp_split_file(results_file, num_labels, split)
    if curr_results:
        return curr_results
    # print 'num_labels-split: ' + s
    temp_file_name = _temp_split_file_name(results_file, num_labels, split)
    temp_dir_root = helper_functions.remove_suffix(temp_file_name, '.pkl')
    temp_dir = temp_dir_root + '/CV-temp/'
    curr_data = data_and_splits.get_split(split, num_labels)
    learner = self.configs.learner
    curr_learner = copy.deepcopy(learner)
    curr_learner.split_idx_str = s
    curr_learner.temp_dir = temp_dir
    curr_results = curr_learner.train_and_test(curr_data)
    if mpi_utility.is_group_master():
        helper_functions.save_object(_temp_split_file_name(results_file, num_labels, split), curr_results)
        helper_functions.delete_dir_if_exists(temp_dir_root)
    if mpi_utility.is_group_master():
        if hasattr(curr_learner, 'best_params'):
            print s + '-' + str(curr_learner.best_params) + ' Error: ' + str(curr_results.compute_error(self.configs.loss_function))
        else:
            print s + ' Done'
    return curr_results
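# Sketches of the split-file checkpoint helpers referenced above. The cache
# layout is an assumption; only helper_functions.remove_suffix and the
# load/save helpers are taken from this codebase.
import os

def _temp_split_file_name_sketch(results_file, num_labels, split):
    # e.g. 'results/foo.pkl' -> 'results/foo/temp/split_data_50-3.pkl' (layout assumed)
    prefix = helper_functions.remove_suffix(results_file, '.pkl')
    return prefix + '/temp/split_data_' + str(num_labels) + '-' + str(split) + '.pkl'

def _load_temp_split_file_sketch(results_file, num_labels, split):
    # Returns None when no checkpoint exists, so the caller recomputes the split.
    f = _temp_split_file_name_sketch(results_file, num_labels, split)
    if not os.path.isfile(f):
        return None
    return helper_functions.load_object(f)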
def split_data(file, configs):
    data = helper_functions.load_object(file)
    splitter = DataSplitter()
    splitData = data_lib.SplitData()
    splitData.data = data
    num_splits = 30
    perc_train = .8
    keep_for_splitting = None
    if configs.split_data_set_ids is not None:
        keep_for_splitting = array_functions.false(data.n)
        keep_for_splitting[data.data_set_ids == 0] = True
    # Pretend data_set_ids is a label vector to ensure each data set is split equally
    if data.is_regression and data.data_set_ids is not None:
        assert len(data.data_set_ids) == data.n
        is_regression = False
        splitData.splits = splitter.generate_splits(data.data_set_ids, num_splits, perc_train, is_regression, keep_for_splitting)
    else:
        splitData.splits = splitter.generate_splits(data.y, num_splits, perc_train, data.is_regression, keep_for_splitting)
    splitData.data_set_ids_to_keep = configs.data_set_ids_to_keep
    split_dir = os.path.dirname(file)
    save_file = split_dir + '/split_data.pkl'
    helper_functions.save_object(save_file, splitData)
    return splitData
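# A rough, self-contained sketch of what DataSplitter.generate_splits is assumed
# to do for the call above: build num_splits boolean train masks, stratifying by
# label for classification. The signature and return type are assumptions based
# on how the result is consumed here.
def generate_splits_sketch(y, num_splits=30, perc_train=.8, is_regression=False, keep_for_splitting=None):
    n = len(y)
    splits = []
    for _ in range(num_splits):
        is_train = np.zeros(n, dtype=bool)
        if is_regression:
            perm = np.random.permutation(n)
            is_train[perm[:int(perc_train * n)]] = True
        else:
            # Sample perc_train of each class so every split keeps the class balance.
            for label in np.unique(y):
                idx = np.random.permutation((y == label).nonzero()[0])
                is_train[idx[:int(perc_train * idx.size)]] = True
        if keep_for_splitting is not None:
            # Instances excluded from splitting are assumed to stay in the train pool.
            is_train[~keep_for_splitting] = True
        splits.append(is_train)
    return splits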
def create_kc_housing():
    file = "kc_housing/processed_data.pkl"
    x, y = helper_functions.load_object(file)
    data = data_class.Data(x, y)
    data.is_regression = True
    s = kc_housing_file
    helper_functions.save_object(s, data)
def create_synthetic_delta_linear_transfer():
    slope = 5
    target_fun = lambda x: slope * x
    source_fun = lambda x: slope * x + 4
    data = create_synthetic_regression_transfer(target_fun, source_fun)
    array_functions.plot_2d(data.x, data.y, data.data_set_ids, title="Linear Delta Data Set")
    s = synthetic_delta_linear_file
    helper_functions.save_object(s, data)
def create_covtype():
    covtype_data = datasets.fetch_covtype()
    print covtype_data.__dict__
    data = data_class.Data()
    data.x = covtype_data.data
    data.y = covtype_data.target
    # The original call was missing the object to save.
    helper_functions.save_object("data_sets/covtype/raw_data.pkl", data)
def create_wine(data_to_create=WINE_RED):
    red_file = "wine/winequality-red.csv"
    white_file = "wine/winequality-white.csv"
    field_names, red_data = load_csv(red_file, delim=";")
    white_data = load_csv(white_file, delim=";")[1]
    if data_to_create == WINE_TRANSFER:
        red_ids = np.zeros((red_data.shape[0], 1))
        white_ids = np.ones((white_data.shape[0], 1))
        red_data = np.hstack((red_data, red_ids))
        white_data = np.hstack((white_data, white_ids))
        wine_data = np.vstack((red_data, white_data))
        ids = wine_data[:, -1]
        x = wine_data[:, :-2]
        y = wine_data[:, -2]
        used_field_names = field_names[:-1]
        viz = True
        if viz:
            learner = make_learner()
            # learner = None
            viz_features(x, y, ids, used_field_names, alpha=0.01, learner=learner)
        suffix = "transfer"
    else:
        if data_to_create == WINE_RED:
            wine_data = red_data
            suffix = "red"
        elif data_to_create == WINE_WHITE:
            wine_data = white_data
            suffix = "white"
        else:
            assert False
        ids = None
        x = wine_data[:, :-1]
        y = wine_data[:, -1]
        used_field_names = field_names[:-1]
    data = data_class.Data()
    data.x = array_functions.standardize(x)
    if data_to_create == WINE_TRANSFER:
        pass
        # feat_idx = 1
        # data.x = array_functions.vec_to_2d(x[:,feat_idx])
    data.y = y
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.data_set_ids = ids
    data.is_regression = True
    """
    data = data.rand_sample(.25, data.data_set_ids == 0)
    data = data.rand_sample(.1, data.data_set_ids == 1)
    s = wine_file % ('-small-' + str(data.p))
    """
    s = wine_file % ("-" + suffix)
    helper_functions.save_object(s, data)
def create_and_save_data(x, y, domain_ids, file):
    data = data_class.Data()
    data.x = array_functions.vec_to_2d(x)
    data.y = y
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.is_regression = True
    data.data_set_ids = domain_ids
    helper_functions.save_object(file, data)
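# Example usage of create_and_save_data with synthetic arrays. The output path
# is hypothetical (a sketch, not a path used elsewhere in this codebase), and
# its directory is assumed to exist or to be created by save_object.
def _example_create_and_save_data():
    x = np.random.uniform(0, 1, 200)
    domain_ids = (np.arange(200) >= 100).astype(float)  # first half target (0), rest source (1)
    y = 3 * x + domain_ids + np.random.normal(0, .1, 200)
    create_and_save_data(x, y, domain_ids, 'synthetic_example/raw_data.pkl')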
def create_time_series(label_to_use=0, series_to_use=0, num_instances=None, normalize_x=False, save_data=True, name='CO2_emissions'):
    # series_to_use is expected to be an iterable of series indices.
    file = name + '/processed_data.pkl'
    all_data = []
    for i in series_to_use:
        y, ids = helper_functions.load_object(file)
        y_to_use = y[:, i, :]
        print str(i) + ': ' + ids[i]
        data = data_class.TimeSeriesData(y_to_use, np.asarray([ids[i]]))
        data.is_regression = True
        data.keep_series(label_to_use)
        data = data.get_min_range()
        data.smooth_missing()
        data = data.get_nth(7)
        data.reset_x()
        data.x = data.x.astype(np.float)
        if num_instances is not None:
            data = data.get_range([0, num_instances])
        data = data.get_range([1000, 1500])  # fixed window applied regardless of num_instances
        if normalize_x:
            data.x -= data.x.min()
            data.x /= data.x.max()
        data = data.create_data_instance()
        try:
            if len(series_to_use) > 1:
                data.data_set_ids[:] = i
        except:
            pass
        all_data.append(data)
    # perc_used = data.get_perc_used()
    data = all_data[0]
    del all_data[0]
    for di in all_data:
        data.combine(di)
    if num_instances is not None:
        s = name + '-%s-%d' % (str(series_to_use), num_instances)
    else:
        s = name + '-%s' % str(series_to_use)
    if normalize_x:
        s += '-norm'
    s += '/raw_data.pkl'
    # array_functions.plot_2d_sub_multiple_y(data.x, data.y, title=None, sizes=10)
    array_functions.plot_2d_sub(data.x, data.y, data_set_ids=data.data_set_ids, title=None, sizes=10)
    if save_data:
        helper_functions.save_object(s, data)
def create_spatial_data(dir='climate-month'):
    file = dir + '/processed_data.pkl'
    locs, y, ids = helper_functions.load_object(file)
    y = y.T
    is_missing_loc = (~np.isfinite(locs)).any(1)
    locs = locs[~is_missing_loc, :]
    y = y[~is_missing_loc, :]
    ids = ids[~is_missing_loc]
    data = data_class.Data(locs, y)
    data.multilabel_to_multisource()
    s = dir + '/raw_data.pkl'
    helper_functions.save_object(s, data)
def run_experiments(self):
    data_file = self.configs.data_file
    data_and_splits = helper_functions.load_object(data_file)
    data_and_splits.data.repair_data()
    assert self.configs.num_splits <= len(data_and_splits.splits)
    data_and_splits.labels_to_keep = self.configs.labels_to_keep
    data_and_splits.labels_to_not_sample = self.configs.labels_to_not_sample
    data_and_splits.target_labels = self.configs.target_labels
    data_and_splits.data.repair_data()
    results_file = self.configs.results_file
    comm = mpi_utility.get_comm()
    if os.path.isfile(results_file):
        if mpi_utility.is_group_master():
            print results_file + ' already exists - skipping'
        return
    if mpi_utility.is_group_master():
        hostname = helper_functions.get_hostname()
        print '(' + hostname + ') Running experiments: ' + results_file
    learner = self.configs.learner
    learner.run_pre_experiment_setup(data_and_splits)
    num_labels = len(self.configs.num_labels)
    num_splits = self.configs.num_splits
    # method_results = results.MethodResults(n_exp=num_labels, n_splits=num_splits)
    method_results = self.configs.method_results_class(n_exp=num_labels, n_splits=num_splits)
    for i, nl in enumerate(self.configs.num_labels):
        method_results.results_list[i].num_labels = nl
    split_idx = self.configs.split_idx
    if split_idx is not None:
        num_labels_list = list(itertools.product(range(num_labels), [split_idx]))
    else:
        num_labels_list = list(itertools.product(range(num_labels), range(num_splits)))
    shared_args = (self, results_file, data_and_splits, method_results)
    args = [shared_args + (i_labels, split) for i_labels, split in num_labels_list]
    if self.configs.use_pool:
        pool = multiprocessing_utility.LoggingPool(processes=self.configs.pool_size)
        all_results = pool.map(_run_experiment, args)
    else:
        all_results = [_run_experiment(a) for a in args]
    for curr_results, s in zip(all_results, num_labels_list):
        if curr_results is None:
            continue
        i_labels, split = s
        method_results.set(curr_results, i_labels, split)
    method_results.configs = self.configs
    if self.configs.should_load_temp_data:
        helper_functions.save_object(results_file, method_results)
        for i_labels, split in num_labels_list:
            num_labels = self.configs.num_labels[i_labels]
            _delete_temp_split_files(results_file, num_labels, split)
        _delete_temp_folder(results_file)
def create_spatial_data(dir="climate-month"): file = dir + "/processed_data.pkl" locs, y, ids = helper_functions.load_object(file) # y = y.T is_missing_loc = (~np.isfinite(locs)).any(1) locs = locs[~is_missing_loc, :] y = y[~is_missing_loc, :] ids = ids[~is_missing_loc] data = data_class.Data(locs, y) data.multilabel_to_multisource() s = dir + "/raw_data.pkl" helper_functions.save_object(s, data)
def subset_1_per_instance_id():
    data = helper_functions.load_object('data_sets/' + create_data_set.adience_aligned_cnn_file)
    to_keep = array_functions.false(data.n)
    all_ids = np.unique(data.instance_ids)
    for id in all_ids:
        # Keep only the first labeled example for each instance id.
        has_id = (data.instance_ids == id).nonzero()[0]
        to_keep[has_id[0]] = True
    to_keep = to_keep & data.is_labeled
    data = data.get_subset(to_keep)
    helper_functions.save_object('data_sets/' + create_data_set.adience_aligned_cnn_1_per_instance_id_file, data)
def create_concrete(transfer=False):
    file = 'concrete/Concrete_Data.csv'
    used_field_names, concrete_data = load_csv(file)
    data = data_class.Data()
    t = ''
    if transfer:
        feat_ind = 0
        domain_ind = (used_field_names == 'age').nonzero()[0][0]
        ages = concrete_data[:, domain_ind]
        domain_ids = np.zeros(ages.shape)
        domain_ids[ages < 10] = 1
        domain_ids[(ages >= 10) & (ages <= 28)] = 2
        domain_ids[ages > 75] = 3
        data.x = concrete_data[:, 0:(concrete_data.shape[1] - 2)]
        # 0,3,5
        # data.x = preprocessing.scale(data.x)
        if concrete_num_feats == 1:
            data.x = array_functions.vec_to_2d(data.x[:, feat_ind])
            t = '-feat=' + str(feat_ind)
        elif concrete_num_feats >= data.x.shape[1]:
            t = '-' + str(min(data.x.shape[1], concrete_num_feats))
        else:
            assert False
        data.data_set_ids = domain_ids
    else:
        data.x = concrete_data[:, 0:-1]
    data.y = concrete_data[:, -1]
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.is_regression = True
    viz = False
    if viz:
        to_use = domain_ids > 0
        domain_ids = domain_ids[to_use]
        concrete_data = concrete_data[to_use, :]
        # np.delete returns a copy, so the result must be assigned (the original discarded it).
        concrete_data = np.delete(concrete_data, domain_ind, 1)
        viz_features(concrete_data, concrete_data[:, -1], domain_ids, used_field_names)
        return
    data.x = array_functions.standardize(data.x)
    # viz_features(data.x, data.y, data.data_set_ids)
    s = concrete_file % t
    helper_functions.save_object(s, data)
def create_synthetic_multitask_transfer():
    slope_source = 8
    target_slope1 = 4
    target_slope2 = 4.5
    source_func = lambda x: slope_source * x
    target_funcs = [
        lambda x: target_slope1 * x + 3,
        lambda x: target_slope2 * x + 8
    ]
    data = create_synthetic_regression_transfer(target_funcs, source_func)
    array_functions.plot_2d(data.x, data.y, data.data_set_ids, title='Multitask Slant')
    s = synthetic_slant_multitask
    helper_functions.save_object(s, data)
def create_drosophila():
    data = helper_functions.load_object('drosophilia/processed_data.pkl')
    x, y = data
    y = np.reshape(y, y.shape[0])
    I = np.random.choice(x.shape[0], size=500, replace=False)
    x = x[I, :]
    y = y[I]
    data = data_class.Data()
    data.x = x
    data.y = y
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.is_regression = True
    helper_functions.save_object(drosophila_file, data)
def create_synthetic_classification(file_dir='', local=True):
    dim = 1
    n_target = 200
    n_source = 200
    n = n_target + n_source
    data = data_class.Data()
    data.x = np.random.uniform(0, 1, (n, dim))
    data.data_set_ids = np.zeros(n)
    data.data_set_ids[n_target:] = 1
    data.y = np.zeros(n)
    x, ids = data.x, data.data_set_ids
    I = array_functions.in_range(x, 0, .25)
    I2 = array_functions.in_range(x, .25, .5)
    I3 = array_functions.in_range(x, .5, .75)
    I4 = array_functions.in_range(x, .75, 1)
    id0 = ids == 0
    id1 = ids == 1
    data.y[I & id0] = 1
    data.y[I2 & id0] = 2
    data.y[I3 & id0] = 1
    data.y[I4 & id0] = 2
    data.y[I & id1] = 3
    data.y[I2 & id1] = 4
    data.y[I3 & id1] = 3
    data.y[I4 & id1] = 4
    if local:
        data.y[I3 & id1] = 4
        data.y[I4 & id1] = 3
    data.set_true_y()
    data.set_train()
    data.is_regression = False
    noise_rate = 0
    # data.add_noise(noise_rate)
    data.add_noise(noise_rate, id0, np.asarray([1, 2]))
    data.add_noise(noise_rate, id1, np.asarray([3, 4]))
    s = synthetic_classification_file
    if local:
        s = synthetic_classification_local_file
    i = id1
    array_functions.plot_2d(data.x[i, :], data.y[i])
    if file_dir != '':
        s = file_dir + '/' + s
    helper_functions.save_object(s, data)
def create_synthetic_classification(file_dir="", local=True): dim = 1 n_target = 200 n_source = 200 n = n_target + n_source data = data_class.Data() data.x = np.random.uniform(0, 1, (n, dim)) data.data_set_ids = np.zeros(n) data.data_set_ids[n_target:] = 1 data.y = np.zeros(n) x, ids = data.x, data.data_set_ids I = array_functions.in_range(x, 0, 0.25) I2 = array_functions.in_range(x, 0.25, 0.5) I3 = array_functions.in_range(x, 0.5, 0.75) I4 = array_functions.in_range(x, 0.75, 1) id0 = ids == 0 id1 = ids == 1 data.y[I & id0] = 1 data.y[I2 & id0] = 2 data.y[I3 & id0] = 1 data.y[I4 & id0] = 2 data.y[I & id1] = 3 data.y[I2 & id1] = 4 data.y[I3 & id1] = 3 data.y[I4 & id1] = 4 if local: data.y[I3 & id1] = 4 data.y[I4 & id1] = 3 data.set_true_y() data.set_train() data.is_regression = False noise_rate = 0 # data.add_noise(noise_rate) data.add_noise(noise_rate, id0, np.asarray([1, 2])) data.add_noise(noise_rate, id1, np.asarray([3, 4])) s = synthetic_classification_file if local: s = synthetic_classification_local_file i = id1 array_functions.plot_2d(data.x[i, :], data.y[i]) if file_dir != "": s = file_dir + "/" + s helper_functions.save_object(s, data)
def create_synthetic_linear_regression(n=500, p=50, sigma=1, num_non_zero=None):
    data = data_class.Data()
    data.x = np.random.uniform(0, sigma, (n, p))
    w = np.random.normal(0, 1, p)
    # w = np.ones(p)
    if num_non_zero is not None:
        w[num_non_zero:] = 0
    data.y = data.x.dot(w)
    data.y += np.random.normal(0, sigma, n)
    data.is_regression = True
    data.set_true_y()
    data.set_train()
    suffix = str(n) + "-" + str(p) + "-" + str(sigma)
    data.metadata = dict()
    data.metadata["true_w"] = w.T
    if num_non_zero is not None:
        suffix += "-nnz=" + str(num_non_zero)
    s = synthetic_linear_reg_file % suffix
    helper_functions.save_object(s, data)
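# Pure-numpy sanity check (not part of the original file): with n well above p,
# least squares on data generated the same way should approximately recover w.
def _check_linear_regression_recovery(n=500, p=50, sigma=1.0):
    x = np.random.uniform(0, sigma, (n, p))
    w = np.random.normal(0, 1, p)
    y = x.dot(w) + np.random.normal(0, sigma, n)
    # Solve the normal equations; relative error should be well below 1 in this regime.
    w_hat = np.linalg.solve(x.T.dot(x), x.T.dot(y))
    return np.linalg.norm(w - w_hat) / np.linalg.norm(w)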
def create_synthetic_step_linear_transfer(file_dir=""): n_target = 100 n_source = 100 n = n_target + n_source sigma = 0.5 data = data_class.Data() data.x = np.random.uniform(0, 1, (n, 1)) data.data_set_ids = np.zeros(n) data.data_set_ids[n_target:] = 1 data.y = np.reshape(data.x * 5, data.x.shape[0]) data.y[(data.data_set_ids == 1) & (data.x[:, 0] >= 0.5)] += 4 data.y += np.random.normal(0, sigma, n) data.set_defaults() data.is_regression = True array_functions.plot_2d(data.x, data.y, data.data_set_ids, title="Linear Step Data Set") s = synthetic_step_linear_transfer_file if file_dir != "": s = file_dir + "/" + s helper_functions.save_object(s, data)
def create_drought(label_to_use=0, series_to_use=0, num_instances=None, normalize_x=False, save_data=True):
    file = 'drought/processed_data.pkl'
    y, ids = helper_functions.load_object(file)
    y_to_use = y[:, series_to_use, :]
    print str(series_to_use) + ': ' + ids[series_to_use]
    data = data_class.TimeSeriesData(y_to_use, np.asarray([ids[series_to_use]]))
    data.is_regression = True
    data.keep_series(label_to_use)
    data = data.get_min_range()
    data.smooth_missing()
    data.x = data.x.astype(np.float)
    if num_instances is not None:
        data = data.get_range([0, num_instances])
    if normalize_x:
        data.x -= data.x.min()
        data.x /= data.x.max()
    data = data.create_data_instance()
    # perc_used = data.get_perc_used()
    if num_instances is not None:
        s = 'drought-%d-%d' % (series_to_use, num_instances)
    else:
        s = 'drought-%d' % series_to_use
    if normalize_x:
        s += '-norm'
    s += '/raw_data.pkl'
    # array_functions.plot_2d_sub_multiple_y(data.x, data.y, title=None, sizes=10)
    array_functions.plot_2d_sub(data.x, data.y, data_set_ids=data.data_set_ids, title=None, sizes=10)
    if save_data:
        helper_functions.save_object(s, data)
def create_synthetic_step_transfer(file_dir='', dim=1):
    n_target = 100
    n_source = 100
    n = n_target + n_source
    sigma = .5
    data = data_class.Data()
    data.x = np.random.uniform(0, 1, (n, dim))
    data.data_set_ids = np.zeros(n)
    data.data_set_ids[n_target:] = 1
    data.y = np.zeros(n)
    data.y[(data.data_set_ids == 0) & (data.x[:, 0] >= .5)] = 2
    data.y += np.random.normal(0, sigma, n)
    data.set_defaults()
    data.is_regression = True
    if dim == 1:
        array_functions.plot_2d(data.x, data.y, data.data_set_ids)
    s = synthetic_step_transfer_file
    if dim > 1:
        s = synthetic_step_kd_transfer_file % dim
    if file_dir != '':
        s = file_dir + '/' + s
    helper_functions.save_object(s, data)
def create_boston_housing(file_dir=""): boston_data = datasets.load_boston() data = data_class.Data() data.x = boston_data.data data.y = boston_data.target data.feature_names = list(boston_data.feature_names) data.set_train() data.set_target() data.set_true_y() data.is_regression = True s = boston_housing_raw_data_file x = data.x y = data.y if create_transfer_data: x_ind = 5 domain_ind = 12 domain_ids = np.ones(x.shape[0]) domain_ids = array_functions.bin_data(x[:, domain_ind], num_bins=4) x = np.delete(x, domain_ind, 1) # viz_features(x,y,domain_ids,boston_data.feature_names) data.data_set_ids = domain_ids if boston_num_feats == 1: data.x = data.x[:, x_ind] data.x = array_functions.vec_to_2d(data.x) s = s % "" elif boston_num_feats >= data.x.shape[1]: data.x = array_functions.standardize(data.x) p = min(boston_num_feats, data.x.shape[1]) s = s % ("-" + str(p)) else: assert False else: s %= "" if file_dir != "": s = file_dir + "/" + s helper_functions.save_object(s, data)
def create_bike_sharing():
    file = 'bike_sharing/day.csv'
    columns = [0] + range(2, 16)
    all_field_names = pd.read_csv(file, nrows=1, dtype='string')
    all_field_names = np.asarray(all_field_names.keys())
    used_field_names = all_field_names[columns]
    bike_data = np.loadtxt(file, skiprows=1, delimiter=',', usecols=columns)
    domain_ind = used_field_names == 'yr'
    domain_ids = np.squeeze(bike_data[:, domain_ind])
    # inds_to_keep = (used_field_names == 'temp') | (used_field_names == 'atemp')
    # bike_data = bike_data[:,inds_to_keep]
    # used_field_names = used_field_names[inds_to_keep]
    viz = True
    to_use = np.asarray([8, 9, 10, 11])
    x = bike_data[:, to_use]
    used_field_names = used_field_names[to_use]
    y = bike_data[:, -1]
    if viz:
        # learner = make_learner()
        learner = None
        viz_features(x, y, domain_ids, used_field_names, learner=learner)
    field_to_use = 1
    x = x[:, field_to_use]
    data = data_class.Data()
    data.is_regression = True
    data.x = array_functions.vec_to_2d(x)
    data.x = array_functions.standardize(data.x)
    data.y = y
    data.y = array_functions.normalize(data.y)
    data.set_defaults()
    data.data_set_ids = domain_ids
    s = bike_file % ('-feat=' + str(field_to_use))
    helper_functions.save_object(s, data)
    pl.title('Values 2')
    array_functions.move_fig(fig1, 500, 500, 2000, 100)
    array_functions.move_fig(fig2, 500, 500, 2600, 100)
    pl.show(block=True)
    data = (x, y)
    x = np.vstack((x[I1, :], x[I2, :]))
    data_set_ids = np.hstack((np.zeros(I1.sum()), np.ones(I2.sum())))
    y = np.hstack((y[I1], y[I2]))
    data = data_lib.Data(x, y)
    data.x[:, 0] = array_functions.normalize(data.x[:, 0])
    data.x[:, 1] = array_functions.normalize(data.x[:, 1])
    data.data_set_ids = data_set_ids
    print 'n-all: ' + str(data.y.size)
    if save_data:
        s = '../kc-housing-spatial'
        if suffix != '':
            s += '-' + suffix
        helper_functions.save_object(s + '/raw_data.pkl', data)
else:
    feats_to_clear = ['id', 'date', 'yr_renovated', 'zipcode', 'lat', 'long']
    clear_idx = array_functions.find_set(feat_names, feats_to_clear + [y_name])
    x = data[:, ~clear_idx]
    x = array_functions.remove_quotes(x)
    x = x.astype(np.float)
    data = (x, y)
    helper_functions.save_object('processed_data.pkl', data)
x = np.vstack((day_locs, night_locs))
'''
for i in range(x.shape[1]):
    x[:,i] = x[:,i] / x[:,i].max()
'''
data_set_ids = np.hstack((np.zeros(day_values.size), np.ones(day_values.size)))
y = np.hstack((day_values, night_values))
'''
if use_alternate:
    I = np.isfinite(y) & (y > 0)
else:
    I = np.isfinite(y) & (y > 0) & (y > np.log(5))
'''
# I[~np.isfinite(y)] = 0
I = np.isfinite(y)
I &= array_functions.in_range(y, min_value, max_value)
if just_center_data:
    I = I & in_range(x[:, 0], .2, .8) & in_range(x[:, 1], .2, .8)
data = data_lib.Data(x[I, :], y[I])
data.data_set_ids = data_set_ids[I]
print 'n: ' + str(data.n)
print 'n0: ' + str((data.data_set_ids == 0).sum())
print 'n1: ' + str((data.data_set_ids == 1).sum())
if save_data:
    file_path = '../taxi%s/raw_data.pkl' % suffix
    helper_functions.save_object(file_path, data)
print ''
def load_taxi_data(num_files_to_load=np.inf, num_bins=50, use_alternate=True, return_coords=False):
    all_files = [f for f in os.listdir(data_dir) if path.isfile(path.join(data_dir, f))]
    x = []
    y = []
    time = []
    has_passenger = []
    # combined_data_file = 'combined_data.pkl'
    combined_data_file = 'C:/PythonFramework/data_sets/taxi/combined_data.pkl'
    if path.exists(combined_data_file):
        print 'loading combined data...'
        all_data = helper_functions.load_object(combined_data_file)
        print 'done loading data'
    else:
        for i, file in enumerate(all_files):
            if i == num_files_to_load:
                break
            if i >= 535:
                break
            file_data = load_csv(path.join(data_dir, file), has_field_names=False, delim=str(' '))[1]
            y.append(file_data[:, 0])
            x.append(file_data[:, 1])
            has_passenger.append(file_data[:, 2])
            time.append(file_data[:, 3])
            print i
        all_data = {
            'x': x,
            'y': y,
            'has_passenger': has_passenger,
            'time': time
        }
        print 'saving combined data...'
        helper_functions.save_object(combined_data_file, all_data)
    x = all_data['x']
    y = all_data['y']
    has_passenger = all_data['has_passenger']
    time = all_data['time']
    x_all = np.concatenate(x)
    y_all = np.concatenate(y)
    time_all = np.concatenate(time)
    has_passenger_all = np.concatenate(has_passenger)
    pickup_inds = get_pickup_inds(x_all, y_all, time_all, has_passenger_all)
    if just_pickup:
        x_all = x_all[pickup_inds]
        y_all = y_all[pickup_inds]
        has_passenger_all = has_passenger_all[pickup_inds]
        time_all = time_all[pickup_inds]
    # x_bounds = [-122.45677419354838, -122.38322580645161]
    # y_bounds = [37.738054968287521, 37.816543340380548]
    x_bounds = [-122.48, -122.35]
    y_bounds = [37.7, 37.84]
    # x_bounds = [-np.inf, np.inf]
    # y_bounds = x_bounds
    is_in_range = in_range(x_all, *x_bounds) & in_range(y_all, *y_bounds)
    x_all = x_all[is_in_range]
    y_all = y_all[is_in_range]
    x_all = quantize_loc(x_all, num_bins)
    y_all = quantize_loc(y_all, num_bins)
    time_all = time_all[is_in_range]
    hours = 9 * np.ones(time_all.shape)
    get_hour_vec = np.vectorize(get_hour)
    hours = get_hour_vec(time_all)
    '''
    get_day_vec = np.vectorize(get_day)
    days = get_day_vec(time_all)
    '''
    has_passenger_all = has_passenger_all[is_in_range]
    suffix = '3'
    is_morning = (hours == 9)
    is_night = (hours == 18)
    # is_morning = (hours == 6) & (days == 21)
    # is_night = (hours == 18) & (days == 21)
    # is_morning = (days == 21)
    # is_night = (days == 24)
    if use_alternate:
        is_morning = (hours >= 5) & (hours <= 12)
        is_night = (hours >= 17)
        # is_morning = days == 21
        # is_night = days == 24
        # is_morning = (has_passenger_all == 1) & (hours == 6)
        # is_night = (has_passenger_all == 1) & (hours == 18)
        suffix = '2'
    suffix += '-' + str(num_bins)
    # print np.unique(days)
    # is_morning = days == 4
    # is_night = days == 8
    day_locs, day_values = count_cars(x_all[is_morning], y_all[is_morning], num_bins)
    night_locs, night_values = count_cars(x_all[is_night], y_all[is_night], num_bins)
    if return_coords:
        day_locs = bin_to_coordinates(day_locs, x_bounds, y_bounds, num_bins)
        night_locs = bin_to_coordinates(night_locs, x_bounds, y_bounds, num_bins)
    '''
    if use_alternate:
        I = (day_values > 0) | (night_values > 0)
        I = I & (day_values > 0) & (night_values > 0)
    else:
        I = (day_values > 5) | (night_values > 5)
        I = I & (day_values > 0) & (night_values > 0)
    relative_diff = np.max(day_values[I] - night_values[I]) / day_values[I]
    '''
    # array_functions.plot_heatmap(day_locs[I], relative_diff, sizes=10, alpha=1, subtract_min=False)
    return day_locs, day_values, night_locs, night_values, suffix
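# Minimal sketches of the gridding helpers called above (quantize_loc and
# count_cars). The names match the calls, but the bodies are assumptions about
# their behavior, not the original implementations.
def quantize_loc_sketch(v, num_bins):
    # Map coordinates (already clipped to the bounding box) to bin indices 0..num_bins-1.
    v = (v - v.min()) / (v.max() - v.min())
    return np.minimum((v * num_bins).astype(int), num_bins - 1)

def count_cars_sketch(x_bins, y_bins, num_bins):
    # Count observations per grid cell; every cell is returned, so the day and
    # night outputs have equal length (consistent with how data_set_ids is sized
    # in the fragment above that consumes these values).
    counts = np.zeros((num_bins, num_bins))
    for xb, yb in zip(x_bins, y_bins):
        counts[int(xb), int(yb)] += 1
    locs = np.asarray([(i, j) for i in range(num_bins) for j in range(num_bins)])
    return locs, counts.ravel()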
def create_synthetic_slant_transfer():
    target_fun = lambda x: 2 * x
    source_fun = lambda x: 2.5 * x + 1
    data = create_synthetic_regression_transfer(target_fun, source_fun)
    s = synthetic_slant_file
    helper_functions.save_object(s, data)
def create_synthetic_curve_transfer():
    target_fun = lambda x: x ** 2
    source_fun = lambda x: x ** 2.5 + 1
    data = create_synthetic_regression_transfer(target_fun, source_fun)
    s = synthetic_curve_file
    helper_functions.save_object(s, data)
'''
for j in I:
    print date_strs[j]
'''
print 'num_items: ' + str(I.size)
print 'start: ' + date_strs[I[0]]
print 'end: ' + date_strs[I[-1]]
times_series_vals[times_series_vals < 0] = np.nan
'''
# for state in unique_states:
for state in unique_series_ids:
    is_in_state = np.asarray([s.find(state) == 0 for s in unique_series_ids])
    is_in_state = is_in_state.nonzero()[0]
    if is_in_state.size > 8:
        is_in_state = is_in_state[:8]
    # y_val = times_series_vals[is_in_state, :800, 1].T
    y_val = times_series_vals[is_in_state[0], :2000, :4]
    x_val = range(y_val.shape[0])
    # print unique_series_ids[to_use]
    for i, s in enumerate(unique_series_ids[is_in_state]):
        print str(i) + ': ' + s
    array_functions.plot_2d_sub_multiple_y(np.asarray(x_val), y_val, title=None, sizes=10)
'''
data = (times_series_vals, unique_series_ids)
helper_functions.save_object('processed_data.pkl', data)
def create_20ng_data(file_dir=''):
    newsgroups_train = datasets.fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
    data = data_class.Data()
    short_names = [
        # 0
        'A',
        # 1-5
        'C1', 'C2', 'C3', 'C4', 'C5',
        # 6
        'M',
        # 7-10
        'R1', 'R2', 'R3', 'R4',
        # 11-14
        'S1', 'S2', 'S3', 'S4',
        # 15
        'O',
        # 16-19
        'T1', 'T2', 'T3', 'T4'
    ]
    y = newsgroups_train.target
    # l = [1,2,7,8,12,17]
    # l = [1,2,7,8,12,13]
    # l = [0,1,2,3,4,5,7,8,9,10,11,12,13,14,16,17,18,19]
    l = [0, 1, 2, 7, 8, 11, 12, 16, 17]
    # l = [0, 1, 2, 3, 4, 7, 8, 9, 10,11,12,13,14,16,17,18,19]
    data.label_names = [short_names[i] for i in l]
    I = array_functions.false(len(newsgroups_train.target))
    for i in l:
        I = I | (y == i)
    # I = y == 1 | y == 2 | y == 7 | y == 7 | y == 11 | y == 16
    I = I.nonzero()[0]
    max_df = .5
    min_df = .01
    # max_df = .95
    # min_df = .001
    # max_df = .1
    # min_df = .01
    newsgroups_train.data = [newsgroups_train.data[i] for i in I]
    newsgroups_train.target = newsgroups_train.target[I]
    tf_idf = TfidfVectorizer(stop_words='english', max_df=max_df, min_df=min_df, max_features=max_features)
    vectors = tf_idf.fit_transform(newsgroups_train.data)
    feature_counts = (vectors > 0).sum(0)
    vocab = helper_functions.invert_dict(tf_idf.vocabulary_)
    num_feats = len(vocab)
    vocab = [vocab[i] for i in range(num_feats)]
    # pca = PCA(n_components=pca_feats)
    # v2 = pca.fit_transform(vectors.toarray())
    v2 = vectors.toarray()
    vectors = v2
    y = newsgroups_train.target.copy()
    '''
    y[y==7] = 1
    y[(y==2) | (y==8)] = 2
    y[(y==12) | (y==17)] = 3
    '''
    '''
    y[y == 2] = 1
    y[(y==7) | (y==8)] = 2
    y[(y==12) | (y==13)] = 3
    #I_f = (y==1) | (y==7) | (y==11) | (y==16)
    I_f = array_functions.true(vectors.shape[0])
    f = f_classif
    k_best = SelectKBest(score_func=f, k=pca_feats)
    v2 = k_best.fit_transform(vectors[I_f,:], y[I_f])
    k_best.transform(vectors)
    s = k_best.get_support()
    selected_vocab = [vocab[i] for i in s.nonzero()[0]]
    vocab = selected_vocab
    vectors = v2
    '''
    data.x = vectors
    data.y = newsgroups_train.target
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.is_regression = False
    data.feature_names = vocab
    class_counts = array_functions.histogram_unique(data.y)
    s = ng_raw_data_file
    if file_dir != '':
        s = file_dir + '/' + s
    helper_functions.save_object(s, data)
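# helper_functions.invert_dict is assumed to swap keys and values, turning the
# TfidfVectorizer term->index vocabulary into an index->term map; a one-line sketch:
def invert_dict_sketch(d):
    return dict((v, k) for k, v in d.items())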
def create_20ng_data(file_dir=""): newsgroups_train = datasets.fetch_20newsgroups(subset="train", remove=("headers", "footers", "quotes")) data = data_class.Data() short_names = [ # 0 "A", # 1-5 "C1", "C2", "C3", "C4", "C5", # 6 "M", # 7-10 "R1", "R2", "R3", "R4", # 11-14 "S1", "S2", "S3", "S4", # 15 "O", # 16-19 "T1", "T2", "T3", "T4", ] data.label_names = short_names y = newsgroups_train.target l = [1, 2, 7, 8, 12, 17] # l = [1,2,7,8,12,13] I = array_functions.false(len(newsgroups_train.target)) for i in l: I = I | (y == i) # I = y == 1 | y == 2 | y == 7 | y == 7 | y == 11 | y == 16 I = I.nonzero()[0] max_df = 0.95 min_df = 0.001 # max_df = .1 # min_df = .01 newsgroups_train.data = [newsgroups_train.data[i] for i in I] newsgroups_train.target = newsgroups_train.target[I] tf_idf = TfidfVectorizer(stop_words="english", max_df=max_df, min_df=min_df, max_features=max_features) vectors = tf_idf.fit_transform(newsgroups_train.data) feature_counts = (vectors > 0).sum(0) vocab = helper_functions.invert_dict(tf_idf.vocabulary_) num_feats = len(vocab) vocab = [vocab[i] for i in range(num_feats)] pca = PCA(n_components=pca_feats) v2 = pca.fit_transform(vectors.toarray()) vectors = v2 y = newsgroups_train.target.copy() """ y[y==7] = 1 y[(y==2) | (y==8)] = 2 y[(y==12) | (y==17)] = 3 """ """ y[y == 2] = 1 y[(y==7) | (y==8)] = 2 y[(y==12) | (y==13)] = 3 #I_f = (y==1) | (y==7) | (y==11) | (y==16) I_f = array_functions.true(vectors.shape[0]) f = f_classif k_best = SelectKBest(score_func=f, k=pca_feats) v2 = k_best.fit_transform(vectors[I_f,:], y[I_f]) k_best.transform(vectors) s = k_best.get_support() selected_vocab = [vocab[i] for i in s.nonzero()[0]] vocab = selected_vocab vectors = v2 """ data.x = vectors data.y = newsgroups_train.target data.set_defaults() data.is_regression = False data.feature_names = vocab class_counts = array_functions.histogram_unique(data.y) s = ng_raw_data_file if file_dir != "": s = file_dir + "/" + s helper_functions.save_object(s, data)
def run_main():
    import caffe
    adience_caffe_model_dir = 'C:\\Users\\Aubrey\\Desktop\\cnn_age_gender_models_and_data.0.0.2\\'
    age_net_pretrained = '/age_net.caffemodel'
    age_net_model_file = '/deploy_age.prototxt'
    age_net = caffe.Classifier(adience_caffe_model_dir + age_net_model_file,
                               adience_caffe_model_dir + age_net_pretrained,
                               channel_swap=(2, 1, 0),
                               raw_scale=255,
                               image_dims=(256, 256))
    age_list = ['(0, 2)', '(4, 6)', '(8, 12)', '(15, 20)', '(25, 32)', '(38, 43)', '(48, 53)', '(60, 100)']
    adience_image_dir = 'C:\\Users\\Aubrey\\Desktop\\adience_aligned\\aligned\\'
    adience_metadata_file = 'C:\\Users\\Aubrey\\Desktop\\adience_aligned\\alined_metadata\\all_photos.csv'
    metadata = create_data_set.load_csv(adience_metadata_file, dtype='string', delim='\t')
    column_names = metadata[0].tolist()
    photo_data = metadata[1]
    face_id_col = column_names.index('face_id')
    user_id_col = column_names.index('user_id')
    image_name_col = column_names.index('original_image')
    age_col = column_names.index('age')
    x = np.zeros((photo_data.shape[0], 512))
    y = np.zeros(photo_data.shape[0])
    id = np.zeros(photo_data.shape[0])
    i = 0
    last_perc_done = 0
    for idx, row in enumerate(photo_data):
        perc_done = math.floor(100 * float(idx) / len(photo_data))
        if perc_done > last_perc_done:
            last_perc_done = perc_done
            print str(perc_done) + '% done'
        image_dir = adience_image_dir + row[user_id_col] + '/'
        face_id = row[face_id_col]
        '''
        images_in_dir = os.listdir(image_dir)
        matching_images = [s for s in images_in_dir if s.find(row[image_name_col]) >= 0]
        assert len(matching_images) < 2
        if len(matching_images) == 0:
            print 'Skipping: ' + image
            continue
        '''
        image = image_dir + 'landmark_aligned_face.' + str(face_id) + '.' + row[image_name_col]
        if not os.path.isfile(image):
            print 'Skipping: ' + image
            continue
        input_image = caffe.io.load_image(image)
        age = row[age_col]
        blobs = ['fc7']
        features_age = predict_blobs(age_net, [input_image], blobs)
        x[i, :] = features_age
        y[i] = extract_age(age)
        id[i] = float(face_id)
        i += 1
    data = data_class.Data()
    data.x = x
    data.instance_ids = id
    data.y = y
    data.is_regression = True
    data.set_train()
    data.set_target()
    data.set_true_y()
    data_file = create_data_set.adience_aligned_cnn_file
    helper_functions.save_object('data_sets/' + data_file, data)
    print 'TODO'
def create_boston_housing(file_dir=''):
    boston_data = datasets.load_boston()
    data = data_class.Data()
    data.x = boston_data.data
    data.y = boston_data.target
    data.feature_names = list(boston_data.feature_names)
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.is_regression = True
    s = boston_housing_raw_data_file
    x = data.x
    y = data.y
    create_transfer_data = False
    create_y_split = True
    if create_y_split:
        from base import transfer_project_configs as configs_lib
        pc = configs_lib.ProjectConfigs()
        main_configs = configs_lib.MainConfigs(pc)
        learner = main_configs.learner
        learner.quiet = True
        learner.target_learner[0].quiet = True
        learner.source_learner.quiet = True
        learner.g_learner.quiet = False
        domain_ids = array_functions.bin_data(data.y, num_bins=2)
        data.data_set_ids = domain_ids
        data.is_train[:] = True
        corrs = []
        for i in range(x.shape[1]):
            corrs.append(scipy.stats.pearsonr(x[:, i], y)[0])
        learner.train_and_test(data)
        print 'Just playing with data - not meant to save it'
        for i, name in enumerate(data.feature_names):
            v = learner.g_learner.g[i]
            if abs(v) < 1e-6:
                v = 0
            print name + ': ' + str(v)
        exit()
    elif create_transfer_data:
        x_ind = 5
        domain_ind = 12
        domain_ids = np.ones(x.shape[0])
        domain_ids = array_functions.bin_data(x[:, domain_ind], num_bins=4)
        x = np.delete(x, domain_ind, 1)
        # viz_features(x,y,domain_ids,boston_data.feature_names)
        data.data_set_ids = domain_ids
        if boston_num_feats == 1:
            data.x = data.x[:, x_ind]
            data.x = array_functions.vec_to_2d(data.x)
            s = s % ''
        elif boston_num_feats >= data.x.shape[1]:
            data.x = array_functions.standardize(data.x)
            p = min(boston_num_feats, data.x.shape[1])
            s = s % ('-' + str(p))
        else:
            assert False
    else:
        s %= ''
    if file_dir != '':
        s = file_dir + '/' + s
    helper_functions.save_object(s, data)
import numpy as np
import scipy
from data_sets import create_data_set
from data import data as data_lib
from utility import helper_functions

file = 'SAheart.data.txt'
all_field_names, data = create_data_set.load_csv(file, has_field_names=True, dtype='string', delim=str(','))
data[data == 'Present'] = '1'
data[data == 'Absent'] = '0'
data = data[:, 1:]
data = data.astype(np.float)
data = data_lib.Data(data[:, :-1], data[:, -1])
data.set_train()
data.set_target()
helper_functions.save_object('raw_data.pkl', data)
print ''
    is_in_state = np.arange(i, i + 10)
    # y_val = times_series_vals[is_in_state, :800, 1].T
    y_val = times_series_vals[:, is_in_state[:], y_to_plot]
    x_val = range(y_val.shape[0])
    # print unique_series_ids[to_use]
    for i, s in enumerate(unique_series_ids[is_in_state]):
        print str(i) + ': ' + s
    array_functions.plot_2d_sub_multiple_y(np.asarray(x_val), y_val, title=None, sizes=10)
else:
    for i in range(times_series_vals.shape[1]):
        y_val = times_series_vals[:, i, :]
        x_val = np.arange(y_val.shape[0])
        if not np.isfinite(y_val).sum(0).all():
            print 'skipping - missing labels'
            continue
        print unique_series_ids[i]
        array_functions.plot_2d_sub_multiple_y(np.asarray(x_val), y_val, title=unique_series_ids[i], sizes=10)
data = (unique_locs, times_series_vals[:, :, y_to_use], unique_series_ids)
suffix = str(y_names[y_to_use])
if use_monthly:
    suffix += '-month'
s = '../climate'
if use_monthly:
    s += '-month'
# Include the suffix in the file name (as in the later revision of this block),
# so the computed suffix is actually used.
s += '/processed_data-' + suffix + '.pkl'
helper_functions.save_object(s, data)