def optimize(self, opt_data):
    #self.sigma_p = 1
    #self.sigma_y = 1
    #self.C = 1
    assert (opt_data.instances_to_keep is None or
            opt_data.instances_to_keep.sum() == 0), 'Not implemented yet!'
    W_p = density.compute_kernel(opt_data.X, None, self.sigma_p)
    W_y = array_functions.make_rbf(opt_data.X, self.sigma_y)
    n = W_p.shape[0]
    selected = array_functions.false(n)
    y_true = self.f_x
    p_true = self.p_x
    # Greedy forward selection: in each pass, add the instance that most lowers the selection score.
    for i in range(opt_data.subset_size):
        new_scores = np.zeros(n)
        new_scores[:] = np.inf
        for j in range(n):
            if selected[j]:
                continue
            b = array_functions.false(n)
            b[j] = True
            new_scores[j] = self.evaluate_selection(W_p, W_y, b | selected, y_true, p_true)
        best_idx = new_scores.argmin()
        selected[best_idx] = True
    self.selected = selected
    if selected.sum() < opt_data.subset_size:
        #print 'Empty clusters'
        pass
    #self.learned_distribution = compute_p(selected, opt_data)
    self.learned_distribution = selected
    self.optimization_value = 0
def optimize_for_data(self, W, num_to_select):
    selected = array_functions.false(W.shape[0])
    for i in range(num_to_select):
        new_scores = np.zeros(W.shape[0])
        new_scores[:] = -np.inf
        for j in range(W.shape[0]):
            if selected[j]:
                continue
            b = array_functions.false(W.shape[0])
            b[j] = True
            new_scores[j] = self.evaluate_selection(W, selected | b)
        best_idx = new_scores.argmax()
        selected[best_idx] = True
    return selected
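# A minimal, self-contained sketch of the same greedy forward-selection pattern used by
# optimize_for_data above, assuming only a precomputed similarity matrix W. The scoring
# rule here is a facility-location-style coverage objective chosen for illustration; the
# project's evaluate_selection may compute something different.
import numpy as np

def greedy_select(W, num_to_select):
    n = W.shape[0]
    selected = np.zeros(n, dtype=bool)
    for _ in range(num_to_select):
        scores = np.full(n, -np.inf)
        for j in np.flatnonzero(~selected):
            candidate = selected.copy()
            candidate[j] = True
            # coverage of every point by its most similar selected instance (assumed score)
            scores[j] = W[:, candidate].max(axis=1).sum()
        selected[scores.argmax()] = True
    return selected

# Usage sketch on a random RBF-like similarity matrix:
# X = np.random.rand(20, 2)
# D = ((X[:, None, :] - X[None, :, :]) ** 2).sum(-1)
# print(greedy_select(np.exp(-D), 3).nonzero()[0])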
def optimize(self, opt_data):
    assert opt_data.instances_to_keep is None, 'Not implemented yet!'
    W_x = array_functions.make_rbf(opt_data.X, self.sigma_x)
    W = W_x
    if not self.no_f_x:
        W_y = array_functions.make_rbf(opt_data.Y, self.sigma_y)
        W = W_x * W_y
    n = W.shape[0]
    selected = array_functions.false(W.shape[0])
    splits = [array_functions.true(n)]
    num_per_split = [opt_data.subset_size]
    if self.num_class_splits is not None:
        assert self.num_class_splits == 2
        I1 = opt_data.Y <= opt_data.Y.mean()
        splits = [I1, ~I1]
        num_per_split = [opt_data.subset_size // 2, opt_data.subset_size // 2]
    for split, num in zip(splits, num_per_split):
        W_split = W[np.ix_(split, split)]
        split_selections = self.optimize_for_data(W_split, num)
        split_inds = split.nonzero()[0]
        selected[split_inds[split_selections]] = True
    #selected = self.compute_centroids_for_spectral_clustering(W, cluster_inds)
    self.W = W
    self.selected = selected
    if selected.sum() < opt_data.subset_size:
        #print 'Empty clusters'
        pass
    #self.learned_distribution = compute_p(selected, opt_data)
    self.learned_distribution = selected
    self.optimization_value = 0
def create_synthetic_hypothesis_transfer(n=500, p=50, kt=1, ks=1, sigma=1.0, sigma_s=0.3):
    wt = np.random.normal(0, sigma, p)
    all_data, w_eff = create_synthetic_linear_classification(n=n, p=p, sigma=sigma, w=wt)
    x = all_data.x
    all_data.data_set_ids = np.zeros(n)
    wt = w_eff
    data_set_counter = 1
    diffs = []
    is_target = array_functions.false(kt + ks)
    is_target[:kt] = True
    all_data.true_w = np.zeros((ks + kt + 1, p))
    all_data.true_w[0, :] = wt
    for i, val in enumerate(is_target):
        data_set_id = data_set_counter
        data_set_counter += 1
        if val:
            ws = wt + np.random.normal(0, sigma_s, p)
            # NOTE: the next line overwrites the perturbation, so target tasks reuse wt exactly
            ws = wt
        else:
            ws = np.random.normal(0, sigma, p)
        source_data, ws = create_synthetic_linear_classification(w=ws, x=x)
        source_data.data_set_ids = data_set_id * np.ones(n)
        # source_data.true_y *= (i+2)
        source_data.y = source_data.true_y
        all_data.combine(source_data)
        diff = norm(wt / norm(wt) - ws / norm(ws))
        diffs.append(diff)
        all_data.true_w[data_set_id, :] = ws
    all_data.true_w = all_data.true_w.T
    all_data.metadata = dict()
    all_data.metadata["true_w"] = all_data.true_w
    s = synthetic_hypothesis_transfer_class_file % (
        str(n) + "-" + str(p) + "-" + str(sigma) + "-" + str(sigma_s) + "-" + str(kt) + "-" + str(ks)
    )
    helper_functions.save_object(s, all_data)
def get_min_range(self):
    ranges = self.get_series_range()
    min_range = ranges[:, 0].max()
    max_range = ranges[:, 1].min()
    I = array_functions.false(self.n)
    I[min_range:max_range + 1] = True
    return self.get_subset(I)
def split_data(file, configs):
    data = helper_functions.load_object(file)
    data.is_regression = configs.is_regression
    splitter = DataSplitter()
    splitData = data_lib.SplitData()
    splitData.data = data
    num_splits = 30
    perc_train = .8
    keep_for_splitting = None
    if configs.split_data_set_ids is not None:
        keep_for_splitting = array_functions.false(data.n)
        keep_for_splitting[data.data_set_ids == 0] = True
    #Pretend data_set_ids is a label vector to ensure each data set is split equally
    if data.is_regression and data.data_set_ids is not None:
        assert len(data.data_set_ids) == data.n
        is_regression = False
        splitData.splits = splitter.generate_splits(data.data_set_ids, num_splits, perc_train,
                                                    is_regression, keep_for_splitting)
    else:
        splitData.splits = splitter.generate_splits(data.y, num_splits, perc_train,
                                                    data.is_regression, keep_for_splitting)
    splitData.data_set_ids_to_keep = configs.data_set_ids_to_keep
    split_dir = os.path.dirname(file)
    save_file = split_dir + '/split_data.pkl'
    helper_functions.save_object(save_file, splitData)
    return splitData
def keep_subset(I, num_to_keep):
    inds = I.nonzero()[0]
    if num_to_keep > inds.size:
        return I
    inds_to_keep = np.random.choice(inds, num_to_keep, replace=False)
    v = array_functions.false(I.size)
    v[inds_to_keep] = True
    return v
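# A quick usage sketch for keep_subset above, on a hypothetical boolean mask (this assumes
# the project's array_functions module is importable): retain at most 3 of the True entries,
# chosen uniformly at random.
# mask = np.array([True, True, False, True, True, True])
# reduced = keep_subset(mask, 3)   # boolean mask with exactly 3 True entries, all within mask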
def sample_from_clusters(self, W, cluster_inds, num_samples):
    v, counts = np.unique(cluster_inds, return_counts=True)
    counts = counts.astype(float)
    frequency = counts / counts.sum()
    is_representative = array_functions.false(cluster_inds.size)
    for idx, freq in zip(v, frequency):
        if freq > 1.5 / v.size:
            is_representative[cluster_inds == idx] = True
    if not is_representative.any():
        is_representative[:] = True
    cluster_samples = np.random.choice(np.nonzero(is_representative)[0], num_samples, replace=False)
    return array_functions.make_vec_binary(cluster_samples, cluster_inds.size)
def subset_1_per_instance_id():
    data = helper_functions.load_object('data_sets/' + create_data_set.adience_aligned_cnn_file)
    to_keep = array_functions.false(data.n)
    all_ids = np.unique(data.instance_ids)
    for id in all_ids:
        has_id = (data.instance_ids == id).nonzero()[0]
        to_keep[has_id[0]] = True
    to_keep = to_keep & data.is_labeled
    data = data.get_subset(to_keep)
    helper_functions.save_object('data_sets/' + create_data_set.adience_aligned_cnn_1_per_instance_id_file, data)
def generate_splits(self, y, num_splits=30, perc_train=.8, is_regression=False, keep_for_splitting=None):
    assert y.ndim == 1
    keep_in_train_set = array_functions.false(len(y))
    if keep_for_splitting is not None and len(keep_for_splitting) > 0:
        keep_in_train_set[~keep_for_splitting] = True
        #keep_in_train_set[~array_functions.to_boolean(keep_for_splitting)] = True
    is_labeled = ~np.isnan(y)
    keep_in_train_set[~is_labeled] = True
    n = len(y)
    #if keep_for_splitting is not None:
    #    y_for_split = y[keep_for_splitting]
    #    target_inds = keep_for_splitting.nonzero()[0]
    y_for_split = y[~keep_in_train_set]
    n_for_split = len(y_for_split)
    inds_for_splitting = (~keep_in_train_set).nonzero()[0]
    random_state = None
    if is_regression:
        split = cross_validation.ShuffleSplit(n_for_split, num_splits, 1 - perc_train,
                                              random_state=random_state)
    else:
        split = cross_validation.StratifiedShuffleSplit(y_for_split, num_splits, 1 - perc_train,
                                                        random_state=random_state)
    splits = []
    for train, test in split:
        s = data_lib.Split(n)
        s.is_train[:] = True
        s.is_train[inds_for_splitting[test]] = False
        '''
        if keep_for_splitting is not None:
            s.is_train[:] = True
            s.is_train[target_inds[test]] = False
        else:
            s.is_train[train] = True
            s.is_train[test] = False
        '''
        s.permutation = np.random.permutation(n)
        splits.append(s)
    return splits
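# generate_splits above uses the legacy sklearn.cross_validation API (splitters constructed
# with counts and iterated directly), which newer scikit-learn releases no longer ship.
# A minimal sketch of the equivalent with the current sklearn.model_selection API, assuming
# a 1-D label vector y; the function name is just for illustration.
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

def modern_stratified_splits(y, num_splits=30, perc_train=0.8, random_state=None):
    splitter = StratifiedShuffleSplit(n_splits=num_splits, test_size=1 - perc_train,
                                      random_state=random_state)
    is_train_masks = []
    for train_idx, test_idx in splitter.split(np.zeros(len(y)), y):
        is_train = np.zeros(len(y), dtype=bool)
        is_train[train_idx] = True
        is_train_masks.append(is_train)
    return is_train_masks

# Usage sketch:
# y = np.array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2])
# masks = modern_stratified_splits(y, num_splits=3)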
def create_sampling_distribution(self, base_learner, data, fold_results):
    is_train_unlabeled = data.is_train & (~data.is_labeled)
    is_train_labeled = data.is_train & data.is_labeled
    inds = np.nonzero(is_train_unlabeled)[0]
    inds = inds[:50]
    I = array_functions.false(data.n)
    I[inds] = True
    x = data.x[I, :]
    x_labeled = data.x[is_train_labeled, :]
    if self.use_labeled:
        x_all = np.vstack((x, x_labeled))
        self.transform.fit(x_all)
        x = self.transform.transform(x)
        x_labeled = self.transform.transform(x_labeled)
    else:
        x = self.transform.fit_transform(x)
    C = base_learner.params['alpha']
    n = I.sum()
    t0 = np.zeros((n, 1))
    opt_data = OptimizationData(x, C)
    if self.use_labeled:
        opt_data.x_labeled = x_labeled
    constraints = [
        {'type': 'eq', 'fun': lambda t: t.sum() - 1},
        {'type': 'ineq', 'fun': lambda t: t},
    ]
    options = {}
    results = optimize.minimize(
        lambda t: eval_oed(t, opt_data),
        t0,
        method='SLSQP',
        jac=None,
        options=options,
        constraints=constraints
    )
    if results.success:
        t = results.x
    else:
        print('OED Optimization failed')
        t = np.ones(n)
    # clip negatives, smooth, and renormalize into a sampling distribution
    t[t < 0] = 0
    t += 1e-4
    t /= t.sum()
    return t, inds
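# A self-contained sketch of the constrained SLSQP setup used above: minimize an objective
# over a probability vector t with t >= 0 and sum(t) == 1. The real objective (eval_oed) is
# project-specific; the quadratic toy objective below is only a stand-in.
import numpy as np
from scipy import optimize

def toy_objective(t):
    # linear cost pushes mass toward the first entries; the quadratic term is a mild regularizer
    weights = np.arange(len(t), dtype=float)
    return np.dot(t, weights) + 0.1 * np.dot(t, t)

n = 5
t0 = np.full(n, 1.0 / n)
constraints = [
    {'type': 'eq', 'fun': lambda t: t.sum() - 1},   # probabilities sum to one
    {'type': 'ineq', 'fun': lambda t: t},           # elementwise nonnegativity
]
result = optimize.minimize(toy_objective, t0, method='SLSQP', constraints=constraints)
# result.x is the optimized distribution; result.success indicates whether SLSQP converged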
def rand_sample(self, perc=.1, to_sample=None):
    if to_sample is None:
        to_sample = array_functions.true(self.n)
    if to_sample.dtype != 'bool':
        I = array_functions.false(self.n)
        I[to_sample] = True
        to_sample = I
    to_keep = (~to_sample).nonzero()[0]
    to_sample = to_sample.nonzero()[0]
    p = np.random.permutation(to_sample.shape[0])
    m = int(np.ceil(perc * p.shape[0]))
    to_use = to_sample[p[:m]]
    to_use = np.hstack((to_use, to_keep))
    return self.get_subset(to_use)
def transform(self, data):
    should_add_noise = array_functions.false(data.n)
    for i in range(self.num_clusters):
        idx = np.random.choice(data.n)
        cluster_inds = array_functions.find_knn(data.x, data.x[idx], k=self.n_per_cluster)
        should_add_noise[cluster_inds] = True
    if self.save_y_orig:
        data.y_orig = data.true_y.copy()
    if should_add_noise.any():
        if self.flip_labels:
            data.flip_label(should_add_noise)
        else:
            data.true_y[should_add_noise] += self.y_offset
            data.y[should_add_noise] += self.y_offset
    data.is_noisy = should_add_noise
    return data
def create_20ng_data(file_dir=''):
    newsgroups_train = datasets.fetch_20newsgroups(subset='train',
                                                   remove=('headers', 'footers', 'quotes'))
    data = data_class.Data()
    short_names = [
        #0
        'A',
        #1-5
        'C1', 'C2', 'C3', 'C4', 'C5',
        #6
        'M',
        #7-10
        'R1', 'R2', 'R3', 'R4',
        #11-14
        'S1', 'S2', 'S3', 'S4',
        #15
        'O',
        #16-19
        'T1', 'T2', 'T3', 'T4'
    ]
    y = newsgroups_train.target
    #l = [1,2,7,8,12,17]
    #l = [1,2,7,8,12,13]
    #l = [0,1,2,3,4,5,7,8,9,10,11,12,13,14,16,17,18,19]
    l = [0, 1, 2, 7, 8, 11, 12, 16, 17]
    #l = [0, 1, 2, 3, 4, 7, 8, 9, 10,11,12,13,14,16,17,18,19]
    data.label_names = [short_names[i] for i in l]
    I = array_functions.false(len(newsgroups_train.target))
    for i in l:
        I = I | (y == i)
    #I = y == 1 | y == 2 | y == 7 | y == 7 | y == 11 | y == 16
    I = I.nonzero()[0]
    max_df = .5
    min_df = .01
    #max_df = .95
    #min_df = .001
    #max_df = .1
    #min_df = .01
    newsgroups_train.data = [newsgroups_train.data[i] for i in I]
    newsgroups_train.target = newsgroups_train.target[I]
    tf_idf = TfidfVectorizer(stop_words='english', max_df=max_df, min_df=min_df,
                             max_features=max_features)
    vectors = tf_idf.fit_transform(newsgroups_train.data)
    feature_counts = (vectors > 0).sum(0)
    vocab = helper_functions.invert_dict(tf_idf.vocabulary_)
    num_feats = len(vocab)
    vocab = [vocab[i] for i in range(num_feats)]
    #pca = PCA(n_components=pca_feats)
    #v2 = pca.fit_transform(vectors.toarray())
    v2 = vectors.toarray()
    vectors = v2
    y = newsgroups_train.target.copy()
    '''
    y[y==7] = 1
    y[(y==2) | (y==8)] = 2
    y[(y==12) | (y==17)] = 3
    '''
    '''
    y[y == 2] = 1
    y[(y==7) | (y==8)] = 2
    y[(y==12) | (y==13)] = 3
    #I_f = (y==1) | (y==7) | (y==11) | (y==16)
    I_f = array_functions.true(vectors.shape[0])
    f = f_classif
    k_best = SelectKBest(score_func=f, k=pca_feats)
    v2 = k_best.fit_transform(vectors[I_f,:], y[I_f])
    k_best.transform(vectors)
    s = k_best.get_support()
    selected_vocab = [vocab[i] for i in s.nonzero()[0]]
    vocab = selected_vocab
    vectors = v2
    '''
    data.x = vectors
    data.y = newsgroups_train.target
    data.set_train()
    data.set_target()
    data.set_true_y()
    data.is_regression = False
    data.feature_names = vocab
    class_counts = array_functions.histogram_unique(data.y)
    s = ng_raw_data_file
    if file_dir != '':
        s = file_dir + '/' + s
    helper_functions.save_object(s, data)
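# A minimal standalone sketch of the 20-newsgroups loading and TF-IDF step that
# create_20ng_data builds on, restricted to two example categories. The project helpers
# (data_class.Data, array_functions, helper_functions) are not needed for this part, and
# the category names and thresholds below are assumptions chosen for illustration.
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

categories = ['comp.graphics', 'rec.autos']
train = fetch_20newsgroups(subset='train', categories=categories,
                           remove=('headers', 'footers', 'quotes'))
tf_idf = TfidfVectorizer(stop_words='english', max_df=0.5, min_df=0.01)
X = tf_idf.fit_transform(train.data)        # sparse matrix, (n_documents, n_terms)
y = train.target                            # integer class labels
vocab = tf_idf.get_feature_names_out()      # term for each column (modern sklearn accessor)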
region_ids_centroids = np.asarray(centroids_data.RegionID)
region_ids_centroids = region_ids_centroids.astype(int)
pricing_data = string_data.values[:, [year1_idx, year2_idx]]
pricing_data = vec_replace(pricing_data).astype(float)
#I_data = np.argsort(region_ids_data)
I_centroids = np.argsort(region_ids_centroids)
#r_data_sorted = region_ids_data[I_data]
r_centroids_sorted = region_ids_centroids[I_centroids]
#assert (r_data_sorted == r_centroids_sorted).all()
centroid_x = np.asarray(centroids_data.X).astype(float)
centroid_y = np.asarray(centroids_data.Y).astype(float)
locs = np.stack((centroid_x, centroid_y), 1)
locs = locs[I_centroids, :]
ca_pricing_data = np.zeros((centroid_x.shape[0], 2))
has_data = array_functions.false(ca_pricing_data.shape[0])
for i, id in enumerate(r_centroids_sorted):
    if (id == region_ids_data).sum() == 1:
        ca_pricing_data[i, :] = pricing_data[id == region_ids_data, :]
        has_data[i] = True
locs = locs[has_data, :]
locations = locs
pricing_data = ca_pricing_data[has_data, :]
I = np.isfinite(pricing_data[:, 0]) & np.isfinite(pricing_data[:, 1])
I &= array_functions.in_range(locations[:, 0], day_locs[:, 0].min(), day_locs[:, 0].max())
I &= array_functions.in_range(locations[:, 1], day_locs[:, 1].min(), day_locs[:, 1].max())
#I &= array_functions.in_range(locations[:,0], -123, -121)
#I &= array_functions.in_range(locations[:,1], 37, 39)
#I &= (state == 'OR')
        dtype='str',
        delim=',',
        num_rows=1000000000
    )
    inds_to_use = np.asarray([j for j in range(feat_names_curr.size)
                              if feat_names_curr[j] in feats_to_keep])
    assert inds_to_use.size == len(feats_to_keep)
    data_curr = data_curr[:, inds_to_use]
    feat_names_curr = feat_names_curr[inds_to_use]
    if i == 0:
        feat_names = feat_names_curr
        data = data_curr
        continue
    unique_stations = np.unique(data[:, find_first_element(feat_names, 'STATION')].astype(str))
    curr_stations = data_curr[:, find_first_element(feat_names, 'STATION')].astype(str)
    to_remove = array_functions.false(data_curr.shape[0])
    for s in np.unique(curr_stations):
        if s not in unique_stations:
            continue
        print('Found repeated station, removing: ' + s)
        to_remove = to_remove | (curr_stations == s)
    data = np.vstack((data, data_curr[~to_remove, :]))

y_names = ['TAVG', 'TMIN', 'TMAX', 'PRCP']
y_inds = []
for name in y_names:
    y_inds.append(array_functions.find_first_element(feat_names, name))
date_strs = data[:, find_first_element(feat_names, 'DATE')]
prev = ''
date_str_to_idx = dict()
date_ids = np.zeros(data.shape[0])
for i, date_str in enumerate(date_strs):
def get_range(self, y_range):
    I = array_functions.false(self.n)
    I[y_range[0]:y_range[1]] = True
    return self.get_subset(I)
def get_nth(self, n):
    I = array_functions.false(self.n)
    I[::n] = True
    return self.get_subset(I)
'''
for d in unique_dates:
    times_series_vals[d,i] = y[I[dates_idx == d]].mean()
'''
'''
for j in I:
    print date_strs[j]
'''
'''
print 'num_items: ' + str(I.size)
print 'start: ' + date_strs[I[0]]
print 'end: ' + date_strs[I[-1]]
'''
has_loc = array_functions.false(unique_series_ids.size)
for i, id in enumerate(unique_series_ids):
    has_loc[i] = id in station_names
times_series_vals = times_series_vals[:, has_loc]
unique_series_ids = unique_series_ids[has_loc]
date_idx = 0
for i in range(0, num_days, 5):
    x = station_locs
    y = times_series_vals[i:120:28, :]
    array_functions.plot_heatmap(x, y.T, title=None, sizes=30)
data = (times_series_vals, unique_series_ids)
helper_functions.save_object('processed_data.pkl', data)
def create_20ng_data(file_dir=""): newsgroups_train = datasets.fetch_20newsgroups(subset="train", remove=("headers", "footers", "quotes")) data = data_class.Data() short_names = [ # 0 "A", # 1-5 "C1", "C2", "C3", "C4", "C5", # 6 "M", # 7-10 "R1", "R2", "R3", "R4", # 11-14 "S1", "S2", "S3", "S4", # 15 "O", # 16-19 "T1", "T2", "T3", "T4", ] data.label_names = short_names y = newsgroups_train.target l = [1, 2, 7, 8, 12, 17] # l = [1,2,7,8,12,13] I = array_functions.false(len(newsgroups_train.target)) for i in l: I = I | (y == i) # I = y == 1 | y == 2 | y == 7 | y == 7 | y == 11 | y == 16 I = I.nonzero()[0] max_df = 0.95 min_df = 0.001 # max_df = .1 # min_df = .01 newsgroups_train.data = [newsgroups_train.data[i] for i in I] newsgroups_train.target = newsgroups_train.target[I] tf_idf = TfidfVectorizer(stop_words="english", max_df=max_df, min_df=min_df, max_features=max_features) vectors = tf_idf.fit_transform(newsgroups_train.data) feature_counts = (vectors > 0).sum(0) vocab = helper_functions.invert_dict(tf_idf.vocabulary_) num_feats = len(vocab) vocab = [vocab[i] for i in range(num_feats)] pca = PCA(n_components=pca_feats) v2 = pca.fit_transform(vectors.toarray()) vectors = v2 y = newsgroups_train.target.copy() """ y[y==7] = 1 y[(y==2) | (y==8)] = 2 y[(y==12) | (y==17)] = 3 """ """ y[y == 2] = 1 y[(y==7) | (y==8)] = 2 y[(y==12) | (y==13)] = 3 #I_f = (y==1) | (y==7) | (y==11) | (y==16) I_f = array_functions.true(vectors.shape[0]) f = f_classif k_best = SelectKBest(score_func=f, k=pca_feats) v2 = k_best.fit_transform(vectors[I_f,:], y[I_f]) k_best.transform(vectors) s = k_best.get_support() selected_vocab = [vocab[i] for i in s.nonzero()[0]] vocab = selected_vocab vectors = v2 """ data.x = vectors data.y = newsgroups_train.target data.set_defaults() data.is_regression = False data.feature_names = vocab class_counts = array_functions.histogram_unique(data.y) s = ng_raw_data_file if file_dir != "": s = file_dir + "/" + s helper_functions.save_object(s, data)
def create_sampling_distribution(self, base_learner, data, fold_results):
    cluster_scale = self.cluster_scale
    source_learner = deepcopy(self.base_learner)
    source_data = data.get_transfer_subset(self.configs.source_labels)
    if source_data.n > 1000:
        source_data = source_data.rand_sample(.2)
        print('subsampling source data: ' + str(source_data.n))
    if source_data.is_regression:
        source_data.data_set_ids[:] = self.configs.target_labels[0]
    else:
        source_data.change_labels(self.configs.source_labels, self.configs.target_labels)
    tic()
    source_learner.train_and_test(source_data)
    print('train source time: ' + toc_str())
    target_data = data.get_transfer_subset(self.configs.target_labels, include_unlabeled=True)
    y_pred = source_learner.predict(data).y
    if self.use_oracle_target:
        target_learner = deepcopy(self.base_learner)
        oracle_target_data = deepcopy(target_data)
        oracle_target_data.y = oracle_target_data.true_y
        oracle_target_data.is_train[:] = True
        target_learner.train_and_test(oracle_target_data)
        y_pred_target = target_learner.predict(data).y
        y_pred = y_pred_target
    if self.use_oracle_labels:
        y_pred = data.true_y.copy()
    n_items = self.configs.active_items_per_iteration
    I = data.is_train
    if not self.use_warm_start:
        I &= ~data.is_labeled
    if self.configs.target_labels is not None:
        I &= data.get_transfer_inds(self.configs.target_labels)
    I = I.nonzero()[0]
    if self.max_items_for_instance_selection is not None and \
            I.size > self.max_items_for_instance_selection:
        I = np.random.choice(I, self.max_items_for_instance_selection, replace=False)
        print('subsampling target data: ' + str(I.size))
    labeled_target_data = deepcopy(data.get_subset(I))
    instances_to_keep = labeled_target_data.is_labeled
    labeled_target_data.set_train()
    labeled_target_data.is_noisy = array_functions.false(labeled_target_data.n)
    labeled_target_data.y = y_pred[I].copy()
    labeled_target_data.true_y = y_pred[I].copy()
    labeled_target_data.y_orig = y_pred[I].copy()
    labeled_target_data.instances_to_keep = instances_to_keep
    #labeled_target_data.y_orig = labeled_target_data.true_y.copy()
    if self.use_instance_selection:
        self.instance_selector.subset_size = n_items
        self.instance_selector.num_samples = n_items
        self.instance_selector.configs.use_validation = False
        self.instance_selector.configs.use_training = True
        self.instance_selector.train_and_test(labeled_target_data)
        is_selected = self.instance_selector.predict(labeled_target_data).is_selected
        scores = np.ones(is_selected.size)
        #Lower score is better
        scores[is_selected] = 0
        scores_sorted_inds = np.argsort(scores)
        print('')
    elif self.use_density:
        target_learner = deepcopy(self.base_learner)
        target_learner.train_and_test(labeled_target_data)
        vars = self.estimate_variance(target_learner, labeled_target_data)
        densities = self.estimate_density(labeled_target_data)
    else:
        X_sub = data.x[I, :]
        tic()
        X_cluster_space, cluster_ids = self.create_clustering(
            X_sub, int(cluster_scale * self.configs.active_items_per_iteration))
        print('cluster target time: ' + toc_str())
        vars, cluster_n = self.get_cluster_purity(
            cluster_ids, y_pred[I], not target_data.is_regression)
        true_vars, true_cluster_n = self.get_cluster_purity(
            cluster_ids, data.true_y[I], not target_data.is_regression)
        if self.use_target_variance:
            vars = true_vars
        centroid_idx = self.get_cluster_centroids(X_cluster_space)
        densities = cluster_n
    if self.use_instance_selection:
        pass
    else:
        scores = vars / densities
        scores_sorted_inds = np.argsort(scores)
    # Don't sample instances if cluster size is 1
    if not self.use_density and not self.use_instance_selection:
        # note: scores_sorted_inds was computed above, so this assignment does not reorder to_use
        scores[cluster_n <= .005 * I.size] = np.inf
        to_use = centroid_idx[scores_sorted_inds[:n_items]]
    else:
        to_use = scores_sorted_inds[:n_items]
    if self.transfer_hyperparameters:
        target_learner = deepcopy(self.base_learner)
        target_learner.configs.use_validation = True
        labeled_target_data.y[~is_selected] = np.nan
        target_learner.train_and_test(labeled_target_data)
        self.base_learner.base_learner.cv_params = {'unused': [0]}
        self.base_learner.base_learner.best_params = target_learner.base_learner.best_params
        self.base_learner.base_learner.set_params(**target_learner.base_learner.best_params)
    d = np.zeros(data.y.shape)
    d[I[to_use]] = 1
    d = d / d.sum()
    return d, d.size
    d = datetime.date(year, month, day)
    return d

feat_names, data = create_data_set.load_csv(
    file_name,
    True,
    dtype='str',
    delim=',',
    #num_rows=40000
    num_rows=100000000000
)
y_names = [s + ' Mean' for s in ['NO2', 'O3', 'SO2', 'CO']]
y_inds = []
for name in y_names:
    y_inds.append(array_functions.find_first_element(feat_names, name))
to_keep = array_functions.false(data.shape[0])
date_strs = data[:, find_first_element(feat_names, 'Date Local')]
prev = ''
date_str_to_idx = dict()
date_ids = np.zeros(data.shape[0])
for i, date_str in enumerate(date_strs):
    date_obj = to_date(date_str)
    date_str_to_idx[date_str] = date_obj.toordinal()
    date_ids[i] = date_obj.toordinal()
    if prev != date_str:
        to_keep[i] = True
    prev = date_str
data = data[to_keep, :]
date_strs = date_strs[to_keep]
date_ids = date_ids.astype(int)
date_ids = date_ids[to_keep]
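# A small self-contained sketch of the date-to-ordinal mapping used above: parse a date
# string and use datetime.date.toordinal() as an integer day id, so consecutive days differ
# by exactly 1. The 'YYYY-MM-DD' format is an assumption for illustration; the real CSVs
# may use a different layout.
import datetime

def to_ordinal(date_str):
    year, month, day = [int(v) for v in date_str.split('-')]
    return datetime.date(year, month, day).toordinal()

# Usage sketch:
# to_ordinal('2016-01-02') - to_ordinal('2016-01-01') == 1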