def get_transfer_inds(self, labels_or_ids):
    if labels_or_ids is None:
        return array_functions.true(self.n)
    if self.is_regression:
        return array_functions.find_set(self.data_set_ids, labels_or_ids)
    else:
        return array_functions.find_set(self.true_y, labels_or_ids)
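# Sketch of the membership lookup that array_functions.find_set presumably
# performs (a guess, not the repo's implementation): np.isin gives a boolean
# mask over data_set_ids for a set of wanted IDs.
import numpy as np

data_set_ids = np.array([0, 0, 1, 2, 1, 2])
wanted = [1, 2]
mask = np.isin(data_set_ids, wanted)  # [False False  True  True  True  True]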
def reveal_labels(self, inds=None):
    if inds is None:
        assert False, 'Is this a good way of doing this? Wouldn\'t "None" imply nothing should be revealed?'
        inds = array_functions.true(self.n)
    # If inds are pairwise relationships
    try:
        # Old instances may be missing 'pairwise_relationships', or it may be a list
        if not hasattr(self, 'pairwise_relationships') or len(self.pairwise_relationships) == 0:
            self.pairwise_relationships = set()
        assert inds.shape[1] == 2
        for x1, x2 in inds:
            # Skip pairs that were already revealed, in either order
            if len({(x1, x2), (x2, x1)} & self.pairwise_relationships) > 0:
                continue
            # Order each pair so the instance with the smaller true label comes first
            item = (x2, x1)
            if self.true_y[x1] <= self.true_y[x2]:
                item = (x1, x2)
            self.pairwise_relationships.add(item)
    # If inds index instances directly, reveal their true labels
    except (TypeError, IndexError):
        self.y[inds] = self.true_y[inds]
    # If 'pairwise' data has a number of indices other than 2
    except AssertionError:
        assert False, 'Number of inds for pairwise data must be 2'
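# Standalone sketch of the pairwise-ordering logic above, with made-up
# true_y and pairs: each revealed pair is stored smaller-label-first, and
# pairs already known in either order are skipped.
import numpy as np

true_y = np.array([3.0, 1.0, 2.0])
pairs = np.array([[0, 1], [1, 2], [1, 0]])  # (1, 0) repeats (0, 1)
known = set()
for x1, x2 in pairs:
    if {(x1, x2), (x2, x1)} & known:
        continue
    known.add((x1, x2) if true_y[x1] <= true_y[x2] else (x2, x1))
print(known)  # e.g. {(1, 0), (1, 2)}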
def eval_variance(data, a):
    # Leave-one-out estimate of squared prediction error for the feature
    # weighting a, plus a ridge penalty on a.
    x, y = data.get_xy()
    C, C2, C3 = data.get_reg()
    n = x.shape[0]
    p = x.shape[1]
    t = StandardScaler()
    loss = 0
    for i in range(n):
        # Hold out instance i, fit on the rest
        I = array_functions.true(n)
        I[i] = False
        xi = x[i, :]
        yi = y[i]
        xmi = t.fit_transform(x[I, :])
        ymi = y[I]
        bi = ymi.mean()
        w = MixedFeatureGuidanceMethod.solve_w(a, xmi, ymi, C)
        # Note: xi is not rescaled with the statistics fit on the held-in fold
        loss += (xi.T.dot(w) + bi - yi) ** 2
    loss = loss / n
    reg = C2 * norm(a) ** 2
    return loss + reg
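# Minimal leave-one-out sketch of the loss above, with a plain ridge solve
# standing in for MixedFeatureGuidanceMethod.solve_w (an assumption; the
# repo's solver also uses the feature weights a). Unlike the code above,
# the held-out xi is rescaled with the fold's statistics here.
import numpy as np

def loo_ridge_loss(x, y, C):
    n, p = x.shape
    loss = 0.0
    for i in range(n):
        I = np.ones(n, dtype=bool)
        I[i] = False
        mu = x[I].mean(axis=0)
        sd = x[I].std(axis=0) + 1e-12
        xs = (x[I] - mu) / sd
        bi = y[I].mean()
        # Ridge solution on the centered targets
        w = np.linalg.solve(xs.T.dot(xs) + C * np.eye(p), xs.T.dot(y[I] - bi))
        xi = (x[i] - mu) / sd
        loss += (xi.dot(w) + bi - y[i]) ** 2
    return loss / n

rng = np.random.default_rng(0)
x = rng.normal(size=(20, 3))
y = x.dot([1.0, -2.0, 0.5]) + rng.normal(scale=.1, size=20)
print(loo_ridge_loss(x, y, C=1.0))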
def optimize(self, opt_data):
    assert opt_data.instances_to_keep is None, 'Not implemented yet!'
    # Similarity over X, optionally multiplied by similarity over Y
    W_x = array_functions.make_rbf(opt_data.X, self.sigma_x)
    W = W_x
    if not self.no_f_x:
        W_y = array_functions.make_rbf(opt_data.Y, self.sigma_y)
        W = W_x * W_y
    n = W.shape[0]
    selected = array_functions.false(n)
    splits = [array_functions.true(n)]
    num_per_split = [opt_data.subset_size]
    if self.num_class_splits is not None:
        assert self.num_class_splits == 2
        # Split instances at the mean label and select half the subset from each side
        I1 = opt_data.Y <= opt_data.Y.mean()
        splits = [I1, ~I1]
        num_per_split = [opt_data.subset_size // 2, opt_data.subset_size // 2]
    for split, num in zip(splits, num_per_split):
        W_split = W[np.ix_(split, split)]
        split_selections = self.optimize_for_data(W_split, num)
        split_inds = split.nonzero()[0]
        selected[split_inds[split_selections]] = True
    self.W = W
    self.selected = selected
    if selected.sum() < opt_data.subset_size:
        # Some clusters were empty, so fewer instances were selected
        pass
    self.learned_distribution = selected
    self.optimization_value = 0
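# Sketch of the combined affinity built above (a guess at what
# array_functions.make_rbf does, with made-up sigmas): an RBF kernel over X,
# elementwise-multiplied by an RBF kernel over Y.
import numpy as np

def make_rbf(v, sigma):
    v = np.atleast_2d(np.asarray(v, dtype=float))
    if v.shape[0] == 1:
        v = v.T  # treat a 1-d input as a column of scalars
    sq_dists = ((v[:, None, :] - v[None, :, :]) ** 2).sum(axis=2)
    return np.exp(-sq_dists / (2 * sigma ** 2))

X = np.random.rand(10, 2)
Y = np.random.rand(10)
W = make_rbf(X, sigma=.5) * make_rbf(Y, sigma=.5)  # similar in both X and Y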
def transform(self, data):
    assert not data.is_regression
    y = data.classes
    to_keep = array_functions.true(data.n)
    if self.num_to_select is not None:
        # For each class, keep only the first ceil(count * perc) instances
        for yi, perc in zip(y, self.num_to_select):
            inds = (data.y == yi).nonzero()[0]
            n_to_keep = int(np.ceil(inds.size * perc))
            to_keep[inds[n_to_keep:]] = False
    data = data.get_subset(to_keep)
    return data
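# Standalone sketch of the per-class subsampling above, on made-up labels:
# keep the first ceil(count * perc) instances of each class.
import numpy as np

y = np.array([0, 0, 0, 0, 1, 1, 1, 1])
num_to_select = [.5, .25]  # per-class fractions, in class order
to_keep = np.ones(y.size, dtype=bool)
for yi, perc in zip(np.unique(y), num_to_select):
    inds = (y == yi).nonzero()[0]
    n_to_keep = int(np.ceil(inds.size * perc))
    to_keep[inds[n_to_keep:]] = False
print(to_keep.nonzero()[0])  # [0 1 4]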
def rand_sample(self, perc=.1, to_sample=None):
    if to_sample is None:
        to_sample = array_functions.true(self.n)
    if to_sample.dtype != 'bool':
        # Convert an index array to a boolean mask
        I = array_functions.false(self.n)
        I[to_sample] = True
        to_sample = I
    to_keep = (~to_sample).nonzero()[0]
    to_sample = to_sample.nonzero()[0]
    # Randomly keep a perc fraction of the sampleable instances; always keep the rest
    p = np.random.permutation(to_sample.shape[0])
    m = int(np.ceil(perc * p.shape[0]))
    to_use = to_sample[p[:m]]
    to_use = np.hstack((to_use, to_keep))
    return self.get_subset(to_use)
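# Sketch of the sampling pattern above on made-up sizes: permute the
# sampleable indices, keep a perc fraction, and always keep everything else.
import numpy as np

n, perc = 10, .3
to_sample = np.zeros(n, dtype=bool)
to_sample[:6] = True                 # only the first 6 may be dropped
to_keep = (~to_sample).nonzero()[0]
cand = to_sample.nonzero()[0]
p = np.random.permutation(cand.size)
m = int(np.ceil(perc * cand.size))   # int() matters: slice bounds must be integers
to_use = np.hstack((cand[p[:m]], to_keep))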
def add_noise(self, noise_rate, I=None, classes=None):
    assert self.is_regression is not None
    assert not self.is_regression
    if I is None:
        I = array_functions.true(self.n)
    if classes is None:
        classes = self.classes
    # Pick instances to corrupt, then flip each to a uniformly random other class
    to_switch = np.random.rand(self.n) <= noise_rate
    to_switch = to_switch.nonzero()[0]
    for i in to_switch:
        if not I[i]:
            continue
        old_y = self.y[i]
        y_ind = classes == old_y
        assert y_ind.any()
        p = np.ones(len(classes)) * 1.0 / (len(classes) - 1)
        p[classes == old_y] = 0
        new_y = np.random.choice(classes, p=p)
        self.y[i] = new_y
        self.true_y[i] = new_y
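# Standalone sketch of the label flipping above, on made-up data: each
# corrupted instance is reassigned uniformly among the *other* classes.
import numpy as np

rng = np.random.default_rng(0)
classes = np.array([0, 1, 2])
y = rng.choice(classes, size=100)
noise_rate = .2
for i in (rng.random(y.size) <= noise_rate).nonzero()[0]:
    p = np.ones(classes.size) / (classes.size - 1)
    p[classes == y[i]] = 0  # never "flip" a label to itself
    y[i] = rng.choice(classes, p=p)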
def __init__(self, n=0):
    self.permutation = np.empty(n)
    self.is_train = array_functions.true(n)
def set_train(self):
    self.is_train = array_functions.true(self.n)
def select_all(data):
    return array_functions.true(data.n)
feat_names, data = create_data_set.load_csv(
    file_name,
    True,
    dtype='str',
    delim=',',
    #num_rows=40000
    num_rows=100000000000  # effectively no row limit
)
y_names = ['Value']
y_inds = []
for name in y_names:
    y_inds.append(array_functions.find_first_element(feat_names, name))
# Convert 'YYYYMM' date strings to ordinal day IDs, dropping the yearly
# '13' summary rows and rows with missing values
date_strs = data[:, find_first_element(feat_names, 'YYYYMM')]
date_str_to_idx = dict()
date_ids = np.zeros(data.shape[0])
to_keep = array_functions.true(date_strs.shape[0])
for i, date_str in enumerate(date_strs):
    if date_str[4:] == '13' or data[i, y_inds] == 'Not Available':
        to_keep[i] = False
        continue
    date_obj = to_date(date_str)
    date_str_to_idx[date_str] = date_obj.toordinal()
    date_ids[i] = date_obj.toordinal()
date_ids = date_ids[to_keep]
data = data[to_keep, :]
date_ids = date_ids.astype(int)
y = data[:, y_inds].astype(float)
#y_sub = y[I, :]
#series_id = data[:, find_first_element(feat_names, 'Site Num')].astype(int)
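# Sketch of the 'YYYYMM' handling above, assuming to_date parses such
# strings into dates; datetime stands in for it here. Month '13' marks a
# yearly-total row, which the loop above drops.
import datetime

def to_date(s):
    return datetime.date(int(s[:4]), int(s[4:]), 1)

for s in ['200001', '200013']:
    if s[4:] == '13':
        continue
    print(s, to_date(s).toordinal())  # 200001 -> 730120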