def __call__(self, df, label_column):
    '''
    Project the feature columns and re-attach the label column.

    :param df: dataframe object
    :param label_column: string, name of the label column (defaults to the
                         last column when falsy)
    :return: transformed dataframe object
    '''
    self.label_column = label_column
    if not self.label_column:
        self.label_column = df.columns[-1]
    if self.validation:
        assert self.validate(df)
    df_copy = df.copy()
    # use self.label_column consistently; the original read the raw
    # label_column argument, which fails when the default kicked in
    label_values = df_copy[self.label_column]
    df_copy = df_copy.drop(self.label_column, axis=1)
    if self.proj_type == 'Gaussian':
        rp = random_projection.GaussianRandomProjection(self.n_components)
    elif self.proj_type == 'Sparse':
        rp = random_projection.SparseRandomProjection(self.n_components)
    else:
        # originally rp stayed None and rp.fit() crashed; fail loudly instead
        raise ValueError("proj_type must be 'Gaussian' or 'Sparse'")
    rp.fit(df_copy)
    columns = [self.proj_type[:3] + '_%i' % i for i in range(self.n_components)]
    df_copy = pd.DataFrame(rp.transform(df_copy), columns=columns, index=df.index)
    df_copy[self.label_column] = label_values
    return df_copy
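# A self-contained sketch of the pattern the callable above implements; the
# owning class is not shown in the source, so this drives sklearn directly.
# Column names and sizes below are illustrative, not from the source.
import numpy as np
import pandas as pd
from sklearn import random_projection

df = pd.DataFrame(np.random.rand(50, 8))
df['label'] = np.random.randint(0, 2, size=50)
features = df.drop('label', axis=1)
rp = random_projection.GaussianRandomProjection(n_components=3)
reduced = pd.DataFrame(rp.fit_transform(features),
                       columns=['Gau_%i' % i for i in range(3)],
                       index=df.index)
reduced['label'] = df['label']
print(reduced.shape)  # (50, 4): 3 projected columns plus the label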
def bow2rnd_proj(bow, projection_type='sparse', eps=0.3):
    '''
    INPUT
    bow: bag-of-words VxD numpy matrix
    projection_type: 'gaussian' for Gaussian projection or 'sparse' for
                     Achlioptas projection (default: 'sparse')
    eps: threshold for acceptable distortion, bounded between 0 and 1;
         a higher eps tolerates more distortion and yields fewer components

    OUTPUT
    rnd_proj: vxD matrix, v << V
    '''
    try:
        projection_type = projection_type.lower()
        if projection_type == 'gaussian':
            transformer = random_projection.GaussianRandomProjection(eps=eps)
        elif projection_type == 'sparse':
            transformer = random_projection.SparseRandomProjection(eps=eps)
        else:
            raise ValueError("only handles 'gaussian' or 'sparse'")
        # project the vocabulary axis: transpose so rows are documents
        result = transformer.fit_transform(bow.T).T
    except Exception:  # the original 'except ex:' is a NameError
        result = None
    return result
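# Quick sanity check for bow2rnd_proj: with n_components left at 'auto',
# sklearn picks the Johnson-Lindenstrauss dimension for D documents at the
# given eps, so the vocabulary axis V shrinks to v. Sizes are illustrative.
import numpy as np
from sklearn import random_projection

V, D = 10000, 500                 # vocabulary size x number of documents
bow = np.random.rand(V, D)
proj = bow2rnd_proj(bow, projection_type='sparse', eps=0.3)
print(proj.shape)                 # (v, D) with v << V
print(random_projection.johnson_lindenstrauss_min_dim(D, eps=0.3))  # == v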
def apply_rp(data, components, indicator, k1, k2):
    print('Random projection for ', indicator)
    rand_kmean_scores = []
    rand_emm_scores = []
    # score clustering quality (adjusted Rand index) per component count
    for component in components:
        transformer = random_projection.SparseRandomProjection(
            n_components=component, random_state=150)
        X_transformed = transformer.fit_transform(data['features'])
        rand_kmean_scores.append(
            validate_k_fixed(X_transformed, data['labels'], k1))
        rand_emm_scores.append(
            validate_em_k_fixed(X_transformed, data['labels'], k2))
    print('k means adj rand scores => ', rand_kmean_scores)
    # note: the 'seaborn' style was renamed 'seaborn-v0_8' in matplotlib 3.6
    plt.style.use("seaborn")
    plt.plot(components, rand_kmean_scores, marker='o')
    plt.xticks(components, rotation="90")
    plt.xlabel("RP Components")
    plt.ylabel('Adjusted rand scores')
    plt.savefig('plots/dr/rp/' + indicator + '/kmeans/rp_adj_rand_scores.png')
    plt.clf()
    print('em adj rand scores => ', rand_emm_scores)
    plt.style.use("seaborn")
    plt.plot(components, rand_emm_scores, marker='o')
    plt.xticks(components, rotation="90")
    plt.xlabel("RP Components")
    plt.ylabel('Adjusted rand scores')
    plt.savefig('plots/dr/rp/' + indicator + '/em/rp_adj_rand_scores.png')
    plt.clf()
def makeSpeakerGridPlots(sarcasmDf, bertFeats=None, show=False):
    tformFile = './data/transformData.pkl'
    if bertFeats is None:
        with open(tformFile, 'rb') as ifile:
            dataMap = pkl.load(ifile)
    else:
        print('Regenerating transform data...')
        dataMap = {
            'PCA': PCA().fit_transform(bertFeats),
            'TSNE': TSNE().fit_transform(bertFeats),
            'Agglomeration': FeatureAgglomeration().fit_transform(bertFeats),
            'Gaussian Projection':
                random_projection.GaussianRandomProjection(2).fit_transform(bertFeats),
            'Sparse Projection':
                random_projection.SparseRandomProjection(2).fit_transform(bertFeats),
        }
        with open(tformFile, 'wb') as ofile:
            pkl.dump(dataMap, ofile)
    for combo in ('speaker', 'sarcasm'), ('sarcasm', 'speaker'):
        for tform in dataMap:
            tfData = dataMap[tform]
            grid = makeDataPlots(tfData, sarcasmDf, *combo, tform)
            if show:
                grid.show()
            title = grid.windowTitle()
            saveGrid(grid, imgDir / f'{title}.jpg')
def get_rp_reducer(X_train, k, stats=False):
    from sklearn import random_projection
    features = X_train.shape[1]
    if not k:
        k = features
    best_reducer = None
    best_reducer_loss = float('inf')  # sentinel; the original used 100,
                                      # which fails when every loss exceeds it
    losses = []
    reducers_to_try = 25
    for _ in range(reducers_to_try):
        reducer = random_projection.SparseRandomProjection(n_components=k)
        reducer.fit(X_train)
        X_train_reduced = reducer.transform(X_train)
        # reconstruct and score by mean squared reconstruction error
        X_projected = reducer.inverse_transform(X_train_reduced)
        loss = ((X_train - X_projected) ** 2).mean()
        if stats:
            losses.append(loss)
        if loss < best_reducer_loss:
            best_reducer = reducer
            best_reducer_loss = loss
    if stats:
        mean = np.mean(losses)
        stddev = np.sqrt(np.var(losses))
        return best_reducer, best_reducer_loss, mean, stddev
    return best_reducer
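# Usage sketch for get_rp_reducer; shapes are made up. Note that
# inverse_transform on sklearn's random projections is only available in
# recent scikit-learn (>= 1.1, to the best of my knowledge).
import numpy as np

X = np.random.rand(200, 64)
best = get_rp_reducer(X, k=16)
print(best.transform(X).shape)    # (200, 16)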
def plot_with_dpp(pred_c):
    rp = random_projection.SparseRandomProjection(n_components=2, random_state=42)
    X_projected = rp.fit_transform(pred_c)
    plt.figure(figsize=(4, 4), dpi=50)
    # one vectorized scatter call instead of the original per-point loop
    plt.scatter(X_projected[:, 0], X_projected[:, 1], color="blue")
    plt.show()
def random_proj_sparse_random(X, n_comp):
    rp = random_projection.SparseRandomProjection(n_components=n_comp,
                                                  random_state=42)
    X_projected = rp.fit_transform(X)
    return X_projected
def __init__(self, d, n_estimators=150):
    # TF-IDF -> sparse random projection -> random forest
    self.pipeline = make_pipeline(
        TfidfVectorizer(),
        random_projection.SparseRandomProjection(),
        RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1,
                               oob_score=True))
    self.d = d
def bow2random_projection(bow, eps=0.3, projection_type='sparse'):
    '''
    INPUT
    bow: bag-of-words VxD numpy matrix
    projection_type: 'gaussian' for Gaussian projection or 'sparse' for
                     Achlioptas projection (default: 'sparse')

    OUTPUT
    proj: vxD matrix, v << V
    '''
    try:
        projection_type = projection_type.lower()
        if projection_type == 'gaussian':
            transformer = random_projection.GaussianRandomProjection(eps=eps)
        elif projection_type == 'sparse':
            transformer = random_projection.SparseRandomProjection(eps=eps)
        else:
            raise ValueError("only handles 'gaussian' or 'sparse'")
        # project the vocabulary axis: transpose so rows are documents
        result = transformer.fit_transform(bow.T).T
    except Exception:  # the original 'except ex:' is a NameError
        result = None
    return result
def project_features(data, n_components, display=False):
    # fit a sparse random projection on the features and return the fitted
    # transformer (the original also called rp.transform and discarded the
    # result, and left timing/feature-name variables unused)
    features, weights, labels = data
    rp = random_projection.SparseRandomProjection(n_components=n_components)
    rp.fit(features)
    return rp
def SparseRandomProjection(self, source):
    min_max_scaler = preprocessing.MinMaxScaler()
    data_source = min_max_scaler.fit_transform(source)
    # renamed from the misleading 'pca': this is a random projection
    rp = random_projection.SparseRandomProjection(n_components=2)
    result = {}
    result['data'] = rp.fit_transform(data_source)
    result['params'] = rp.density_  # wrong (flagged in the original)
    return result
def sparseRandomProjection(data, label, new_dimension):
    print("start sparse random projection...")
    start = time.time()
    transformer = random_projection.SparseRandomProjection(
        n_components=new_dimension)
    reduced = transformer.fit_transform(data)
    end = time.time()
    return (reduced, end - start)
def using_random(X, s=None):
    # Python 3 print calls; the original used Python 2 print statements
    print("using random")
    print("Computing random projection")
    rp = random_projection.SparseRandomProjection(n_components=2, random_state=42)
    X_projected = rp.fit_transform(X)
    plot_our_embedding(X_projected, "Random Projection of the results", s)
def transform(data, n_components=3):
    features, weights, labels = data
    start = time()
    rp = random_projection.SparseRandomProjection(n_components=n_components)
    rp.fit(features)
    transformed = rp.transform(features)
    elapsed = time() - start
    df = pd.DataFrame(transformed)
    return df, elapsed
def do_random_projections(X, Y=None):
    from sklearn import random_projection
    rp = random_projection.SparseRandomProjection(n_components=2, random_state=93)
    X_projected = rp.fit_transform(X)
    do_plot(X_projected[:, 0], X_projected[:, 1], Y)
def plot_sparse_random_projection(X, y, random_state=42):
    """Random 2-D projection using a sparse random matrix."""
    n_components = 2  # fixed to 2 for a 2-D plot
    print("Computing random projection")
    rp = random_projection.SparseRandomProjection(n_components=n_components,
                                                  random_state=random_state)
    X_projected = rp.fit_transform(X)
    plot_embedding(X_projected, y, "Random Projection of the digits")
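# Runnable call for the plotting helper above, using sklearn's digits set
# (a guess consistent with the plot title); plot_embedding is assumed to be
# defined elsewhere in the same module.
from sklearn.datasets import load_digits

digits = load_digits()
plot_sparse_random_projection(digits.data, digits.target)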
def randomp(X, dim=2, **kargs):
    '''Random 2-D projection using a sparse random matrix.'''
    print("Computing random projection")
    try:
        rp = random_projection.SparseRandomProjection(n_components=dim,
                                                      random_state=42)
        X_projected = rp.fit_transform(X)
        return rp, X_projected, "Random Projection"
    except Exception:
        traceback.print_exc()
def validate_rp_nn(data, components, label):
    mlp = MLPClassifier(hidden_layer_sizes=(15, 2), random_state=70,
                        activation='relu', max_iter=500)
    scoring = ['accuracy']
    # baseline: NN on the untransformed features
    scores = cross_validate(mlp, data['features'], data['labels'],
                            scoring=scoring, cv=10)
    print(scores)
    NN_fit_time = np.mean(scores['fit_time'])
    NN_accuracy = np.mean(scores['test_accuracy'])
    print(NN_fit_time)
    print(NN_accuracy)
    # renamed from the misleading PCA_* names: these track RP results
    rp_fit_time = []
    rp_accuracy = []
    for component in components:
        rp = random_projection.SparseRandomProjection(n_components=component,
                                                      random_state=150)
        X_transformed = rp.fit_transform(data['features'])
        scores_rp = cross_validate(mlp, X_transformed, data['labels'],
                                   scoring=scoring, cv=10)
        print(scores_rp)
        rp_fit_time.append(np.mean(scores_rp['fit_time']))
        rp_accuracy.append(np.mean(scores_rp['test_accuracy']))
    plt.style.use("seaborn")
    plt.figure(figsize=(8, 8))
    plt.plot(components, rp_accuracy)
    plt.xticks(components)
    plt.axhline(y=NN_accuracy, color='r', linestyle='-')
    plt.xlabel("RP Components")
    plt.ylabel('NN Accuracy')
    plt.grid(True)
    plt.savefig('plots/dr/rp/' + label + '/rp_accuracy.png')
    plt.clf()
    plt.style.use("seaborn")
    plt.plot(components, rp_fit_time)
    plt.xticks(components)
    plt.axhline(y=NN_fit_time, color='r', linestyle='-')
    plt.xlabel("RP Components")
    plt.ylabel('NN Fit Time')
    plt.grid(True)
    plt.savefig('plots/dr/rp/' + label + '/rp_fit_time.png')
    plt.clf()
def SparseRandomProjection(array, percent_samples):
    # pct_features_list is assumed to be a module-level list of fractions
    print("Sparse Random Projection", percent_samples * 100, "% of training data.")
    print("Features\tTime")
    array = array[:int(percent_samples * len(array))]
    for pct in pct_features_list:
        num_features = int(pct * len(array[0]))
        start = time()
        # the original never passed num_features, so every iteration timed
        # the same default projection; pass it as n_components
        Y = random_projection.SparseRandomProjection(
            n_components=num_features).fit_transform(array)
        end = time()
        print(num_features, "\t", (end - start))
def create_features(self):
    n_com = 100
    transformer = random_projection.SparseRandomProjection(n_components=n_com)
    columns = ["RandomProjection{}".format(i) for i in range(n_com)]
    # the original referenced bare 'train'/'test'; use the instance data,
    # fitting on train only and reusing the same matrix for test
    self.train = pd.DataFrame(transformer.fit_transform(self.train),
                              columns=columns)
    self.test = pd.DataFrame(transformer.transform(self.test),
                             columns=columns)
def comp_projmat(data, **kwargs):
    """
    Returns a projection matrix.
    Warning: the projection matrix returned can be either dense or sparse.
    """
    namelist = ['breiman', 'ho', 'tomita', 'dasgupta']
    assert kwargs['name'] in namelist, \
        "No such method for constructing projection matrix!"
    if kwargs['name'] == 'breiman':
        # Breiman's Forest-IC and Forest-RC
        s = kwargs['sparsity']
        d = kwargs['target_dim']
        A = np.zeros((data.shape[1], d))
        # sample a sparsity-constrained A
        for i in range(d):
            ind = np.random.choice(data.shape[1], size=s, replace=False)
            if s == 1:
                A[ind, i] = 1
            else:
                for j in range(len(ind)):
                    A[ind[j], i] = np.random.uniform(-1, 1)
    elif kwargs['name'] == 'ho':
        # rotation forest: find A by PCA. The original left this branch
        # empty, which would leave A undefined; fail explicitly instead.
        raise NotImplementedError("'ho' (rotation forest via PCA) is not implemented")
    elif kwargs['name'] == 'tomita':
        # randomer forest: sample sparse A via a very sparse random projection
        d = kwargs['target_dim']
        density = 1 / (data.shape[1] ** (1 / 2))  # default density value
        if 'density' in kwargs:
            if 0 < kwargs['density'] <= 1:
                density = kwargs['density']
        transformer = random_projection.SparseRandomProjection(n_components=d,
                                                               density=density)
        transformer.fit(data)
        A = transformer.components_.copy().T  # A is SPARSE!
    else:
        # Dasgupta's rp-tree
        d = kwargs.get('target_dim', 1)  # default to a single random vector
        n_features = data.shape[1]
        A = np.zeros((n_features, d))
        # sample a dense Gaussian projection matrix
        for i in range(d):
            A[:, i] = np.random.normal(0, 1 / np.sqrt(n_features), n_features)
    return A
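# Example calls for comp_projmat; keyword names follow the function body and
# the data is synthetic. For 'tomita' the returned A is a scipy sparse matrix.
import numpy as np

X = np.random.rand(100, 20)
A = comp_projmat(X, name='tomita', target_dim=5)               # 20 x 5, sparse
B = comp_projmat(X, name='breiman', target_dim=5, sparsity=3)  # 20 x 5, dense
print(A.shape, B.shape)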
def sparandpro(X_train, y_train=None, X_test=None):
    from sklearn import random_projection
    mod = random_projection.SparseRandomProjection()
    # the original assigned the fitted estimator to X, transformed X_train
    # into 'test', and then referenced undefined 'train' and 'pca'
    train = mod.fit_transform(X_train)
    if X_test is None:
        out = train
    else:
        test = mod.transform(X_test)
        out = train, test
    return out
def rp(self):
    n_target = self.n_target
    if n_target >= self.kf_data.shape[1]:
        return self.kf_data, self.test_data, self.kf_labels, self.test_labels
    rp = random_projection.SparseRandomProjection(n_components=n_target)
    # fit on train and test together so both share one projection matrix
    high_data = (np.concatenate((self.kf_data, self.test_data), 0)
                 if self.test_data.any() else self.kf_data)
    low_data = rp.fit_transform(high_data)
    kf_data_new = low_data[0:self.kf_data.shape[0]]
    # default so the return below never hits an undefined name when
    # test_data is empty (a NameError in the original)
    test_data_new = self.test_data
    if self.test_data.any():
        test_data_new = low_data[self.kf_data.shape[0]:
                                 (self.kf_data.shape[0] + self.test_data.shape[0])]
    return kf_data_new, test_data_new, self.kf_labels, self.test_labels
def rp_data_gen_reverse(X_train, y_train, X_test, sample_portion=0.8,
                        n_splits=15, thin_dim=1000, density='auto'):
    random_state = np.arange(1, n_splits + 1)
    # np.int was removed in NumPy 1.24; plain int() works everywhere
    samples = int(round(sample_portion * len(X_train)))
    X_train_thin_sets = []
    X_test_thin_sets = []
    X_valid_thin_sets = []
    y_train_thin_sets = []
    y_valid_thin_sets = []
    for n in range(n_splits):
        # RP matrix generation: fit on train, then reuse the same matrix for
        # test (the original called fit_transform on X_test as well)
        trans = random_projection.SparseRandomProjection(
            n_components=thin_dim, density=density,
            random_state=random_state[n])
        X_train_thin_temp = trans.fit_transform(X_train)
        X_test_thin_temp = trans.transform(X_test)
        # bootstrapping: resample returns row indices for the new data
        ix = list(range(len(X_train_thin_temp)))
        train_ix = resample(ix, replace=True, n_samples=samples,
                            random_state=random_state[n])
        train_ix_set = set(train_ix)
        valid_ix = [x for x in ix if x not in train_ix_set]
        # select data; rows left out of the bootstrap form a validation set
        X_train_thin = X_train_thin_temp[train_ix]
        y_train_thin = y_train.iloc[train_ix]
        validX, validy = X_train_thin_temp[valid_ix], y_train.iloc[valid_ix]
        X_train_thin_sets.append(X_train_thin)
        y_train_thin_sets.append(y_train_thin)
        X_valid_thin_sets.append(validX)
        y_valid_thin_sets.append(validy)
        # only transform, no bootstrapping, for X_test
        X_test_thin_sets.append(X_test_thin_temp)
    return (X_train_thin_sets, X_valid_thin_sets, X_test_thin_sets,
            y_train_thin_sets, y_valid_thin_sets)
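# Sketch invocation of rp_data_gen_reverse; dimensions are placeholders.
# y_train must be a pandas Series/DataFrame because the function indexes it
# with .iloc.
import numpy as np
import pandas as pd

X_tr = np.random.rand(50, 2000)
y_tr = pd.Series(np.random.randint(0, 2, size=50))
X_te = np.random.rand(10, 2000)
train_sets, valid_sets, test_sets, y_sets, y_valid_sets = \
    rp_data_gen_reverse(X_tr, y_tr, X_te, n_splits=3, thin_dim=100)
print(len(train_sets), train_sets[0].shape)   # 3 bootstrapped thin sets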
def data_user_proj_data_diff(data, targ_dim):
    # Build the transformer matrix R used to project the input to the user's
    # target dimension: if O = I.R, the transformer supplies R here.
    transformer = random_projection.SparseRandomProjection(n_components=targ_dim)
    # transform the given "data" (input) to "projected_data" (output)
    projected_data = transformer.fit_transform(data)
    print("\n\nnew data dimensions after projection, per the user-provided "
          "target dimension: " + str(np.shape(projected_data)))
    print("\n\n")
    # return the pairwise distances of the projected points
    return sp.pdist(projected_data)
def low_dimensional_embedding(data_matrix, low_dim=None):
    n_rows, n_cols = data_matrix.shape
    # perform dimension reduction only if #features > #data points
    if n_cols <= n_rows:
        return_data_matrix = data_matrix
    else:
        if n_rows < 5000:
            n_components = n_rows
        else:
            n_components = 'auto'
        transformer = random_projection.SparseRandomProjection(
            n_components=n_components, dense_output=True)
        data_matrix_new = transformer.fit_transform(data_matrix)
        basis_data_matrix, coordinates_data_matrix = \
            matrix_factorization(data_matrix_new, n=low_dim)
        return_data_matrix = coordinates_data_matrix
    return return_data_matrix
def generate_model(trans_clf):
    # random projection
    rp_clf = random_projection.SparseRandomProjection(n_components=30)
    # classifier
    rf_clf = RandomForestClassifier(n_estimators=100, random_state=1,
                                    n_jobs=rf_n_jobs)
    # model pipeline (step names underscored; 'trnasform' typo fixed)
    model = Pipeline([("random_projection", rp_clf),
                      ("manifold_transform", trans_clf),
                      ("random_forest", rf_clf)])
    return model
def reduce_dimensionality(n_components, train, test, method, attack=None):
    if method == 'PCA':
        matrix = PCA(n_components=n_components)
    elif method == 'RP':
        matrix = random_projection.SparseRandomProjection(
            n_components=n_components, random_state=7)
    else:
        print('unknown projection method, choose either RP or PCA')
        return None
    train = matrix.fit_transform(train)
    test = matrix.transform(test)
    if attack is None:
        return train, test
    attack = matrix.transform(attack)
    return train, test, attack
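# Example call for reduce_dimensionality with synthetic stand-in arrays.
import numpy as np

train = np.random.rand(100, 50)
test = np.random.rand(20, 50)
train_rp, test_rp = reduce_dimensionality(10, train, test, method='RP')
print(train_rp.shape, test_rp.shape)   # (100, 10) (20, 10)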
def fit(self, data_matrix):
    n_rows, n_cols = data_matrix.shape
    if n_rows <= n_cols:
        n_components = n_rows
    elif n_cols < 5000:
        n_components = n_cols
    else:
        n_components = 'auto'
    self.transformer = random_projection.SparseRandomProjection(
        n_components=n_components, dense_output=True,
        random_state=self.random_state)
    data_matrix_new = self.transformer.fit_transform(data_matrix)
    self.matrix_factorizer = pymf.SIVM(data_matrix_new.T,
                                       num_bases=self.complexity)
    self.matrix_factorizer.factorize()
    if self.n_kmeans:
        self.kmeans = MiniBatchKMeans(n_clusters=self.n_kmeans)
        self.kmeans.fit(self.matrix_factorizer.H.T)
def data_JL_proj_data_diff(data):
    n_row = len(data)
    # minimum target dimension the JL lemma allows while preserving pairwise
    # distances up to the given eps
    min_dim = random_projection.johnson_lindenstrauss_min_dim(n_row, eps=0.1)
    print("min dim suggested by JL lemma with eps = 0.1 is " + str(min_dim))
    # build the transformer matrix R: if O = I.R, the transformer supplies R
    transformer = random_projection.SparseRandomProjection()
    # transform the given "data" (input) to "projected_data" (output)
    projected_data = transformer.fit_transform(data)
    print("new data dimensions after JL projection: " +
          str(np.shape(projected_data)))
    print("\n\n")
    # return the pairwise distances of the projected points
    return sp.pdist(projected_data)
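# Worked numbers for johnson_lindenstrauss_min_dim: the bound grows with
# log(n_samples) and shrinks as eps loosens; it never depends on the input
# dimensionality. Sample sizes below are illustrative.
from sklearn import random_projection

for n, eps in [(1000, 0.1), (1000, 0.5), (100000, 0.1)]:
    print(n, eps, random_projection.johnson_lindenstrauss_min_dim(n, eps=eps))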