def __call__(self, df, label_column):
    '''
    Perform data activity here
    :param df: dataframe object
    :param label_column: string, name of the column
    :return: transformed dataframe object
    '''
    self.label_column = label_column
    if not self.label_column:
        self.label_column = df.columns[-1]
    if self.validation:
        assert self.validate(df)
    df_copy = df.copy()
    # use self.label_column so the default (last column) is honoured when
    # label_column is not provided
    label_values = df_copy[self.label_column]
    df_copy = df_copy.drop(self.label_column, axis=1)
    rp = None
    if self.proj_type == 'Gaussian':
        rp = random_projection.GaussianRandomProjection(self.n_components)
    elif self.proj_type == 'Sparse':
        rp = random_projection.SparseRandomProjection(self.n_components)
    rp.fit(df_copy)
    columns = [self.proj_type[:3] + '_%i' % i for i in range(self.n_components)]
    df_copy = pd.DataFrame(rp.transform(df_copy), columns=columns, index=df.index)
    df_copy[self.label_column] = label_values
    return df_copy
def read_file(folder, prefix, name):
    path = osp.join(folder, 'ind.{}.{}'.format(prefix.lower(), name))

    if name == 'test.index':
        return read_txt_array(path, dtype=torch.long)

    with open(path, 'rb') as f:
        if sys.version_info > (3, 0):
            out = pickle.load(f, encoding='latin1')
        else:
            out = pickle.load(f)

    if name == 'graph':
        return out

    out = out.todense() if hasattr(out, 'todense') else out
    # sanity check: report whether the input x contains inf or nan values
    print('Input x has inf / nan:', np.isinf(out).any(), np.isnan(out).any())
    # for fast training, discard the one-hot encoding and use a 32-dimensional
    # vector obtained from a Gaussian random projection
    if prefix == 'ddi_constraint' or prefix == 'decagon':
        if name == 'allx':
            transformer = random_projection.GaussianRandomProjection(n_components=32)
            out = transformer.fit_transform(out)

    out = torch.FloatTensor(out)
    return out
def bow2random_projection(bow, eps=0.3, projection_type='sparse'):
    '''
    INPUT
        bow: bag-of-words VxD numpy matrix
        projection_type: 'gaussian' for Gaussian projection or 'sparse' for
            Achlioptas projection (default: 'sparse')
    OUTPUT
        proj: vxD matrix with v << V
    '''
    try:
        projection_type = projection_type.lower()
        if projection_type == 'gaussian':
            transformer = random_projection.GaussianRandomProjection(eps=eps)
        elif projection_type == 'sparse':
            transformer = random_projection.SparseRandomProjection(eps=eps)
        else:
            raise ValueError("only handles 'gaussian' or 'sparse'")
        resultT = transformer.fit_transform(bow.T)
        result = resultT.T
    except Exception:
        # `except ex:` in the original is invalid syntax; catch broadly and
        # signal failure with None
        result = None
    return result
def load_data():
    # Load training data and vocab
    train_id_list, train_data_label, train_data_matrix, vocab = read_data("data/train.csv")
    # Load testing data
    test_id_list, _, test_data_matrix, _ = read_data("data/test.csv", vocab)
    test_data_label = pd.read_csv("data/answer.csv")['label'] - 1
    print("Vocabulary Size:", len(vocab))
    print("Training Set Size:", len(train_id_list))
    print("Test Set Size:", len(test_id_list))

    K = max(train_data_label) + 1  # labels begin with 0

    # Random projection of the data: fit on the training matrix, then apply
    # the same projection to the test matrix
    rand_proj_transformer = random_projection.GaussianRandomProjection(n_components=2000)
    train_data_matrix = rand_proj_transformer.fit_transform(train_data_matrix)
    test_data_matrix = rand_proj_transformer.transform(test_data_matrix)

    print("Training Set Shape:", train_data_matrix.shape)
    print("Testing Set Shape:", test_data_matrix.shape)

    # Convert class vectors to binary class matrices.
    # https://keras.io/utils/#to_categorical
    train_data_label = keras.utils.to_categorical(train_data_label, num_classes=K)
    test_data_label = keras.utils.to_categorical(test_data_label, num_classes=K)
    return train_data_matrix, train_data_label, test_data_matrix, test_data_label
def makeSpeakerGridPlots(sarcasmDf, bertFeats=None, show=False):
    tformFile = './data/transformData.pkl'
    if bertFeats is None:
        with open(tformFile, 'rb') as ifile:
            dataMap = pkl.load(ifile)
    else:
        print('Regenerating transform data...')
        dataMap = {
            'PCA': PCA().fit_transform(bertFeats),
            'TSNE': TSNE().fit_transform(bertFeats),
            'Agglomeration': FeatureAgglomeration().fit_transform(bertFeats),
            'Gaussian Projection':
                random_projection.GaussianRandomProjection(2).fit_transform(bertFeats),
            'Sparse Projection':
                random_projection.SparseRandomProjection(2).fit_transform(bertFeats),
        }
        with open(tformFile, 'wb') as ofile:
            pkl.dump(dataMap, ofile)

    for combo in ('speaker', 'sarcasm'), ('sarcasm', 'speaker'):
        for tform in dataMap:
            tfData = dataMap[tform]
            grid = makeDataPlots(tfData, sarcasmDf, *combo, tform)
            if show:
                grid.show()
            title = grid.windowTitle()
            saveGrid(grid, imgDir / f'{title}.jpg')
def bow2rnd_proj(bow, projection_type='sparse', eps=0.3):
    '''
    INPUT
        bow: bag-of-words VxD numpy matrix
        projection_type: 'gaussian' for Gaussian projection or 'sparse' for
            Achlioptas projection (default: 'sparse')
        eps: acceptable distortion level, bounded between 0 and 1; a higher eps
            loosens the theoretical bound on distortion
    OUTPUT
        rnd_proj: vxD matrix with v << V
    '''
    try:
        projection_type = projection_type.lower()
        if projection_type == 'gaussian':
            transformer = random_projection.GaussianRandomProjection(eps=eps)
        elif projection_type == 'sparse':
            transformer = random_projection.SparseRandomProjection(eps=eps)
        else:
            raise ValueError("only handles 'gaussian' or 'sparse'")
        resultT = transformer.fit_transform(bow.T)
        result = resultT.T
    except Exception:
        # `except ex:` in the original is invalid syntax; catch broadly and
        # signal failure with None
        result = None
    return result
def gaussData(dPath):
    df = pd.read_csv(dPath)
    df = df.fillna(0)
    data = df.iloc[:, :].values
    transformer = random_projection.GaussianRandomProjection(
        n_components=2, eps=0.1, random_state=None)
    transformedData = transformer.fit_transform(data)
    return transformedData
def optimize_components(X, feature_names, label, abbrev, chosen_n_components):
    # model selection: choose the number of components by reconstruction error
    n_components = np.arange(1, len(feature_names) + 1)
    rp_scores = []
    for n in n_components:
        rp = random_projection.GaussianRandomProjection(n_components=n, random_state=SEED)
        reduced = rp.fit_transform(X)
        rp_scores.append(get_reconstruction_error(X, reduced, rp))
    print(label + ": n_components with lowest RP reconstruction error = %d"
          % n_components[np.argmin(rp_scores)])
    print(label + ": chosen n_components by RP reconstruction error = %d" % chosen_n_components)
    # rp_scores is 0-indexed while n_components starts at 1
    print(label + ": chosen n_components' reconstruction error = "
          + str(rp_scores[chosen_n_components - 1]))

    # create plot
    plt.figure()
    plt.plot(n_components, rp_scores, 'b', label='RP reconstruction error')
    plt.axvline(chosen_n_components, color='b',
                label='RP components: %d' % chosen_n_components, linestyle='--')

    # format plot
    ax = plt.gca()
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    plt.xlabel('number of components')
    plt.ylabel('reconstruction error')
    plt.legend(loc='lower right')
    plt.title(label + ": RP model selection")
    plt.savefig(path.join(PLOT_DIR, abbrev + "_rp_components.png"), bbox_inches='tight')
    plt.show()
    plt.close()
    return chosen_n_components
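# The helper get_reconstruction_error() called above is not shown in this
# snippet. The following is a minimal sketch of one plausible implementation,
# consistent with how it is called (get_reconstruction_error(X, reduced, rp)):
# it maps the projected data back to the original space via the pseudo-inverse
# of the projection matrix and reports the mean squared error. Treat it as an
# assumption, not the original author's code.
import numpy as np

def get_reconstruction_error(X, reduced, rp):
    # rp.components_ has shape (n_components, n_features)
    pinv = np.linalg.pinv(rp.components_)   # shape (n_features, n_components)
    reconstructed = reduced @ pinv.T        # back to (n_samples, n_features)
    return np.mean((np.asarray(X) - reconstructed) ** 2)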
def run_randomized_components_analysis(input_data, target_data):
    # split the data first
    X_sc_train, X_sc_test, y_train, y_test = train_test_split(
        input_data, target_data, test_size=0.33, random_state=42)

    # set a baseline on the unreduced data
    lr = LogisticRegression()
    lr.fit(X_sc_train, y_train)
    baseline_preds = lr.predict(X_sc_test)
    baseline = accuracy_score(y_test, baseline_preds)

    # loop over n_components to test which randomized projection works best
    accuracies = []
    for i in range(1, len(X_sc_train[0]) + 1):
        transformer = random_projection.GaussianRandomProjection(
            n_components=i, random_state=5000000)
        X_new = transformer.fit_transform(X_sc_train)
        lr_rand = LogisticRegression()
        lr_rand.fit(X_new, y_train)
        test_data = transformer.transform(X_sc_test)
        new_preds = lr_rand.predict(test_data)
        accuracies.append(accuracy_score(y_test, new_preds))
    return baseline, accuracies
def random_proj_gaussian_random(X, n_comp):
    rp = random_projection.GaussianRandomProjection(n_components=n_comp, random_state=42)
    X_projected = rp.fit_transform(X)
    del rp
    return X_projected
def transform_bag_of_words(filename, n_dimensions, out_fn):
    import gzip
    import numpy
    import sklearn.model_selection
    from scipy.sparse import lil_matrix
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn import random_projection

    with gzip.open(filename, 'rb') as f:
        file_content = f.readlines()
        entries = int(file_content[0])
        words = int(file_content[1])
        file_content = file_content[3:]  # strip first three entries

        print("building matrix...")
        A = lil_matrix((entries, words))
        for e in file_content:
            doc, word, cnt = [int(v) for v in e.strip().split()]
            A[doc - 1, word - 1] = cnt

        print("normalizing matrix entries with tfidf...")
        B = TfidfTransformer().fit_transform(A)

        print("reducing dimensionality...")
        C = random_projection.GaussianRandomProjection(
            n_components=n_dimensions).fit_transform(B)

        X_train, X_test = sklearn.model_selection.train_test_split(
            C, test_size=10000, random_state=1)
        print('writing output...')
        write_output(numpy.array(X_train), numpy.array(X_test), out_fn, 'angular')
def part4(dataset):
    print("PART 4 - " + dataset['name'])
    X = scale(dataset['X'])
    labels = dataset['y']
    script = scripts[dataset['name']]

    print("FULL NN")
    PlotClassifiers(X, labels, 'best', dataset['name'] + ":FULL", dataset['classes'])

    print("PCA NN")
    pca = PCA(n_components=script['pca'])
    projected = pca.fit_transform(X)
    PlotClassifiers(projected, labels, 'best', dataset['name'] + ":PCA", dataset['classes'])

    print("ICA NN")
    ica = FastICA(n_components=script['ica'])
    projected = ica.fit_transform(X)
    PlotClassifiers(projected, labels, 'best', dataset['name'] + ":ICA", dataset['classes'])

    print("RP NN")
    transformer = random_projection.GaussianRandomProjection(script['rp'])
    projected = transformer.fit_transform(X)
    PlotClassifiers(projected, labels, 'best', dataset['name'] + ":RP", dataset['classes'])

    print("LDA NN")
    transformer = LinearDiscriminantAnalysis(n_components=script['lda'])
    projected = transformer.fit_transform(X, labels)
    PlotClassifiers(projected, labels, 'best', dataset['name'] + ":LDA", dataset['classes'])
def cluster_nn(name, t_x, t_y, v_x, v_y):
    if name == 'kmeans':
        cluster = KMeans(n_clusters=4, random_state=0)
    elif name == 'em':
        cluster = GaussianMixture(n_components=2, covariance_type='full')
    print("cluster nn")
    model = neural_network.MLPClassifier(hidden_layer_sizes=(5, 5))
    comp = [2, 4, 6, 8]
    methods = ['PCA', 'ICA', 'RP']
    file = open(name + "cluster_nn.csv", "w")
    result = ""
    result_v = ""
    for j in comp:
        print(j)
        for method_name in methods:  # avoid shadowing the `name` parameter
            temp = []
            temp_v = []
            # RP is randomized, so average it over several runs
            iters = 20 if method_name == 'RP' else 1
            for it in range(iters):
                if method_name == 'PCA':
                    method = PCA(n_components=j)
                elif method_name == 'ICA':
                    method = FastICA(n_components=j)
                elif method_name == 'RP':
                    method = random_projection.GaussianRandomProjection(n_components=j)
                t_x_reduced = method.fit_transform(t_x)
                # apply the projection fitted on the training data; refitting on
                # the validation data would produce an inconsistent projection
                v_x_reduced = method.transform(v_x)
                cluster.fit(t_x_reduced)
                clustered = cluster.predict(t_x_reduced)
                clustered_v = cluster.predict(v_x_reduced)
                clustered = clustered.reshape(clustered.shape[0], 1)
                clustered_v = clustered_v.reshape(clustered_v.shape[0], 1)
                t_x_new = np.hstack([t_x_reduced, clustered])
                v_x_new = np.hstack([v_x_reduced, clustered_v])
                model.fit(t_x_new, t_y)
                acc = metrics.accuracy_score(t_y, model.predict(t_x_new))
                acc_v = metrics.accuracy_score(v_y, model.predict(v_x_new))
                temp.append(acc)
                temp_v.append(acc_v)
            result += str(np.mean(temp)) + ", "
            result_v += str(np.mean(temp_v)) + ", "
        result += "\n"
        result_v += "\n"
    file.write(result)
    file.write(result_v)
    file.close()
def rp(X, c):
    clf = random_projection.GaussianRandomProjection(n_components=c)
    X_rp = clf.fit_transform(X)
    # for i in range(0, 1):
    #     X_rp = clf.fit_transform(X_rp)
    # print(clf.components_)
    # print(X_pca.shape)
    return X_rp
def getguassianprojections(features, n_components='auto'):
    # flatten any trailing dimensions so each sample becomes a 1-D feature vector
    features_reshaped = features.reshape(features.shape[0], -1)
    X = features_reshaped
    transformer = random_projection.GaussianRandomProjection(n_components=n_components)
    X_new = transformer.fit_transform(X)
    print(X_new.shape)
    return X_new
def GaussianRandomProjection(self, source):
    min_max_scaler = preprocessing.MinMaxScaler()
    data_source = min_max_scaler.fit_transform(source)
    grp = random_projection.GaussianRandomProjection(n_components=2)
    result = {}
    result['data'] = grp.fit_transform(data_source)
    # The original stored `dense_output` and flagged it as an error ("错误"):
    # GaussianRandomProjection has no such attribute (it belongs to
    # SparseRandomProjection), so expose the projection matrix instead.
    result['params'] = grp.components_
    return result
def randomProjection(data, labels, new_dimension):
    print("start random projection...")
    start = time.time()
    transformer = random_projection.GaussianRandomProjection(n_components=new_dimension)
    reduced = transformer.fit_transform(data)
    end = time.time()
    # print(" took %f" % (end - start))
    return (reduced, end - start)
def test_ANN_RP(data_X, data_Y, filename, est_name, NUM_ATTR=15):
    for NUM_ATTR in [6, 11]:  # range(11, data_X.shape[1]+1):
        for i in range(5):
            rp = random_projection.GaussianRandomProjection(n_components=NUM_ATTR)
            reduced_RP = rp.fit_transform(data_X)
            select_comp_supervised(reduced_RP, data_Y, filename, NUM_ATTR, est_name)
def dim_red_comparison(X_train, y_data, num_comps, verbose=True):
    '''
    Reduces dimensionality of the original dataset to a predefined number of
    components. Different methods are used: PCA, KPCA, Random Projections, and
    LDA. The efficacy of the reduction can be assessed via the classification
    performance of a learner.

    Parameters
    ==========
    X_train: pandas df. Original feature data; does not have to be split for
        supervised learning at this stage. Data must be encoded and normalized
        before doing dimensionality reduction.
    y_data: pandas df. Original label data. Must be encoded. Only used for LDA.
    num_comps: number of dimensions to reduce the original features to.

    Returns
    ==========
    X_pca, X_kpca, X_rp, X_lda: reduced matrices of size (N, num_comps) for
        each of the reduction methods.
    feats_rank_name: if verbose=True, writes a csv with the importance of the
        original features in the reduction for the PCA method. We cannot get
        this correspondence with the other nonlinear methods, but PCA gives a
        good idea.
    '''
    # pca
    pca = PCA(n_components=num_comps)
    X_pca = pca.fit_transform(X_train)

    # kernelized pca
    k_pca = KernelPCA(n_components=num_comps, kernel="rbf",
                      fit_inverse_transform=True, gamma=10)
    X_kpca = k_pca.fit_transform(X_train)
    # transform back
    # X_train_kpca_bck = k_pca.inverse_transform(X_kpca)

    # random projections
    rand_p = random_projection.GaussianRandomProjection(n_components=num_comps)
    X_rp = rand_p.fit_transform(X_train)

    # now do LDA (this is a supervised method for dim red)
    lda = LinearDiscriminantAnalysis(n_components=num_comps)
    X_lda = lda.fit(X_train, y_data).transform(X_train)

    # only PCA can give us importances in the original space because it is a
    # linear combination
    if verbose == True:
        pc_importance = pca.explained_variance_ratio_
        feats_rank = np.argmax(np.abs(pca.components_), axis=1)
        feats_rank_name = pd.DataFrame(X_train.columns[feats_rank].tolist())
        feats_rank_name = pd.concat(
            [feats_rank_name, pd.DataFrame(pc_importance) * 100], axis=1)
        feats_rank_name.columns = ['feat name', 'PCA imp weight']
        feats_rank_name.to_csv('pca_feats_rank_name.csv')

    return X_pca, X_kpca, X_rp, X_lda
def r_projection(input_data, no_components=None, e=0.1):
    # if no target dimension is given, use the Johnson-Lindenstrauss bound
    if no_components is None:
        no_components = johnson_lindenstrauss_min_dim(
            n_samples=input_data.shape[0], eps=e)
    projected_data = random_projection.GaussianRandomProjection(
        n_components=no_components).fit_transform(input_data)
    return projected_data
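# A minimal usage sketch for r_projection; the data below is synthetic and only
# illustrative, and it assumes the same module-level sklearn imports that
# r_projection relies on. With eps=0.1, johnson_lindenstrauss_min_dim picks a
# target dimension that preserves pairwise distances within a factor of
# 1 +/- eps with high probability, and that dimension depends only on the
# number of samples, not on the original feature count.
if __name__ == '__main__':
    import numpy as np
    from sklearn.random_projection import johnson_lindenstrauss_min_dim

    rng = np.random.RandomState(0)
    X_demo = rng.rand(500, 10000)  # 500 samples, 10000 features
    print(johnson_lindenstrauss_min_dim(n_samples=500, eps=0.1))  # JL target dim
    X_proj = r_projection(X_demo, e=0.1)
    print(X_proj.shape)  # (500, JL target dim)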
def gen_random_projection(frame_array, new_size):
    # collapse the leading dimensions so each row is a single feature vector
    frame_array_reshaped = frame_array.reshape(-1, frame_array.shape[2])
    transformer = random_projection.GaussianRandomProjection(
        n_components=new_size, random_state=1)
    projected_frame_array = transformer.fit_transform(frame_array_reshaped)
    projected_frame_array = projected_frame_array.reshape(
        frame_array.shape[0], frame_array.shape[1], -1)
    return projected_frame_array
def project(X, dim=32, loop=10000):
    T = random_projection.GaussianRandomProjection(n_components=dim)
    # fit the projection once so every chunk is projected with the same random
    # matrix; calling fit_transform per chunk would draw a new matrix each time
    T.fit(X[:loop])
    X_new = []
    for i in range(0, X.shape[0], loop):
        X_new.append(T.transform(X[i:i + loop]))
    X_new = np.vstack(X_new)
    return X_new
def __init__(self, maxcomponents=5, ncomponents=2):
    super().__init__()
    self.name = 'Gaussian random projections'
    self.ncomponents = ncomponents
    self.maxcomponents = maxcomponents
    self.model = random_projection.GaussianRandomProjection(n_components=ncomponents)
    self.takes_label = False
def fit(self, X):
    """
    Create random unit vectors and index X
    :param X: sparse csc matrix of samples
    :return:
    """
    self.indexer.init(self.n_indices)
    self.random_unit_vectors = random_projection.GaussianRandomProjection(
        n_components=self.n_indices)
    self.random_unit_vectors.fit(X)
    self.partial_fit(X)
def randne_projection(A, q=3, dim=128):
    # RandNE-style embedding: start from a Gaussian random projection of A, then
    # repeatedly propagate it through A to capture higher-order proximity
    transformer = random_projection.GaussianRandomProjection(
        n_components=dim, random_state=42)
    # random projection for A
    cur_U = transformer.fit_transform(A)
    U_list = [cur_U]
    for i in range(2, q + 1):
        cur_U = A @ cur_U
        U_list.append(cur_U)
    return U_list
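# A minimal usage sketch for randne_projection, assuming A is a (possibly
# sparse) adjacency matrix. Combining the per-order projections with decaying
# weights is one common way to form a single embedding; the weights below are
# illustrative assumptions, not taken from the original code.
import numpy as np
import scipy.sparse as sp

A_toy = sp.random(1000, 1000, density=0.01, format='csr', random_state=0)
A_toy = A_toy + A_toy.T                       # make the toy graph undirected
U_list = randne_projection(A_toy, q=3, dim=128)
weights = [1.0, 0.1, 0.01]                    # illustrative decay per order
embedding = sum(w * U for w, U in zip(weights, U_list))
print(embedding.shape)                        # (1000, 128)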
def grp(X, C=100):
    """
    Gaussian Random Projection (GRP): projection of X into C dimensions.
    """
    print("GRP...")
    print(X.shape)
    print("Computing GaussianRandomProjection, using %3d components" % C)
    transformer = random_projection.GaussianRandomProjection(n_components=C)
    X_grp = transformer.fit_transform(X)
    print(X_grp.shape)
    return X_grp
def gaurandpro(X_train, y_train=None, X_test=None):
    from sklearn import random_projection
    mod = random_projection.GaussianRandomProjection()
    mod.fit(X_train, y_train)
    train = mod.transform(X_train)
    if X_test is None:
        out = train
    else:
        # the original referenced an undefined `pca`; the fitted projection is
        # applied to the test set here instead
        test = mod.transform(X_test)
        out = train, test
    return out
def reduce_dimension(D, projection='mds'):
    projections = {
        'mds': manifold.MDS(2, dissimilarity="precomputed"),
        'tsne': manifold.TSNE(2, metric="precomputed"),
        'gaussianrp': random_projection.GaussianRandomProjection(2),
        'spectralembedding': manifold.SpectralEmbedding(2),
        'pca': PCA(2),
        'umap': umap.UMAP(n_components=2, metric='precomputed'),
    }
    X = projections[projection].fit_transform(D)
    return X
def rca(self, n_components=2):
    """
    Reduce dimensionality through Gaussian random projection

    The components of the random matrix are drawn from N(0, 1 / n_components).

    >>> X = np.random.rand(100, 10000)
    >>> transformer = random_projection.GaussianRandomProjection()
    >>> X_new = transformer.fit_transform(X)
    """
    rca = random_projection.GaussianRandomProjection(n_components=n_components)
    X_trans = rca.fit_transform(self.xs)
    return X_trans
def rp(data, type):
    # Run randomized projection on data
    filename_template = "nba_{type}_rp_transformed_{dimension}d_matrix.npy"
    iteration = 50
    n_components_min = 2
    n_components_max = 20
    n_components = np.arange(n_components_min, n_components_max, 1)
    x_value = np.repeat(n_components, iteration)
    distortion_array = np.array([])
    least_distortion = float('Inf')
    least_distortion_dimension = 0
    best_transformed_data = np.array([])
    origin_dist_matrix = np.asarray([[la.norm(u - v) for v in data] for u in data])

    def calculate_distortion(transformed_data):
        size = transformed_data.shape[0]
        max_distortion = float('-inf')
        for u in range(size):
            for v in range(size):
                if v < u:
                    origin_dist = origin_dist_matrix[u, v]
                    transformed_dist = la.norm(transformed_data[u] - transformed_data[v])
                    distortion = (transformed_dist / origin_dist) ** 2
                    if distortion > max_distortion:
                        max_distortion = distortion
        return max_distortion

    for n in n_components:
        print(n)
        for i in range(iteration):
            rp = random_projection.GaussianRandomProjection(n_components=n, eps=0.1)
            transformed_data = rp.fit_transform(data)
            distortion = calculate_distortion(transformed_data)
            distortion_array = np.append(distortion_array, distortion)
            if distortion < least_distortion:
                least_distortion = distortion
                best_transformed_data = transformed_data
                least_distortion_dimension = n

    # print("# of components: %r" % best_transformed_data.shape[1])
    # print("least_f_norm_percent_change is %.2f%%" % least_f_norm_percent_change)
    filename = filename_template.format(type=type, dimension=str(least_distortion_dimension))
    np.save(filename, best_transformed_data)

    plt.figure(figsize=(16, 9))
    plt.scatter(x_value, distortion_array, marker='+')
    plt.xticks(np.arange(n_components_min - 1, n_components_max + 1, 1))
    plt.grid(True)
    plt.xlabel("# of components")
    plt.ylabel("Distortion")
    note = "Least distortion: %.2f" % (least_distortion)
    notex, notey = best_transformed_data.shape[1], least_distortion
    plt.title("NBA Players Stats, Randomized Projects %s\n %r iterations for each # of components"
              % (type, iteration))
    plt.annotate(note, xy=(notex, notey), xytext=(notex + 0.2, notey + 0.2), wrap=True,
                 arrowprops=dict(facecolor='black', shrink=0.005))
    plt.savefig(("random_projection_distortion_%s.png") % type)
    plt.close()