def preprocess(Xtr, Xvl, use_pca, max_pca_components=None):
    """Preprocess the train and test features.

    Xtr - the training data features
    Xvl - the test data features
    use_pca - whether to use (sparse) PCA for feature space reduction
    max_pca_components - the maximal number of PCA components to extract;
        required when use_pca is true

    Returns the tuple (Xtr, Xvl) of preprocessed features.

    Raises ValueError when use_pca is true and max_pca_components is None.
    """
    if use_pca:
        if max_pca_components is None:
            # A bare string is not a valid exception object; raise a real one.
            raise ValueError(
                "Please specify maximal number of PCA components to extract")
        reducer = decomposition.SparsePCA(n_components=max_pca_components)
        print('PCA max features to keep: %d' % (max_pca_components))
        # fit only for train data
        # (http://cs231n.github.io/neural-networks-2/#datapre)
        Xtr = reducer.fit_transform(Xtr)
        Xvl = reducer.transform(Xvl)
    else:
        scaler = StandardScaler(copy=False)
        # scale only first column 'SUBJID'
        # NOTE(review): the scaled result is never written back; this relies
        # on copy=False scaling the Xtr/Xvl slices in place - confirm the
        # inputs are float ndarrays, otherwise this branch is a no-op.
        xtr_subj = Xtr[:, :1]
        xvl_subj = Xvl[:, :1]
        # fit only for train data
        # (http://cs231n.github.io/neural-networks-2/#datapre)
        scaler.fit_transform(xtr_subj)
        scaler.transform(xvl_subj)

    # The value labelled "variance" in these messages is std().
    print('Train data mean: %f, variance: %f' % (Xtr.mean(), Xtr.std()))
    print('Test data mean: %f, variance: %f' % (Xvl.mean(), Xvl.std()))
    return Xtr, Xvl
def PCA(self, X, Y=None, ncomp=2, method='PCA'):
    """Project X onto ``ncomp`` components using the selected decomposition.

    @param X: Input dataset
    @param Y: accepted but not used by this method

    Keyword Arguments:
    ncomp  -- number of components to be kept (Default: 2)
    method -- 'Randomized', 'Sparse', 'rbf', 'linear', 'sigmoid' or 'SVD';
              any other value selects plain PCA (the default)

    Returns the result of ``transform(X)`` on the estimator fitted on X.
    """
    from sklearn import decomposition
    from sklearn import cross_decomposition

    # (method label, estimator class name) for estimators that only take
    # n_components; the class is looked up only if its label is selected.
    plain_choices = (
        ('Randomized', 'RandomizedPCA'),
        ('Sparse', 'SparsePCA'),
        ('SVD', 'TruncatedSVD'),
    )
    # (method label, extra KernelPCA keyword arguments)
    kernel_choices = (
        ('rbf', dict(fit_inverse_transform=True, gamma=10, kernel="rbf")),
        ('linear', dict(kernel="linear")),
        ('sigmoid', dict(kernel="sigmoid")),
    )

    estimator = None
    for label, class_name in plain_choices:
        if method == label:
            estimator = getattr(decomposition, class_name)(n_components=ncomp)
            break
    if estimator is None:
        for label, extra in kernel_choices:
            if method == label:
                estimator = decomposition.KernelPCA(n_components=ncomp, **extra)
                break
    if estimator is None:
        # Unknown labels fall back to ordinary PCA and are reported as such.
        estimator = decomposition.PCA(n_components=ncomp)
        method = 'PCA'

    print('[ML] Using %s method' % method)
    estimator.fit(X)
    return estimator.transform(X)
def _train(self, train_data, params, verbose):
    """Fit one SparsePCA model per dataset and derive a covariance for each.

    ``params`` supplies 'n_components', 'alpha', 'ridge_alpha', 'max_iter'
    and 'tol' for the estimator. Returns ``(covs, None)`` where ``covs`` is
    the list of covariance matrices, or ``None`` if any step raised.
    """
    import sklearn.decomposition as sk_dec

    def _covariance(sample):
        # Fit, project, then rebuild the covariance from the latent space.
        model = sk_dec.SparsePCA(n_components=params['n_components'],
                                 alpha=params['alpha'],
                                 ridge_alpha=params['ridge_alpha'],
                                 max_iter=params['max_iter'],
                                 tol=params['tol'])
        model.fit(sample)
        # get covariance: \Psi + \Lambda.T * \Sigma_{zz} * \Lambda
        latent = model.transform(sample)
        latent_cov = np.cov(latent.T)
        feature_var = np.var(sample, axis=0)
        loadings = model.components_
        result = np.dot(loadings.T, np.dot(latent_cov, loadings))
        # The diagonal is overwritten with the per-feature variance.
        np.fill_diagonal(result, feature_var)
        return result

    if verbose:
        print("Training {} ...".format(self.name))

    began = time.time()
    try:
        covs = [_covariance(sample) for sample in train_data]
    except Exception as err:
        # Best effort: any failure discards all results for this run.
        covs = None
        if verbose:
            print(f"\t{self.name} failed with message: {err}")
    ended = time.time()

    if verbose:
        print("\tElapsed time {:.1f}s".format(ended - began))
    return covs, None
def PreprocessingSparsePCA(self, PCA_coefficients, MNE_coefficients, N_neighbors):
    """Reduce ``self.Waves_Coefficients`` with SparsePCA, then embed it.

    :type MNE_coefficients: int
    :type PCA_coefficients: int
    :param MNE_coefficients: number of components for the spectral embedding
    :param PCA_coefficients: number of components for the sparse PCA transform
    :param N_neighbors: number of neighbors for the embedding
    :return: the spectral embedding of the sparse-PCA scores (``self.X_red``)
    """
    self.MNE_coefficients = MNE_coefficients
    self.PCA_coefficients = PCA_coefficients
    self.N_neighbors = N_neighbors

    # Stage 1: sparse PCA with fixed hyper-parameters and seed.
    spca_settings = dict(
        n_components=self.PCA_coefficients,
        alpha=0.5,
        ridge_alpha=0.01,
        max_iter=1000,
        tol=1e-06,
        method='lars',
        n_jobs=-1,
        U_init=None,
        V_init=None,
        verbose=False,
        random_state=0,
    )
    self.pca = decomposition.SparsePCA(**spca_settings)

    # Stage 2: nearest-neighbour spectral embedding of the PCA scores.
    embedding_settings = dict(
        n_components=self.MNE_coefficients,
        affinity='nearest_neighbors',
        gamma=None,
        random_state=0,
        n_neighbors=self.N_neighbors,
    )
    self.Embedding = manifold.SpectralEmbedding(**embedding_settings)

    self.X_pca = self.pca.fit_transform(self.Waves_Coefficients)
    self.X_red = self.Embedding.fit_transform(self.X_pca)
    return self.X_red
def create_components_feat(self, components=(1, 2, 3)):
    """Add sparse-PCA score columns named ``PCA_comp_<n>`` to ``self``.

    A SparsePCA (alpha=0.02) is fitted on ``self`` and, for every ``n`` in
    ``components``, column ``n`` of the transformed data is stored under
    ``'PCA_comp_<n>'``.

    components -- iterable of column indices into the transformed data.
        The default is an immutable tuple so it cannot be shared and
        mutated between calls.
        NOTE(review): indices are used as-is (0-based), so the default
        skips the first component - confirm that this is intended.
    """
    spca = decomposition.SparsePCA(alpha=0.02)
    print(self)
    data = spca.fit_transform(self)
    for comp in components:
        self['PCA_comp_' + str(comp)] = data[:, comp]
def SparsePCA(self, source):
    """Min-max scale ``source`` and project it onto two sparse components.

    Returns a dict with the projected data under 'data' and a constant 0
    under 'params'.
    """
    min_max_scaler = preprocessing.MinMaxScaler()
    data_source = min_max_scaler.fit_transform(source)
    pca = decomposition.SparsePCA(n_components=2)
    projected = pca.fit_transform(data_source)
    # error_ only exists once the estimator has been fitted, so it is
    # printed after fit_transform (reading it earlier raises AttributeError).
    # Original author's note: "not necessarily correct".
    print(pca.error_)
    result = {}
    result['data'] = projected
    result['params'] = 0
    return result
def SparsePCA(array, percent_samples):
    """Time SparsePCA for several component counts and print a small table.

    array -- training data; only the first ``percent_samples`` fraction of
        its rows is used
    percent_samples -- fraction of rows to keep

    For every fraction in the module-level ``pct_features_list`` the number
    of components is that fraction of the row length, and the wall-clock
    time of ``fit_transform`` is printed.
    """
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print("Sparse PCA %s %% of training data." % (percent_samples * 100,))
    print("Features\tTime")
    array = array[:int(percent_samples * len(array))]
    for pct in pct_features_list:
        num_features = int(pct * len(array[0]))
        start = time()
        # Only the duration matters; the transformed data is discarded.
        decomposition.SparsePCA(n_components=num_features).fit_transform(array)
        end = time()
        print("%s \t%s" % (num_features, end - start))
def DimensionReduction(filename, vector_list, reducedDim):
    """Reduce ``vector_list`` to ``reducedDim`` sparse components and save it.

    The transformed data is written as CSV to
    ``data/reducedVec_<filename>_dim<reducedDim>.csv`` and also returned.
    """
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print("PCA starts")
    print("reduced dimension : %s" % (reducedDim,))
    start = time.time()
    pca = decomposition.SparsePCA(n_components=reducedDim)
    result = pca.fit_transform(vector_list)
    output_path = "data/reducedVec_{}_dim{}.csv".format(filename, reducedDim)
    np.savetxt(output_path, result, delimiter=",")
    elapsed_time = time.time() - start
    print("PCA finish : %s [min]" % (elapsed_time / 60))
    return result
def reduce_dimensionality(df, distances, dim_type):
    """Project ``distances`` to two dimensions and scatter-plot the result.

    df -- frame whose index supplies the point names
    distances -- matrix handed to the chosen estimator's ``fit_transform``
    dim_type -- one of 'mds', 'isomap', 'pca', 'sparsepca', 'tsne',
        'spectral', 'lle', 'trunc'

    Returns ``(xs, ys, names)``.

    Raises ValueError for an unknown ``dim_type`` (previously this surfaced
    later as an unbound-variable error).
    """
    print('reducing dimensionality')

    if dim_type == 'mds':
        # two components as we're plotting points in a two-dimensional plane
        # "precomputed" because we provide a distance matrix
        # 'random_state' is fixed so the plot is reproducible.
        reducer = manifold.MDS(n_components=2, dissimilarity="precomputed",
                               random_state=1)
    elif dim_type == 'isomap':
        reducer = manifold.Isomap(n_neighbors=10, n_components=2)
    elif dim_type == 'pca':
        reducer = decomposition.PCA(n_components=2, random_state=1)
    elif dim_type == 'sparsepca':
        reducer = decomposition.SparsePCA(n_components=2, random_state=1)
    elif dim_type == 'tsne':
        reducer = manifold.TSNE(n_components=2, init='pca', random_state=1)
    elif dim_type == 'spectral':
        reducer = manifold.SpectralEmbedding(n_components=2, n_neighbors=10,
                                             random_state=1)
    elif dim_type == 'lle':
        methods = ['standard', 'ltsa', 'hessian', 'modified']
        reducer = manifold.LocallyLinearEmbedding(n_neighbors=10,
                                                  n_components=2,
                                                  method=methods[1],
                                                  random_state=1)
    elif dim_type == 'trunc':
        reducer = decomposition.TruncatedSVD(n_components=2, random_state=1)
    else:
        raise ValueError('unknown dim_type: {!r}'.format(dim_type))

    pos = reducer.fit_transform(distances)  # shape (n_samples, n_components)
    xs, ys = pos[:, 0], pos[:, 1]
    names = df.index.values
    # One scatter call per point, as before; names are zipped in so the
    # number of plotted points stays bounded by the number of names.
    for x, y, _name in zip(xs, ys, names):
        plt.scatter(x, y, s=5)
        # plt.text(x, y, _name)  # can be overwhelming with lots of data points
    plt.show()
    return xs, ys, names
def choose_decomposition_method(method, n_components):
    """Build the decomposition estimator named by `method`.

    Raises ValueError when `method` is not one of the known labels.
    """
    # Each factory is only invoked for the matching label, so no estimator
    # is constructed unless it is actually requested.
    factories = (
        ('PCA', lambda: decomposition.PCA(n_components)),
        ('Randomized PCA', lambda: decomposition.RandomizedPCA(n_components)),
        ('Kernel PCA', lambda: decomposition.KernelPCA(n_components, kernel='rbf')),
        ('Sparse PCA', lambda: decomposition.SparsePCA(n_components, n_jobs=1)),
        ('SVD', lambda: decomposition.TruncatedSVD(n_components)),
        ('Factor Analysis', lambda: decomposition.FactorAnalysis(n_components)),
        ('ICA', lambda: decomposition.FastICA(n_components)),
    )
    for label, build in factories:
        if method == label:
            return build()
    raise ValueError('{} is not a known method'.format(method))
def SparsePCA(self, n_comps=None):
    """Fit SparsePCA on the training split and score all three splits.

    n_comps -- number of sparse components; ``None`` (or an empty list, the
        former default) means one component per training feature.

    Stores the fitted estimator in ``self.sparsecomp`` and the transformed
    train / validation / test data, in that order, in
    ``self.sparsecompscores``.
    """
    if n_comps is None or (isinstance(n_comps, list) and not n_comps):
        n_comps = self.datatrain.shape[1]
    # n_comps used to be computed and then ignored; pass it to the estimator.
    self.sparsecomp = decomp.SparsePCA(n_components=n_comps)
    self.sparsecomp.fit(self.datatrain)
    self.sparsecompscores = [
        self.sparsecomp.transform(split)
        for split in (self.datatrain, self.dataval, self.datatest)
    ]
# Keep only the relevant columns and drop the rows without a label.
data.select_good_columns(columns_to_delete + sub_diagnosis_id)
data.preprocessing(label_id)
if NORMALIZE:
    data.normalize_age(label_age, label_gender)

if MISSING_VALUE_STRATEGY == 'Binary':
    # Add the binary missing-data indicator columns.
    data.create_missing_data_col()
if MISSING_VALUE_STRATEGY in ('Replacement', 'Binary'):
    # Missing ADOS answers are coded 8; they are replaced by 3 here.
    # NOTE(review): the previous comment said "by 0" - confirm which value
    # is intended.
    data.replace(8, 3, inplace=True)

# Estimators to compare.
pca = decomposition.PCA()
spca = decomposition.SparsePCA(alpha=0.02)
kpca = decomposition.KernelPCA(kernel='cosine')
kpca2 = decomposition.KernelPCA(kernel='sigmoid')

pca_plot(pca, 1)
plt.figure(4)
plt.plot(pca.explained_variance_ratio_)
pca_plot(spca, 5)
# The kernel variants are plotted without coefficients.
for kernel_estimator, figure_number in ((kpca, 8), (kpca2, 10)):
    pca_plot(kernel_estimator, figure_number, show_coef=False)
plt.show()
import heapq
import sys, string
from numpy import loadtxt
import os
import os.path
global test_list
import numpy as np
import matplotlib.pyplot as plt
from sklearn import decomposition
from sklearn.decomposition import SparsePCA

lis = []


def strip_first_col(fname, delimiter=None):
    """Yield every line of ``fname`` with its first column removed.

    Lines that contain no delimiter (nothing after the first column) are
    skipped.
    """
    with open(fname, 'r') as fin:
        for line in fin:
            try:
                yield line.split(delimiter, 1)[1]
            except IndexError:
                continue


# The script used `np` without importing it; `import numpy as np` is added
# above.
X = np.loadtxt(strip_first_col(
    "/Volumes/MyPassport/Leucegene/Leucegene_Results/AlternativeOutput/ExpressionInput/exp.LeucegeneSplicing.txt"
), skiprows=1)
# Transpose rows and columns. zip(*X) is a lazy iterator on Python 3 and
# does not convert to a 2-D array, so transpose the array directly.
X = np.asarray(X).T
sparsepc = decomposition.SparsePCA()
sparsepc.fit(X)
# components_ is an array and cannot be formatted with %d. Report how many
# input features received a non-zero loading in at least one component.
# NOTE(review): confirm this is the quantity the message is meant to show.
n_selected = int(np.count_nonzero(np.any(sparsepc.components_ != 0, axis=0)))
print("Optimal number of features : %d" % n_selected)
# load the nifti data masker = NiftiMasker(mask_img='../data/playground/overlap_mask_3mm.nii.gz') subs_2d = masker.fit_transform('../data/playground/subs_3mm.nii.gz') for csv in glob('../data/raw/subs-405*.csv'): behav_df = pd.read_csv(csv, converters={'id': lambda x: str(x).zfill(4)}) behav_df.set_index('id', inplace=True) behav_df X_train, X_test, y_train, y_test = train_test_split( subs_2d, behav_df['score'].values, test_size=0.10) # preprocessing strategy # PCA pca = dcm.PCA(svd_solver='full') spca = dcm.SparsePCA() sparse_alpha_opts = [0.1, 0.5, 1, 2, 5, 10] kpca = dcm.KernelPCA() kernel_opts = ["linear", "rbf", "sigmoid"] n_component_opts = [0.7, 0.8, 0.9, 0.95, 0.99] lasso = linear_model.LassoCV(max_iter=100000, n_jobs=28, alphas=np.arange(0.1, 50, 0.1)) lasso_lars = linear_model.LassoLarsCV(n_jobs=28, max_iter=100000, max_n_alphas=10000) eps_opts = [10.0, 5.0, 2.0, 1.5, 0.9, 0.1, 0.01, 0.001, 0.0001] elastic = linear_model.ElasticNetCV(alphas=np.arange(0.1, 50, 0.1), max_iter=100000) l1_ratio_opts = [0.1, 0.5, 0.9, 0.95, 0.99] lasso_lars_bay = linear_model.LassoLarsIC(max_iter=100000)
])
#normalize them
# Standardise the patches (the statement closed above is outside this view).
patch_scaler = preprocessing.StandardScaler()
stimuli_patches = patch_scaler.fit_transform(stimuli_patches)
#%%
# Whitened randomized PCA down to nfeat components; the stimuli are
# replaced by their projection.
nfeat = 15
rpca = decomposition.RandomizedPCA(n_components=nfeat, whiten=True)
rpca.fit(unlagged_stimuli)
unlagged_stimuli = rpca.transform(unlagged_stimuli)
#%%
#sparse pca
# Fitted on the already PCA-reduced stimuli from the previous cell, using
# all available cores (n_jobs=-1).
spca = decomposition.SparsePCA(n_jobs=-1)
spca.fit(unlagged_stimuli)
unlagged_stimuli = spca.transform(unlagged_stimuli)
#%%
#dictionary minibatch
# Learn a 50-atom dictionary from the standardised patches.
mbdic = decomposition.MiniBatchDictionaryLearning(n_components=50,
                                                  verbose=True)
mbdic.fit(stimuli_patches)
#%%
#visualize
V = mbdic.components_
plt.figure()
def generate_transformers(x, dataset, global_dir, min_variance=10, additional_scale_tsvd=1):
    """Fit several 2-D reducers on ``x`` and return them as callables.

    Returns a dict mapping a short name ('vae', 'pca', 'tsvd', 'kpca',
    'spca', 'iso', 'lle') to a one-argument function that projects data
    with the corresponding fitted model. Each projection is divided by the
    square root of the variance of that model's training embedding and
    multiplied by ``sqrt(min_variance)``; the 'tsvd' output is additionally
    multiplied by ``additional_scale_tsvd``.

    The 'kpca' entry is removed when the fitted KernelPCA has a zero among
    its ``lambdas_``.

    ``dataset`` and ``global_dir`` locate the saved VAE weights
    (``<global_dir>/results/vae_models/<dataset>.pt``); the VAE is trained
    first when that file does not exist.
    """
    # The lambdas refer to names (pca, tsvd, transform_pca, ...) that are
    # only assigned further down; they are resolved when a lambda is called,
    # i.e. after this function has finished setting everything up.
    transform_functions = {
        'vae': (lambda x: transform_vae(x, VAE_net)),
        'pca': (lambda x: transform_pca(x, pca, var_pca)),
        'tsvd': (lambda x: transform_tsvd(x, tsvd)),
        'kpca': (lambda x: transform_kpca(x, kpca)),
        'spca': (lambda x: transform_spca(x, spca)),
        'iso': (lambda x: transform_iso(x, iso)),
        'lle': (lambda x: transform_lle(x, lle)),
    }
    """ Note that below, we could have dynamically generated most transformer functions. However, doing so would potentially lose overview, and we do not have to optimize for efficiency here, while we actually have to preserve readability. """
    ################ Regular PCA ################
    pca = decomposition.PCA(n_components=2)
    var_pca = np.var(pca.fit_transform(x))  # We do this in one call, since we don't need latent_X for now

    # print(np.sum(pca.explained_variance_ratio_)) # Could be interesting to explain results with

    def transform_pca(x, pca, var_pca):
        # Multiplies by the component matrix directly rather than calling
        # pca.transform().
        return np.matmul(x, np.transpose(pca.components_)) / math.sqrt(var_pca) * math.sqrt(min_variance)

    ################ Truncated SVD ################
    tsvd = decomposition.TruncatedSVD(n_components=2, n_iter=7, random_state=42)
    var_tsvd = np.var(tsvd.fit_transform(x))

    def transform_tsvd(x, tsvd):
        return np.matmul(x, np.transpose(tsvd.components_)) / math.sqrt(var_tsvd) * math.sqrt(min_variance) * additional_scale_tsvd

    ################ Kernel PCA ################
    kpca = decomposition.KernelPCA(n_components=2,
                                   kernel="sigmoid",
                                   fit_inverse_transform=True,
                                   gamma=None,
                                   random_state=42)
    var_kpca = np.var(kpca.fit_transform(x))
    if 0. in kpca.lambdas_:
        # KPCA with Sigmoid kernel does not work for this set
        del transform_functions['kpca']

    def transform_kpca(x, kpca):
        x = np.array(x)
        # A single sample is promoted to a one-row 2-D array.
        if len(x.shape) == 1:
            x = x.reshape(1, -1)
        return kpca.transform(x) / math.sqrt(var_kpca) * math.sqrt(min_variance)

    ################ Sparse PCA ################
    spca = decomposition.SparsePCA(n_components=2,
                                   alpha=0.0001,
                                   random_state=42,
                                   n_jobs=-1)
    var_spca = np.var(spca.fit_transform(x))

    def transform_spca(x, spca):
        # Multiplies by the component matrix directly rather than calling
        # spca.transform().
        return np.matmul(x, np.transpose(spca.components_)) / math.sqrt(var_spca) * math.sqrt(min_variance)

    ################ ISO ################
    iso = manifold.Isomap(n_neighbors=8, n_components=2, eigen_solver='dense')
    var_iso = np.var(iso.fit_transform(x))

    def transform_iso(x, iso):
        x = np.array(x)
        # A single sample is promoted to a one-row 2-D array.
        if len(x.shape) == 1:
            x = x.reshape(1, -1)
        return iso.transform(x) / math.sqrt(var_iso) * math.sqrt(min_variance)

    ################ LLE ################
    lle = manifold.LocallyLinearEmbedding(n_neighbors=8,
                                          n_components=2,
                                          eigen_solver='dense')
    var_lle = np.var(lle.fit_transform(x))

    def transform_lle(x, lle):
        x = np.array(x)
        # A single sample is promoted to a one-row 2-D array.
        if len(x.shape) == 1:
            x = x.reshape(1, -1)
        return lle.transform(x) / math.sqrt(var_lle) * math.sqrt(min_variance)

    ################ SCVIS VAE ################
    VAE_save_file = global_dir + "/results/vae_models/" + dataset + ".pt"
    if not os.path.isfile(VAE_save_file):
        # Auto-encoder needs to be trained on the model first
        print('Training new VAE model on %s dataset' % dataset)
        trainVAE(x, global_dir, dataset)  # normalizing using np.max(np.abs(x)) not necessary as it equals 1
    # Once trained, it loads existing model, also for reproducability
    VAE_model = torch.load(VAE_save_file)['model_state_dict']
    print('Loaded VAE model for %s dataset' % dataset)
    VAE_net = VAE(input_dim=x.shape[1], latent_dim=2)
    VAE_net.load_state_dict(VAE_model)
    VAE_net.eval()

    def transform_vae(x, VAE_net):
        x = np.array(x)
        # A single sample is promoted to a one-row 2-D array.
        if len(x.shape) == 1:
            x = x.reshape(1, -1)
        # Inference only: no gradients are tracked while encoding.
        with torch.no_grad():
            x_batch = torch.from_numpy(x).float()
            encoder_mu, encoder_log_var = VAE_net.encoder(x_batch, p=1.0)
            batch_z = VAE_net.sampling(encoder_mu,
                                       encoder_log_var,
                                       batch_size=len(x),
                                       eval=True).numpy()
        # Unlike the other transformers, this output is not rescaled.
        return np.array(batch_z, dtype=float)

    return transform_functions
train_df = pd.read_csv(os.path.join(train_dir, 'train.csv'), index_col='ID') # Transforming target variable to normal distribution train_df['target'] = stats.boxcox(train_df['target'])[0] # Scaling train data print('Scaling Train Data') std_scale = preprocessing.StandardScaler().fit(train_df.iloc[:, 1:]) train_df_scaled = std_scale.transform(train_df.iloc[:, 1:]) # Fitting and transforming train data with sparse PCA print('Fitting and transforming Train Data with Sparse PCA algorithm') n_components = args.n_components sparse_sm = decomposition.SparsePCA(n_components=n_components) sparse_train = sparse_sm.fit_transform(train_df_scaled) train_PCA_output_path = os.path.join('/opt/ml/processing/train', 'train_sparse_pca.csv') test_PCA_output_path = os.path.join('/opt/ml/processing/test', 'test_sparse_pca.csv') # Saving transformed train data pd.concat([ train_df['target'], pd.DataFrame( sparse_train, columns=['c{}'.format(num + 1) for num in range(n_components)], index=train_df.index) ],
# Kernel PCA projection of the digits onto n_com components
print("Computing Kernel PCA projection")
t0 = time()
kpca = decomposition.KernelPCA(n_components=n_com)
X_kpca = kpca.fit_transform(X)
plot_embedding(
    X_kpca,
    "Kernel Principal Components projection of the digits (time %.2fs)" %
    (time() - t0))

# ----------------------------------------------------------------------
# Sparse PCA projection of the digits onto n_com components
print("Computing Sparce PCA projection")
t0 = time()
spca = decomposition.SparsePCA(n_components=n_com)
X_spca = spca.fit_transform(X)
# Plot the sparse-PCA embedding computed just above. The previous code
# passed X_pca here, so the plot under this title showed another projection.
plot_embedding(
    X_spca,
    "Sparce Principal Components projection of the digits (time %.2fs)" %
    (time() - t0))

# ----------------------------------------------------------------------
# Isomap projection of the digits dataset
print("Computing Isomap embedding")
t0 = time()
iso = manifold.Isomap(n_neighbors, n_components=n_com)
X_iso = iso.fit_transform(X)
print("Done.")
plot_embedding(X_iso,
               "Isomap projection of the digits (time %.2fs)" % (time() - t0))
def __init__(self, source):
    """Scale ``source`` to [0, 1] per feature and keep its 2-D sparse PCA.

    The projected data is stored in ``self.return_data``.
    """
    scaled_source = preprocessing.MinMaxScaler().fit_transform(source)
    reducer = decomposition.SparsePCA(n_components=2)
    self.return_data = reducer.fit_transform(scaled_source)
Y_TRAIN = traindata[:, -1]
X_TEST = testdata[:, 2:-1]
Y_TEST = testdata[:, -1]
max_score = 0
num_components = 0

# Initialize sequence of estimators:
#   Data pre-processing
#   Feature selection
#   Classifier
estimators = []
estimators.append(('scaler', preprocessing.StandardScaler()))
estimators.append(('FeatureSelection', decomposition.SparsePCA()))
# The classifier is bound to a name first; an assignment inside the tuple
# literal is a syntax error.
SGD = SGDClassifier(loss='log', penalty='elasticnet', n_iter=100)
estimators.append(('clf', SGD))

# Form a pipeline from the estimators
model = Pipeline(estimators)

# Set hyper parameters for grid search. Keys are '<step name>__<parameter>'
# and must match the step names used above.
parameters = {
    'FeatureSelection__n_components': (4, 8, 12, 16),
    'FeatureSelection__alpha': (0.1, 1, 10, 100),
    'clf__alpha': (0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001),
    # was 'clef__l1_ratio', which names no pipeline step
    'clf__l1_ratio': (0, 0.15, 0.5, 0.85, 1),
    'clf__n_iter': (10, 50, 80, 250)
}
# Registry of projection techniques. Each entry maps a short name to a pair
# (estimator instance, parameter grid), where the grid maps a parameter name
# to the list of candidate values. Entries are in alphabetical order.
all_projections['LAMP'] = (vp.LAMP(), {'verbose': [False], 'fraction_delta': [2.0, 8.0, 12.0], 'n_iterations': [100, 200], 'sample_type': ['random', 'clustering_centroid']})
all_projections['LE'] = (manifold.SpectralEmbedding(), {'n_components': [2], 'affinity': ['nearest_neighbors'], 'random_state': [42]})
all_projections['LISO'] = (vp.LandmarkIsomap(), {'verbose': [False], 'n_neighbors': [4, 8, 16], 'dissimilarity_type': ['euclidean']})
all_projections['LLC'] = (drtoolbox.LLC(), {'k': [8, 12], 'n_analyzers': [10, 20], 'max_iter': [200, 400], 'verbose': [False]})
# LLE, LTSA and MLLE share one estimator class and differ only in 'method'.
all_projections['LLE'] = (manifold.LocallyLinearEmbedding(), {'n_components': [2], 'n_neighbors': [5, 7, 11], 'max_iter': [100, 200], 'reg': [0.001, 0.01, 0.1], 'method': ['standard'], 'eigen_solver': ['dense'], 'random_state': [42]})
all_projections['LLTSA'] = (tapkee.LinearLocalTangentSpaceAlignment(), {'n_neighbors': [4, 7, 11], 'verbose': [False]})  # subject to "eigendecomposition failed" errors (Eigen's NoConvergence)
all_projections['LMDS'] = (tapkee.LandmarkMDS(), {'n_neighbors': [4, 7, 11], 'verbose': [False]})
all_projections['LMNN'] = (drtoolbox.LMNN(), {'k': [3, 5, 7], 'verbose': [False]})
all_projections['LMVU'] = (drtoolbox.LandmarkMVU(), {'k1': [3, 5, 7], 'k2': [8, 12, 15], 'verbose': [False]})
all_projections['LPP'] = (tapkee.LocalityPreservingProjections(), {'n_neighbors': [4, 7, 11], 'verbose': [False]})  # subject to "eigendecomposition failed" errors (Eigen's NoConvergence)
all_projections['LSP'] = (vp.LSP(), {'verbose': [False], 'fraction_delta': [2.0, 8.0, 12.0], 'n_iterations': [100, 200], 'n_neighbors': [4, 8, 16], 'control_point_type': ['random', 'kmeans'], 'dissimilarity_type': ['euclidean']})
all_projections['LTSA'] = (manifold.LocallyLinearEmbedding(), {'n_components': [2], 'n_neighbors': [5, 7, 11], 'max_iter': [100, 200], 'reg': [0.001, 0.01, 0.1], 'method': ['ltsa'], 'eigen_solver': ['dense'], 'random_state': [42]})
all_projections['MC'] = (drtoolbox.ManifoldChart(), {'n_analyzers': [10, 20], 'max_iter': [200, 400], 'verbose': [False]})
all_projections['MCML'] = (drtoolbox.MCML(), {'verbose': [False]})
# MDS and NMDS share one estimator class and differ only in 'metric'.
all_projections['MDS'] = (manifold.MDS(), {'n_components': [2], 'n_init': [2, 4], 'metric': [True], 'max_iter': [300, 500], 'random_state': [42]})
all_projections['MLLE'] = (manifold.LocallyLinearEmbedding(), {'n_components': [2], 'n_neighbors': [5, 7, 11], 'max_iter': [100, 200], 'reg': [0.001, 0.01, 0.1], 'method': ['modified'], 'eigen_solver': ['dense'], 'random_state': [42]})
all_projections['MVU'] = (drtoolbox.MVU(), {'k': [8, 12, 15], 'verbose': [False]})
all_projections['NMDS'] = (manifold.MDS(), {'n_components': [2], 'n_init': [2, 4], 'metric': [False], 'max_iter': [300, 500], 'random_state': [42]})
all_projections['NMF'] = (decomposition.NMF(), {'n_components': [2], 'init': ['random', 'nndsvdar'], 'beta_loss': ['frobenius'], 'max_iter': [200, 400], 'alpha': [0, 0.5], 'l1_ratio': [0.0, 0.5], 'random_state': [42]})
all_projections['PBC'] = (vp.ProjectionByClustering(), {'verbose': [False], 'fraction_delta': [2.0, 8.0, 12.0], 'n_iterations': [100, 200], 'init_type': ['fastmap', 'random'], 'dissimilarity_type': ['euclidean'], 'cluster_factor': [1.5, 4.5, 9.0]})
all_projections['PCA'] = (decomposition.PCA(), {'n_components': [2], 'random_state': [42]})
all_projections['PLSP'] = (vp.PLSP(), {'dissimilarity_type': ['euclidean'], 'verbose': [False], 'sample_type': ['clustering']})
all_projections['PPCA'] = (drtoolbox.ProbPCA(), {'max_iter': [200, 400], 'verbose': [False]})
all_projections['RSAM'] = (vp.RapidSammon(), {'verbose': [False], 'dissimilarity_type': ['euclidean']})
# NOTE(review): 'ridge_alpha' lists 0.05 twice, so the same setting appears
# twice in the grid - confirm whether a different value was intended.
all_projections['SPCA'] = (decomposition.SparsePCA(), {'n_components': [2], 'alpha': [0.01, 0.1, 0.5], 'ridge_alpha': [0.05, 0.05, 0.5], 'max_iter': [1000, 2000], 'tol': [1e-08], 'method': ['lars'], 'random_state': [42], 'normalize_components': [True]})
all_projections['SPE'] = (tapkee.StochasticProximityEmbedding(), {'n_neighbors': [6, 12, 18], 'n_updates': [20, 70], 'max_iter': [0], 'verbose': [False]})
all_projections['SRP'] = (random_projection.SparseRandomProjection(), {'n_components': [2], 'density': ['auto'], 'random_state': [42]})
all_projections['TSNE'] = (mtsne.MTSNE(), {'n_components': [2], 'perplexity': [5.0, 15.0, 30.0, 50.0], 'early_exaggeration': [6.0, 12.0, 18.0], 'learning_rate': [200.0], 'n_iter': [1000, 3000], 'n_iter_without_progress': [300], 'min_grad_norm': [1e-07], 'metric': ['euclidean'], 'init': ['random'], 'random_state': [42], 'method': ['barnes_hut'], 'angle': [0.5], 'n_jobs': [4]})
all_projections['TSVD'] = (decomposition.TruncatedSVD(), {'n_components': [2], 'algorithm': ['randomized'], 'n_iter': [5, 10], 'random_state': [42]})
all_projections['UMAP'] = (umap.UMAP(), {'n_components': [2], 'random_state': [42], 'n_neighbors': [5, 10, 15], 'metric': ['euclidean'], 'init': ['spectral', 'random'], 'min_dist': [0.001, 0.01, 0.1, 0.5], 'spread': [1.0], 'angular_rp_forest': [False]})
return tmp
# ^ tail of a function whose header lies outside the visible source.

from sklearn import decomposition

# Load the three training extracts through the project loader.
l = AllStateDataLoader()
print("Extraction data_2...")
data_2 = l.get_data_2_train()
print("Extraction data_3...")
data_3 = l.get_data_3_train()
print("Extraction data_all...")
data_all = l.get_data_all_train()
data_all_reindexed = duplicate_data(data_all)
# Three-component sparse PCA on the unscaled feature matrix.
pca = decomposition.SparsePCA(n_components=3, verbose=True)
X = get_X_without_scaler(data_all_reindexed)
pca.fit(X)
X_pca = pca.transform(X)
# Axis limits; only referenced by the commented-out plt.axis call below.
x1 = -600
x2 = 400
y1 = -80
y2 = 80
plt.figure()
plt.subplot(311)
# Rows where real_A == 1: third component on x against second on y.
# NOTE(review): the mask comes from data_all while X_pca was computed from
# data_all_reindexed - confirm both have the same row order and length.
plt.plot(X_pca[data_all["real_A"] == 1,2], X_pca[data_all["real_A"] == 1,1], "b+")
#plt.axis((x1,x2,y1,y2))
plt.subplot(312)
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, decomposition, manifold
from itertools import cycle


def load_data():
    """Return the iris feature matrix and its class labels."""
    iris = datasets.load_iris()
    return iris.data, iris.target


# Decomposition estimators to compare. PCA_Set_Name below holds one display
# label per estimator, in the same order.
PCA_Set = [
    decomposition.PCA(n_components=None),
    decomposition.PCA(svd_solver='randomized'),
    decomposition.SparsePCA(n_components=None),
    decomposition.IncrementalPCA(n_components=None),
    decomposition.KernelPCA(n_components=None, kernel='linear'),
    decomposition.KernelPCA(n_components=None, kernel='rbf'),
    decomposition.KernelPCA(n_components=None, kernel='poly'),
    decomposition.KernelPCA(n_components=None, kernel='sigmoid'),
    decomposition.FastICA(n_components=None)
]
PCA_Set_Name = [
    'Default', 'Randomized', 'Sparse', 'Incremental', 'Kernel(linear)',
    'Kernel(rbf)', 'Kernel(poly)', 'Kernel(sigmoid)', 'ICA'
]


def plot_PCA(*data):
    # Expects exactly two positional arguments, unpacked as X and Y.
    # NOTE(review): the function probably continues beyond the visible
    # source; only the figure creation is shown here.
    X, Y = data
    fig = plt.figure("PCA", figsize=(20, 8))
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import decomposition

# DataFrame.from_csv has been removed from pandas; read_csv without an index
# column is the equivalent of from_csv(..., index_col=None).
df = pd.read_csv('data_1k_num.csv')

# Encode the categorical columns as 1-based integer codes, numbered in order
# of first appearance. The column is reassigned instead of using a chained
# inplace replace, which is not guaranteed to modify `df`.
for column in ('JobRole', 'Department'):
    for code, label in enumerate(df[column].unique(), start=1):
        df[column] = df[column].replace(label, code)

# Sparse PCA with 20 components; the loadings (one row per component) are
# written to CSV and printed.
est = decomposition.SparsePCA(20, alpha=0.5, max_iter=100)
est.fit(df)
df1 = pd.DataFrame(est.components_)
df1.to_csv("Reduced_Components.csv")
print(est.components_)
def _projection_shift(projin):
    """Return the positivity offset estimated from the images in *projin*.

    Samples up to roughly 256 images spread evenly through the file and
    combines their minimum, sigma and mean. Returns None when the file
    contains no images.
    """
    nfile2 = EMUtil.get_image_count(projin)
    pmin = 0
    pstd = 0
    pmean = 0
    pn = 0
    # max(1, ...) keeps the stride valid for files with fewer than 256
    # images, where nfile2 // 256 is 0 and range() would raise.
    for i in range(0, nfile2, max(1, nfile2 // 256)):  # read a scattering of images
        # Read image i; without the index every iteration read image 0.
        tmp = EMData(projin, i)
        pmin = min(pmin, tmp["minimum"])
        pstd = max(pstd, tmp["sigma_nonzero"])
        pmean += tmp["mean"]
        pn += 1
    if pn == 0:
        return None
    pmean /= pn
    # NOTE(review): the matching expression for the input data uses
    # -data.min(); here pmin enters with a positive sign. Left unchanged -
    # confirm the intended sign.
    return max(pmin + pstd * 0.5, pstd * 4.0 - pmean)


def main():
    """Command-line entry point: run an MSA decomposition on an image stack.

    Reads the input stack (args[0]) into memory, optionally masks,
    normalizes and mean-subtracts it, fits the decomposition selected with
    --mode, writes the basis images to args[1] and, when a third argument
    is given, writes the projections of the full stack (or of --projin)
    into the basis subspace to args[2].
    """
    progname = os.path.basename(sys.argv[0])
    usage = (
        "prog [options] <input stack> <output basis> [reprojections] "
        "This too provides a variety of dimensionality reduction methods. "
        "This new version uses scikit.learn, which provides a greater "
        "variety of algorithms, but must load all data into memory. If "
        "working with a large file, you may want to consider using --step "
        "to operate on a limited subset of the data. If specified, "
        "[reprojections] will contain projections of the full input stack "
        "(ignoring --step) into the basis subspace represented as a single "
        "image. This obviates the need for e2basis.py, and permits use of "
        "nonlinear decompositions. --- Performs multivariate statistical "
        "analysis on a stack of images. Writes a set of Eigenimages which "
        "can be uses as a basis set for reducing the dimensionality of a "
        "data set (noise reduction). Typically this basis set is then used "
        "to reproject the data (e2basis.py) and classify the data based on "
        "the projected vectors. If the output file supports arbitrary "
        "metadata (like HDF), Eigenvalues are stored in the 'eigval' "
        "parameter in each image. Note: The mean value is subtracted from "
        "each image prior to MSA calculation. The mean image is stored as "
        "the first image in the output file, though it is not part of the "
        "orthonormal basis when handled this way.")

    parser = EMArgumentParser(usage=usage, version=EMANVERSION)
    parser.add_argument(
        "--mode",
        type=str,
        help="Mode should be one of: pca, sparsepca, fastica, factan, lda, nmf",
        default="pca")
    parser.add_argument(
        "--nomean",
        action="store_true",
        help="Suppress writing the average image as the first output image",
        default=False)
    parser.add_argument(
        "--nomeansub",
        action="store_true",
        help="Suppress subtracting the mean from each input image, also implies --nomean",
        default=False)
    parser.add_argument("--nbasis",
                        "-n",
                        type=int,
                        help="Number of basis images to generate.",
                        default=20)
    parser.add_argument(
        "--maskfile",
        "-M",
        type=str,
        help="File containing a mask defining the pixels to include in the Eigenimages")
    parser.add_argument(
        "--projin",
        type=str,
        default=None,
        help="When generating subspace projections, use this file instead of the input used for the MSA")
    parser.add_argument(
        "--normproj",
        action="store_true",
        help="When generating subspace projections, normalize each projection vector to unit length",
        default=False)
    parser.add_argument(
        "--mask",
        type=int,
        help="Mask radius, negative values imply ny/2+1+mask, --mask=0 disables, --maskfile overrides",
        default=0)
    parser.add_argument(
        "--simmx",
        type=str,
        help="Will use transformations from simmx on each particle prior to analysis")
    parser.add_argument(
        "--normalize",
        action="store_true",
        help="Perform a careful normalization of input images before MSA. Otherwise normalization is not modified until after mean subtraction.",
        default=False)
    parser.add_argument(
        "--step",
        type=str,
        default="0,1",
        help="Specify <init>,<step>[,last]. Processes only a subset of the input data. For example, 0,2 would process only the even numbered particles")
    parser.add_argument(
        "--ppid",
        type=int,
        help="Set the PID of the parent process, used for cross platform PPID",
        default=-1)
    parser.add_argument(
        "--verbose",
        "-v",
        dest="verbose",
        action="store",
        metavar="n",
        type=int,
        default=0,
        help="verbose level [0-9], higher number means higher level of verboseness")
    #parser.add_argument("--gui",action="store_true",help="Start the GUI for interactive boxing",default=False)
    #parser.add_argument("--boxsize","-B",type=int,help="Box size in pixels",default=-1)
    #parser.add_argument("--dbin","-D",type=str,help="Filename to read an existing box database from",default=None)

    (options, args) = parser.parse_args()
    if len(args) < 2:
        parser.error("Input and output filenames required")

    logid = E2init(sys.argv, options.ppid)

    if options.verbose > 0:
        print("Beginning MSA")

    # Number of images in the input file
    nfile = EMUtil.get_image_count(args[0])

    # --step is <init>,<step>[,last]; a single value is taken as the step.
    try:
        step = [int(i) for i in options.step.split(",")]
        if len(step) == 1:
            step = (0, step[0], nfile)
        elif len(step) == 2:
            step.append(nfile)
        elif len(step) == 3:
            if step[2] <= 0:
                step[2] += nfile  # undocumented negative final value permitted
        else:
            raise Exception
    except Exception:
        print("Invalid --step specification")
        sys.exit(1)

    # setup mask image
    if options.maskfile:
        mask = EMData(options.maskfile, 0)
        if mask["mean_nonzero"] != 1.0:
            print("ERROR: maskfile must be a binary mask (1/0 only)")
            sys.exit(1)
    else:
        # default is no masking
        mask = EMData(args[0], 0)
        mask.to_one()
        # negative values handled by mask.sharp
        if options.mask != 0:
            mask.process_inplace("mask.sharp", {"outer_radius": options.mask})

    # Memory usage warning >2G raw data
    n = (step[2] - step[0]) // step[1]
    nval = int(mask["square_sum"])
    # print(args[0],n,nval)
    if options.verbose or n * nval > 500000000:
        print("Estimated memory usage (mb): ", n * nval * 4 / 2**20)

    # Read all image data into numpy array
    if options.simmx:
        data = simmx_get(args[0], options.simmx, mask, step)
    else:
        data = normal_get(args[0], mask, step)

    if options.normalize:
        for i in range(len(data)):
            data[i] /= np.linalg.norm(data[i])

    # first output image is the mean of the input vectors, which has been subtracted from each vector
    # Remove any previous output file; a missing file is not an error.
    try:
        os.unlink(args[1])
    except OSError:
        pass
    mean = np.mean(data, 0)
    if not options.nomeansub:
        for i in range(len(data)):
            data[i] -= mean
    #from_numpy(mean).process("misc.mask.pack",{"mask":mask,"unpack":1}).write_image(args[1],0)

    # Offset added to the data for the methods that need positive input.
    shift = 0
    # This is where the actual action takes place!
    if options.mode == "pca":
        msa = skdc.PCA(n_components=options.nbasis)
        # print(data.shape)
        msa.fit(data)
    elif options.mode == "factan":
        msa = skdc.FactorAnalysis(n_components=options.nbasis)
        msa.fit(data)
    elif options.mode == "sparsepca":
        msa = skdc.SparsePCA(n_components=options.nbasis)
        # print(data.shape)
        msa.fit(data)
    elif options.mode == "fastica":
        msa = skdc.FastICA(n_components=options.nbasis,
                           algorithm="parallel",
                           max_iter=500,
                           tol=0.001)
        msa.fit(data)
    elif options.mode == "lda":
        shift = max(-data.min() + data.std() * 0.5,
                    data.std() * 4.0 - data.mean())  # we need positivity
        # if we are processing projections later, we need to try to insure that they will be positive as well
        if options.projin:
            shiftp = _projection_shift(options.projin)
            if shiftp is not None:
                shift = max(shift, shiftp)
        data += shift
        msa = skdc.LatentDirichletAllocation(n_components=options.nbasis,
                                             learning_method="online",
                                             verbose=1)
        msa.fit(data)
    elif options.mode == "nmf":
        shift = max(-data.min() + data.std() * 1.5,
                    data.std() * 4.0 - data.mean())  # we need positivity
        # if we are processing projections later, we need to try to insure that they will be positive as well
        if options.projin:
            shiftp = _projection_shift(options.projin)
            if shiftp is not None:
                shift = max(shift, shiftp)
        data += shift
        msa = skdc.NMF(n_components=options.nbasis, init="nndsvd")
        msa.fit(data)
    else:
        # Report an unknown mode here rather than failing later on an
        # undefined model.
        print("ERROR: unknown --mode '%s'" % options.mode)
        sys.exit(1)

    # write mean
    if not options.nomean and not options.nomeansub:
        mn = from_numpy(mean).process("misc.mask.pack", {
            "mask": mask,
            "unpack": 1
        })
        mn["eigval"] = 0  # we add this artifically to the mean image, both to mark it, and to make some other code requiring it work. It isn't meaningful as a value, obviously
        mn.write_image(args[1], 0)

    # print(msa.components_.shape)
    # c=from_numpy(msa.components_.copy()).write_image("z.hdf",0)

    if options.verbose > 0:
        print("MSA complete")

    # write other basis vectors
    # The basis starts at image 1 when the mean occupies image 0.
    if options.nomean or options.nomeansub:
        offset = 0
    else:
        offset = 1
    for i, v in enumerate(msa.components_):
        im = from_numpy(v.copy()).process("misc.mask.pack", {
            "mask": mask,
            "unpack": 1
        })
        if options.mode == "pca":
            im["eigval"] = float(msa.singular_values_[i])
            im["explvarfrac"] = float(msa.explained_variance_ratio_[i])
            if options.verbose:
                print("Explained variance: ", im["explvarfrac"],
                      "\tSingular Value: ", im["eigval"])
        elif options.mode == "fastica":
            if im["sigma"] > 0:
                im.mult(1.0 / im["sigma"])  # fastica seems to produce very small vector lengths
        im.write_image(args[1], i + offset)

    # if requested we use the model to generate reprojections of the full set of input images
    # into the new subspace. This permits use of nonlinear algorithms (the components_ output
    # is not directly usable)
    if len(args) > 2:
        # Remove any previous reprojection file; a missing file is fine.
        try:
            os.unlink(args[2])
        except OSError:
            pass

        if options.projin is not None:
            images = options.projin
            nfile2 = EMUtil.get_image_count(images)
            step2 = [0, 1, nfile2]
        else:
            nfile2 = nfile
            step2 = step
            images = args[0]

        if options.verbose:
            print("Reprojecting input data into subspace")

        chunksize = min(max(2, 250000000 // nval),
                        step2[2])  # limiting memory usage for this step to ~2G
        out = EMData(options.nbasis, step2[2])  # we hold the full set of reprojections in memory, though
        start = 0
        while (start < step2[2]):
            stept = [start, 1, min(step2[2], start + chunksize)]
            if options.verbose:
                print(stept)

            # read a chunk of data
            if options.simmx:
                chunk = simmx_get(images, options.simmx, mask, stept)
            else:
                chunk = normal_get(images, mask, stept)
            if shift != 0:
                chunk += shift  # for methods requiring positivity
                if chunk.min() <= 0:
                    print(
                        "ERROR: Results invalid, negative values. Shifting to prevent crash. Chunk ",
                        stept, " has mean=", chunk.mean(), "std=",
                        chunk.std(), "min=", chunk.min())
                    chunk += -chunk.min()

            proj = msa.transform(chunk)  # into subspace
            if options.normproj:
                for i in range(len(proj)):
                    proj[i] /= np.linalg.norm(proj[i])
            im = from_numpy(proj.copy())
            out.insert_clip(im, (0, start, 0))
            start += chunksize

        # write results
        out.write_image(args[2], 0)

    E2end(logid)
    if options.mode not in ("pca", "sparsepca", "fastica"):
        print(
            "WARNING: While projection vectors are reliable, use of modes other than PCA or ICA may involve nonlinarities, meaning the 'Eigenimages' may not be interpretable in the usual way."
        )
def dim_reduction(x, alg='pca', n_comp=2048):
    """Fit a decomposition on ``x`` and return the transformed data.

    Parameters
    ----------
    x
        Data handed unchanged to the estimator's ``fit_transform``.
    alg
        ``'pca'`` selects ``decomposition.PCA``; every other value selects
        ``decomposition.SparsePCA``.
    n_comp
        Forwarded to the estimator as ``n_components``.

    Returns
    -------
    Whatever the chosen estimator's ``fit_transform`` returns.
    """
    # Only the selected class attribute is looked up, as in the if/else form.
    reducer_cls = decomposition.PCA if alg == 'pca' else decomposition.SparsePCA
    return reducer_cls(n_components=n_comp).fit_transform(x)
def _eval_search_params(params_builder): search_params = {} for p in params_builder['param_set']: search_list = p['sp_list'].strip() if search_list == '': continue param_name = p['sp_name'] if param_name.lower().endswith(NON_SEARCHABLE): print("Warning: `%s` is not eligible for search and was " "omitted!" % param_name) continue if not search_list.startswith(':'): safe_eval = SafeEval(load_scipy=True, load_numpy=True) ev = safe_eval(search_list) search_params[param_name] = ev else: # Have `:` before search list, asks for estimator evaluatio safe_eval_es = SafeEval(load_estimators=True) search_list = search_list[1:].strip() # TODO maybe add regular express check ev = safe_eval_es(search_list) preprocessings = ( preprocessing.StandardScaler(), preprocessing.Binarizer(), preprocessing.MaxAbsScaler(), preprocessing.Normalizer(), preprocessing.MinMaxScaler(), preprocessing.PolynomialFeatures(), preprocessing.RobustScaler(), feature_selection.SelectKBest(), feature_selection.GenericUnivariateSelect(), feature_selection.SelectPercentile(), feature_selection.SelectFpr(), feature_selection.SelectFdr(), feature_selection.SelectFwe(), feature_selection.VarianceThreshold(), decomposition.FactorAnalysis(random_state=0), decomposition.FastICA(random_state=0), decomposition.IncrementalPCA(), decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS), decomposition.LatentDirichletAllocation(random_state=0, n_jobs=N_JOBS), decomposition.MiniBatchDictionaryLearning(random_state=0, n_jobs=N_JOBS), decomposition.MiniBatchSparsePCA(random_state=0, n_jobs=N_JOBS), decomposition.NMF(random_state=0), decomposition.PCA(random_state=0), decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS), decomposition.TruncatedSVD(random_state=0), kernel_approximation.Nystroem(random_state=0), kernel_approximation.RBFSampler(random_state=0), kernel_approximation.AdditiveChi2Sampler(), kernel_approximation.SkewedChi2Sampler(random_state=0), cluster.FeatureAgglomeration(), skrebate.ReliefF(n_jobs=N_JOBS), 
skrebate.SURF(n_jobs=N_JOBS), skrebate.SURFstar(n_jobs=N_JOBS), skrebate.MultiSURF(n_jobs=N_JOBS), skrebate.MultiSURFstar(n_jobs=N_JOBS), imblearn.under_sampling.ClusterCentroids(random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.CondensedNearestNeighbour( random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.EditedNearestNeighbours(random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.RepeatedEditedNearestNeighbours( random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.InstanceHardnessThreshold( random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.NearMiss(random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.NeighbourhoodCleaningRule( random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.OneSidedSelection(random_state=0, n_jobs=N_JOBS), imblearn.under_sampling.RandomUnderSampler(random_state=0), imblearn.under_sampling.TomekLinks(random_state=0, n_jobs=N_JOBS), imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS), imblearn.over_sampling.RandomOverSampler(random_state=0), imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS), imblearn.over_sampling.SVMSMOTE(random_state=0, n_jobs=N_JOBS), imblearn.over_sampling.BorderlineSMOTE(random_state=0, n_jobs=N_JOBS), imblearn.over_sampling.SMOTENC(categorical_features=[], random_state=0, n_jobs=N_JOBS), imblearn.combine.SMOTEENN(random_state=0), imblearn.combine.SMOTETomek(random_state=0)) newlist = [] for obj in ev: if obj is None: newlist.append(None) elif obj == 'all_0': newlist.extend(preprocessings[0:35]) elif obj == 'sk_prep_all': # no KernalCenter() newlist.extend(preprocessings[0:7]) elif obj == 'fs_all': newlist.extend(preprocessings[7:14]) elif obj == 'decomp_all': newlist.extend(preprocessings[14:25]) elif obj == 'k_appr_all': newlist.extend(preprocessings[25:29]) elif obj == 'reb_all': newlist.extend(preprocessings[30:35]) elif obj == 'imb_all': newlist.extend(preprocessings[35:54]) elif type(obj) is int and -1 < obj 
< len(preprocessings): newlist.append(preprocessings[obj]) elif hasattr(obj, 'get_params'): # user uploaded object if 'n_jobs' in obj.get_params(): newlist.append(obj.set_params(n_jobs=N_JOBS)) else: newlist.append(obj) else: sys.exit("Unsupported estimator type: %r" % (obj)) search_params[param_name] = newlist return search_params
def get_search_params(params_builder):
    """Build the search-parameter dict for a pipeline from tool input.

    Every entry of ``params_builder['param_set']`` carries a
    ``search_param_selector`` with ``search_p`` (text of the form
    ``name: literal``) and ``selected_param_type``. Blank ``search_p``
    entries are skipped. The result key is built as follows:

    * ``selected_param_type == 'final_estimator_p'`` ->
      ``'estimator__' + name``
    * otherwise -> ``'preprocessing_' + selected_param_type[5:6] + '__' + name``

    A name ending in ``-`` has the literal evaluated with the
    estimator-enabled ``SafeEval``; the trailing ``-`` is dropped from the
    key and ``n_jobs`` is set to ``N_JOBS`` on every evaluated object that
    exposes it. An empty name (only allowed for preprocessing steps)
    replaces the whole step: the literal is evaluated to a list whose items
    (``None``, group alias, integer index, or object with ``get_params``)
    are expanded into estimator instances stored under
    ``'preprocessing_' + selected_param_type[5:6]``.

    Raises
    ------
    AssertionError
        If ``search_p`` does not contain exactly one colon.
    SystemExit
        If the parameter is ``n_jobs``, if the name is skipped for the final
        estimator, or if a list item is of an unsupported kind.

    Returns
    -------
    dict
        Search-parameter key -> evaluated values.
    """
    search_params = {}
    safe_eval = SafeEval(load_scipy=True, load_numpy=True)
    safe_eval_es = SafeEval(load_estimators=True)

    for p in params_builder['param_set']:
        search_p = p['search_param_selector']['search_p']
        if search_p.strip() == '':
            continue
        param_type = p['search_param_selector']['selected_param_type']

        lst = search_p.split(':')
        if len(lst) != 2:
            # Raised explicitly rather than via `assert`, which is removed
            # under `python -O` and would let malformed input through. The
            # exception type and message are unchanged.
            raise AssertionError(
                "Error, make sure there is one and only one colon in search "
                "parameter input.")
        literal = lst[1].strip()
        param_name = lst[0].strip()

        if param_name:
            if param_name.lower() == 'n_jobs':
                sys.exit("Parameter `%s` is invalid for search." % param_name)
            elif not param_name.endswith('-'):
                ev = safe_eval(literal)
                if param_type == 'final_estimator_p':
                    search_params['estimator__' + param_name] = ev
                else:
                    search_params['preprocessing_' + param_type[5:6]
                                  + '__' + param_name] = ev
            else:
                # only for estimator eval, add `-` to the end of param
                # TODO maybe add regular express check
                ev = safe_eval_es(literal)
                for obj in ev:
                    if 'n_jobs' in obj.get_params():
                        obj.set_params(n_jobs=N_JOBS)
                if param_type == 'final_estimator_p':
                    search_params['estimator__' + param_name[:-1]] = ev
                else:
                    search_params['preprocessing_' + param_type[5:6]
                                  + '__' + param_name[:-1]] = ev
        elif param_type != 'final_estimator_p':
            # No parameter name: the literal lists whole preprocessing steps.
            # TODO regular express check ?
            ev = safe_eval_es(literal)
            # Index positions matter: the alias slices and integer lookups
            # below refer to this exact ordering.
            preprocessors = [
                preprocessing.StandardScaler(),
                preprocessing.Binarizer(),
                preprocessing.Imputer(),
                preprocessing.MaxAbsScaler(),
                preprocessing.Normalizer(),
                preprocessing.MinMaxScaler(),
                preprocessing.PolynomialFeatures(),
                preprocessing.RobustScaler(),
                feature_selection.SelectKBest(),
                feature_selection.GenericUnivariateSelect(),
                feature_selection.SelectPercentile(),
                feature_selection.SelectFpr(),
                feature_selection.SelectFdr(),
                feature_selection.SelectFwe(),
                feature_selection.VarianceThreshold(),
                decomposition.FactorAnalysis(random_state=0),
                decomposition.FastICA(random_state=0),
                decomposition.IncrementalPCA(),
                decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                decomposition.LatentDirichletAllocation(random_state=0,
                                                        n_jobs=N_JOBS),
                decomposition.MiniBatchDictionaryLearning(random_state=0,
                                                          n_jobs=N_JOBS),
                decomposition.MiniBatchSparsePCA(random_state=0,
                                                 n_jobs=N_JOBS),
                decomposition.NMF(random_state=0),
                decomposition.PCA(random_state=0),
                decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.TruncatedSVD(random_state=0),
                kernel_approximation.Nystroem(random_state=0),
                kernel_approximation.RBFSampler(random_state=0),
                kernel_approximation.AdditiveChi2Sampler(),
                kernel_approximation.SkewedChi2Sampler(random_state=0),
                cluster.FeatureAgglomeration(),
                skrebate.ReliefF(n_jobs=N_JOBS),
                skrebate.SURF(n_jobs=N_JOBS),
                skrebate.SURFstar(n_jobs=N_JOBS),
                skrebate.MultiSURF(n_jobs=N_JOBS),
                skrebate.MultiSURFstar(n_jobs=N_JOBS),
                imblearn.under_sampling.ClusterCentroids(random_state=0,
                                                         n_jobs=N_JOBS),
                imblearn.under_sampling.CondensedNearestNeighbour(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.EditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.RepeatedEditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.InstanceHardnessThreshold(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NearMiss(random_state=0,
                                                 n_jobs=N_JOBS),
                imblearn.under_sampling.NeighbourhoodCleaningRule(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.OneSidedSelection(random_state=0,
                                                          n_jobs=N_JOBS),
                imblearn.under_sampling.RandomUnderSampler(random_state=0),
                imblearn.under_sampling.TomekLinks(random_state=0,
                                                   n_jobs=N_JOBS),
                imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.RandomOverSampler(random_state=0),
                imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SVMSMOTE(random_state=0,
                                                n_jobs=N_JOBS),
                imblearn.over_sampling.BorderlineSMOTE(random_state=0,
                                                       n_jobs=N_JOBS),
                imblearn.over_sampling.SMOTENC(categorical_features=[],
                                               random_state=0,
                                               n_jobs=N_JOBS),
                imblearn.combine.SMOTEENN(random_state=0),
                imblearn.combine.SMOTETomek(random_state=0),
            ]
            newlist = []
            for obj in ev:
                if obj is None:
                    newlist.append(None)
                elif obj == 'all_0':
                    newlist.extend(preprocessors[0:36])
                elif obj == 'sk_prep_all':      # no KernalCenter()
                    newlist.extend(preprocessors[0:8])
                elif obj == 'fs_all':
                    newlist.extend(preprocessors[8:15])
                elif obj == 'decomp_all':
                    newlist.extend(preprocessors[15:26])
                elif obj == 'k_appr_all':
                    newlist.extend(preprocessors[26:30])
                elif obj == 'reb_all':
                    newlist.extend(preprocessors[31:36])
                elif obj == 'imb_all':
                    newlist.extend(preprocessors[36:55])
                elif type(obj) is int and -1 < obj < len(preprocessors):
                    newlist.append(preprocessors[obj])
                elif hasattr(obj, 'get_params'):       # user object
                    if 'n_jobs' in obj.get_params():
                        newlist.append(obj.set_params(n_jobs=N_JOBS))
                    else:
                        newlist.append(obj)
                else:
                    sys.exit("Unsupported preprocessor type: %r" % (obj))
            search_params['preprocessing_' + param_type[5:6]] = newlist
        else:
            sys.exit("Parameter name of the final estimator can't be skipped!")

    return search_params
def main():
    """Run the per-cancer-type NMF consensus clustering workflow.

    Steps, in order:

    1. Parse arguments and load the comma-separated matrix at
       ``args.data_file``.
    2. Clean it with ``Matrix.trim_cols`` and ``Matrix.to_binary``.
    3. Label every row with the index of the hard-coded partition it falls
       in, fit a ``LogisticRegression`` on those labels and print its score.
    4. Project the regression output with a 100-component ``SparsePCA`` and
       binarise it again.
    5. For each partition, try ranks 2..10 with ``Nmf``, report the best and
       worst cophenetic correlation, and write the plots under
       ``args.out_dir`` using the partition name as file prefix.
    """
    results = []
    args = parse_args()

    # there's a bug in the _sparseness method in sklearn's nmf module that is
    # hit in some edge cases.  The value it computes isn't actually needed in
    # this case, so we can just ignore this divide by 0 error
    np.seterr(invalid="ignore")

    print("Processing %s" % args.data_file)
    full_mtx = np.loadtxt(args.data_file, delimiter=',')

    # normalize / clean-up our matrix:  remove genes that are always 0, and
    # convert all non-zero values to 1
    # columns with < this percentage of mutations will be deleted
    min_col_mutation_pct = 1.5
    full_mtx = Matrix.to_binary(
        Matrix.trim_cols(full_mtx, 100 - min_col_mutation_pct))

    # (first row, last row inclusive, cancer name) of each sample group
    partitions = tuple(
        {'start_row': first, 'end_row': last, 'name': label}
        for first, last, label in (
            (0, 269, "COAD"),
            (270, 689, "KIRC"),
            (690, 941, "PRAD"),
            (942, 1284, "SKCM"),
            (1285, 1341, "UCS"),
        )
    )

    # one class label (the partition index) per sample row
    classes = [
        class_idx
        for class_idx, group in enumerate(partitions)
        for _ in range(group['start_row'], group['end_row'] + 1)
    ]

    l_reg = LogisticRegression()
    lreg_mtx = l_reg.fit_transform(full_mtx, classes)
    accuracy = l_reg.score(full_mtx, classes)
    print("accuracy = %f" % accuracy)
    print("regression dimensions : %d, %d"
          % (len(lreg_mtx), len(lreg_mtx[0])))

    print("Computing Sparse PCA projection")
    sparse_pca = decomposition.SparsePCA(n_components=100)
    pca_mtx = Matrix.to_binary(sparse_pca.fit_transform(lreg_mtx))

    #rank_range = range(2,3)
    rank_range = range(2, 11)

    def coph_of(entry):
        # ranking key: cophenetic correlation of one rank's result
        return entry['coph']

    for group in partitions:
        print("Processing %s cancer" % group['name'])

        first_row, stop_row = group['start_row'], group['end_row'] + 1
        mtx = Matrix.trim_rows(pca_mtx[first_row:stop_row])  # remove empty rows
        print("Matrix is %d by %d and %f sparse"
              % (len(mtx), len(mtx[0]), Matrix.get_sparsity(mtx)))
        # transpose to put samples into columns, genes into rows
        mtx = np.matrix.transpose(mtx)

        print("=====> Finding the optimum # clusters (range = %d to %d) <====="
              % (rank_range[0], rank_range[-1]))

        results = list()
        for num_clusters in rank_range:
            print("Trying cluster size %d " % num_clusters)
            nmf = Nmf(mtx, clusters=num_clusters)
            consensus, w_mtx, h_mtx = nmf.get_consensus_matrix()
            coph = nmf.coph_cor(consensus)
            results.append({
                'rank': num_clusters,
                'consensus': Matrix.reorder(consensus),
                'w': w_mtx,
                'h': h_mtx,
                'coph': coph,
            })

        best = max(results, key=coph_of)
        worst = min(results, key=coph_of)
        print("worst: rank %d with %f" % (worst['rank'], worst['coph']))
        print("best: rank %d with %f" % (best['rank'], best['coph']))
        print("all:", [coph_of(entry) for entry in results])

        base_name = join(args.out_dir, group['name'])
        Matrix.to_cophenetic_plot(rank_range,
                                  [coph_of(entry) for entry in results],
                                  base_name + "_cophenetic.png")
        Matrix.to_heat_map(mtx, base_name + "_a.png")
        Matrix.to_heat_map(best['w'], base_name + "_w.png")
        Matrix.to_heat_map(best['h'], base_name + "_h.png")

        #sort h matrix
        ordered_h = np.sort(best['h'], axis=0)
        Matrix.to_line_plot(ordered_h, base_name + "_h_line.png")

        # consensus plot of the best rank, then of the first four ranks tried
        Matrix.to_consensus_plot(best['consensus'], best['rank'], base_name)
        for position in range(4):
            tried = results[position]
            Matrix.to_consensus_plot(tried['consensus'], tried['rank'],
                                     base_name)