def test_compare_with_sklearn(svd_solver, batch_number):
    X = iris.data
    X_da = da.from_array(X, chunks=(3, -1))
    batch_size = X.shape[0] // batch_number
    ipca = sd.IncrementalPCA(n_components=2, batch_size=batch_size)
    ipca.fit(X)
    ipca_da = IncrementalPCA(
        n_components=2, batch_size=batch_size, svd_solver=svd_solver
    )
    ipca_da.fit(X_da)
    np.testing.assert_allclose(ipca.components_, ipca_da.components_, atol=1e-13)
    np.testing.assert_allclose(
        ipca.explained_variance_, ipca_da.explained_variance_, atol=1e-13
    )
    np.testing.assert_allclose(
        ipca.explained_variance_ratio_, ipca_da.explained_variance_ratio_, atol=1e-13
    )
    if svd_solver == "randomized":
        # Noise variance in the randomized solver is probabilistic.
        assert_almost_equal(ipca.noise_variance_, ipca_da.noise_variance_, decimal=1)
    else:
        np.testing.assert_allclose(
            ipca.noise_variance_, ipca_da.noise_variance_, atol=1e-13
        )
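# A minimal, self-contained sketch of the comparison made by the test above.
# Assumptions: `sd` in the test is sklearn.decomposition, and the dask-backed
# IncrementalPCA lives at dask_ml.decomposition (guarded below in case dask-ml
# is not installed). Signs of components can differ between fits, so the sketch
# compares absolute values rather than reproducing the test's exact asserts.
import dask.array as da
import numpy as np
from sklearn import datasets, decomposition

X = datasets.load_iris().data
X_da = da.from_array(X, chunks=(30, -1))  # chunk rows for out-of-core processing

ipca_sk = decomposition.IncrementalPCA(n_components=2, batch_size=30).fit(X)

try:
    from dask_ml.decomposition import IncrementalPCA  # assumed import path

    ipca_dask = IncrementalPCA(n_components=2, batch_size=30).fit(X_da)
    np.testing.assert_allclose(
        np.abs(ipca_sk.components_), np.abs(ipca_dask.components_), atol=1e-6
    )
except ImportError:
    pass  # dask-ml not available; the scikit-learn fit alone still runs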
def main():
    df = pd.read_csv("data.csv").dropna()
    inputs = list(zip(df['refcode'], df['smiles']))
    print(len(inputs))
    with futures.ProcessPoolExecutor(max_workers=7) as executor:
        a = [i for i in executor.map(get_fps, inputs) if i[0] is not None]
    print(len(a))
    fps, refcodes = zip(*a)
    X = np.array(fps)
    cutoff = 0.999
    sel = VarianceThreshold(threshold=(cutoff * (1 - cutoff)))
    X2 = sel.fit_transform(X)
    pca = decomposition.IncrementalPCA(n_components=3, batch_size=50000)
    X_pca = pca.fit_transform(X2)  # Dense data required
    print(pca.explained_variance_)
    x = X_pca[:, 0]
    y = X_pca[:, 1]
    out_df = pd.DataFrame({"x": x, "y": y, "refcode": refcodes})
    out_df.to_csv("pca_data_3d.csv")
def main_plot_pca(data, n_comps):
    print("training the PCA model")
    pca = decomposition.IncrementalPCA(n_components=n_comps, batch_size=400)
    data_trans = pca.fit_transform(data)
    print("pca model trained, variance:")
    print(pca.explained_variance_ratio_)
    return pca, data_trans

# plt.figure(1, figsize=(4, 3))
def pca_incremental(X, PC=2):
    print("PCA.....")
    print("Incremental PCA, using %3d principal components" % PC)
    scaler = preprocessing.StandardScaler().fit(X)
    X_centered = scaler.transform(X)
    X_pca = decomposition.IncrementalPCA(
        n_components=PC).fit_transform(X_centered)
    return X_pca
def my_pca(features, labels, num_pca):
    pca = decomposition.IncrementalPCA(batch_size=50)
    pca.n_components = num_pca
    pca.fit(features)
    X_reduced = pca.transform(features)
    print(len(np.unique(labels)))
    print('PCA is fitted')
    return pca, X_reduced
def __init__(self, type='auto', *args, **kwargs):
    if type == 'auto':
        self.model = decomposition.PCA(*args, **kwargs)
    elif type == 'incremental':
        self.model = decomposition.IncrementalPCA(*args, **kwargs)
    elif type == 'kernel':
        self.model = decomposition.KernelPCA(*args, **kwargs)
    else:
        raise ValueError("The type '%s' does not exist." % type)
def IncrementalPCA(self, source):
    min_max_scaler = preprocessing.MinMaxScaler()
    data_source = min_max_scaler.fit_transform(source)
    pca = decomposition.IncrementalPCA(n_components=2)
    result = {}
    result['data'] = pca.fit_transform(data_source)
    # Cumulative explained variance ratio of the retained components.
    params = 0.0
    for j in pca.explained_variance_ratio_:
        params = params + j
    result['params'] = params
    return result
def incrementalPCA(*data):
    X, y = data
    fig = plt.figure()
    # IncrementalPCA is intended for very large datasets: the data are loaded into memory in batches.
    ax1 = fig.add_subplot(1, 1, 1)
    incPCA = decomposition.IncrementalPCA(n_components=2, batch_size=10)
    incPCA.fit(X)
    newData1 = incPCA.transform(X)
    color = ['blue', 'black', 'red']
    for i in range(newData1.shape[0]):
        ax1.scatter(newData1[i, 0], newData1[i, 1], color=color[y[i]])
    ax1.set_title('3 classes after dimensionality reduction using IncrementalPCA')
    ax1.legend(loc='best')
    plt.show()
def PPA_batches(matrix, batch_size):
    # PCA to get the top components
    n = matrix.shape[1]
    pca = decomposition.IncrementalPCA(n_components=n, batch_size=batch_size)
    X_train = matrix - np.mean(matrix)
    X_fit = pca.fit_transform(X_train)
    U1 = pca.components_
    z = []

    # Removing projections on the top components
    for i, x in enumerate(X_train):
        for u in U1[0:7]:
            x = x - np.dot(u.transpose(), x) * u
        z.append(x)
    return np.asarray(z)
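# Hypothetical usage of PPA_batches above; the embedding matrix, its shape, and
# the batch size are made up for illustration. The idea is to post-process a set
# of vectors by stripping their projections onto the dominant principal directions.
import numpy as np

embeddings = np.random.randn(10_000, 300)       # e.g. 10k vectors of dimension 300
post_processed = PPA_batches(embeddings, batch_size=1_000)
print(post_processed.shape)                      # (10000, 300), top directions removed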
def test_train_incremental_pca(self):
    iris = datasets.load_iris()
    iris = iris.data[: BATCH_SIZE * BATCHES]
    pca = decomposition.IncrementalPCA(
        n_components=COMPONENTS, batch_size=BATCH_SIZE
    )
    pca.fit(iris)
    proj_ref = pca.transform(iris)
    batch = tf.placeholder(dtype=tf.float32, shape=[BATCH_SIZE, FEATURES])
    step = tf.train.get_or_create_global_step()
    train_op, _ = train_incremental_pca(step, batch, COMPONENTS)
    with self.test_session() as sess:
        sess.run(tf.initialize_all_variables())
        for iris_batch in np.split(iris, BATCHES):
            sess.run(train_op, feed_dict={batch: iris_batch})
        proj = sess.run(incremental_pca(iris, COMPONENTS))
        self.assertAllClose(proj, proj_ref)
def my_pca(features, labels, num_pca):
    # mean_feature = np.mean(features, axis=0)
    # normalized_features = features - mean_feature
    # pca = decomposition.PCA(svd_solver='randomized')
    pca = decomposition.IncrementalPCA(batch_size=50)
    pca.n_components = num_pca
    pca.fit(features)
    print('Remained variance is:')
    print(pca.explained_variance_ratio_)
    X_reduced = pca.transform(features)
    print('PCA is fitted')
    if num_pca == 2:
        plt.figure()
        for c, i in zip("rgbcmyk", [0, 1, 2, 3, 4, 5, 6]):
            plt.scatter(X_reduced[labels == i, 0], X_reduced[labels == i, 1], c=c)
        plt.title('PCA of features')
    return pca, X_reduced
def fit_pca_to_macs(loader, device, log_interval, whiten):
    """Extract MACs and use them to fit a sklearn PCA"""
    print('Fitting PCA to whiten MACs...')
    feature_transform = feature_transforms.MACStackTransform()
    pca = None
    for idx, features in enumerate(
            transform_features(feature_transform, loader, device)):
        features = features.cpu().numpy()
        if pca is None:
            # Lazy init
            num_features = features.shape[1]
            pca = decomposition.IncrementalPCA(n_components=num_features,
                                               whiten=whiten)
        pca.partial_fit(features)
        if idx % log_interval == 0:
            print('Fitted batch {}/{}'.format(idx, len(loader)))
    return pca
def test_ipca():
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    n_components = 2

    ipca = decomposition.IncrementalPCA(n_components=n_components, batch_size=10)
    X_ipca_org = ipca.fit_transform(X)
    mean = ipca.mean_
    components = ipca.components_
    X_ipca = X - mean
    X_ipca = np.dot(X_ipca, components.T)

    pca = decomposition.PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)

    colors = ['navy', 'turquoise', 'darkorange']
    for X_transformed, title in [(X_ipca, "Incremental PCA"), (X_pca, "PCA")]:
        plt.figure(figsize=(8, 8))
        for color, i, target_name in zip(colors, [0, 1, 2], iris.target_names):
            plt.scatter(X_transformed[y == i, 0], X_transformed[y == i, 1],
                        color=color, lw=2, label=target_name)
        if "Incremental" in title:
            err = np.abs(np.abs(X_pca) - np.abs(X_ipca)).mean()
            plt.title(title + " of iris dataset\nMean absolute unsigned error "
                      "%.6f" % err)
        else:
            plt.title(title + " of iris dataset")
        plt.legend(loc="best", shadow=False, scatterpoints=1)
        plt.axis([-4, 4, -1.5, 1.5])
    plt.show()
def decompose(embeddings, n_components, batch_size=None):
    """Apply PCA to the input embeddings. If batch_size is not None, use incremental PCA."""
    embeddings = pd.DataFrame(embeddings).T.reset_index()
    embeddings = embeddings.set_index('index')
    if batch_size is None:
        PCA = decomposition.PCA(n_components=n_components)
    else:
        PCA = decomposition.IncrementalPCA(n_components=n_components,
                                           batch_size=batch_size)
    X = PCA.fit_transform(embeddings)
    output = {}
    for i, label in enumerate(embeddings.index):
        output[label] = X[i]
    return output
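# Hypothetical usage of decompose above; the embedding dictionary is made up for
# illustration. Passing a batch_size switches the reduction from PCA to
# IncrementalPCA without changing the calling code.
import numpy as np

embeddings = {f"token_{i}": np.random.randn(64) for i in range(500)}
reduced = decompose(embeddings, n_components=2, batch_size=100)
print(reduced["token_0"].shape)   # (2,)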
def row_embeddings(df, embeddings, prefix, n_components=2, batch_size=None):
    """Map embeddings onto the input dataframe, then apply PCA to the resulting
    per-row representation.
    """
    e = {}
    for key, value in embeddings.items():
        e[key] = value.tolist()
    X = []
    for column in df.columns:
        if column in prefix:
            embedding = (prefix[column] + df[column].astype(str)).map(e)
        else:
            embedding = df[column].astype(str).map(e)
        X.append(pd.DataFrame(dict(zip(df.index, embedding))).T)
    X = pd.concat(X, axis='columns')
    if batch_size is None:
        PCA = decomposition.PCA(n_components=n_components)
    else:
        PCA = decomposition.IncrementalPCA(n_components=n_components,
                                           batch_size=batch_size)
    X = PCA.fit_transform(X)
    X = pd.DataFrame(X)
    X.columns = [f'dim_{i}' for i in range(len(X.columns))]
    X.index = df.index
    return X
def validate(val_loader, net, criterion, epoch, num_known_classes,
             num_unknown_classes, hidden, train_args):

    model_list = []

    # Setting network for evaluation mode.
    net.eval()

    count = 0
    n_patches = 0
    if dataset_name == 'Vaihingen':
        n_patches = 907    # Vaihingen.
    elif dataset_name == 'Potsdam':
        n_patches = 12393  # 8993 # Potsdam.
    np.random.seed(12345)

    with torch.no_grad():

        ipca_training_time = [0.0 for c in range(num_known_classes)]

        # Creating output directory.
        check_mkdir(os.path.join(outp_path, exp_name, 'epoch_' + str(epoch)))

        for c in range(num_known_classes):
            # Computing PCA models from features.
            model = decomposition.IncrementalPCA(n_components=args['n_components'])
            model_list.append(model)

        for i, data in enumerate(val_loader):

            print('Validation Batch %d/%d' % (i + 1, len(val_loader)))
            sys.stdout.flush()

            # Obtaining images, labels and paths for batch.
            inps_batch, labs_batch, true_batch, img_name = data

            inps_batch = inps_batch.squeeze()
            labs_batch = labs_batch.squeeze()
            true_batch = true_batch.squeeze()

            # Iterating over patches inside batch.
            for j in range(inps_batch.size(0)):

                print(' Validation MiniBatch %d/%d' % (j + 1, inps_batch.size(0)))
                sys.stdout.flush()

                for k in range(inps_batch.size(1)):

                    inps = inps_batch[j, k].unsqueeze(0)
                    labs = labs_batch[j, k].unsqueeze(0)
                    true = true_batch[j, k].unsqueeze(0)

                    # Casting tensors to cuda.
                    inps, labs, true = (inps.cuda(args['device']),
                                        labs.cuda(args['device']),
                                        true.cuda(args['device']))

                    # Casting to cuda variables.
                    inps = Variable(inps).cuda(args['device'])
                    labs = Variable(labs).cuda(args['device'])
                    true = Variable(true).cuda(args['device'])

                    # Forwarding.
                    if conv_name == 'fcnwideresnet50':
                        outs, classif1, fv2 = net(inps, feat=True)
                    elif conv_name == 'fcndensenet121':
                        outs, classif1, fv2 = net(inps, feat=True)

                    # Computing loss.
                    soft_outs = F.softmax(outs, dim=1)

                    # Obtaining predictions.
                    prds = soft_outs.data.max(1)[1]

                    if conv_name == 'fcnwideresnet50':
                        feat_flat = torch.cat([outs.squeeze(), classif1.squeeze(), fv2.squeeze()], 0)
                    elif conv_name == 'fcndensenet121':
                        feat_flat = torch.cat([outs.squeeze(), classif1.squeeze(), fv2.squeeze()], 0)

                    feat_flat = feat_flat.permute(1, 2, 0).contiguous().view(
                        feat_flat.size(1) * feat_flat.size(2), feat_flat.size(0)).cpu().numpy()
                    prds_flat = prds.cpu().numpy().ravel()
                    true_flat = true.cpu().numpy().ravel()

                    for c in range(num_known_classes):
                        tic = time.time()
                        model_list[c] = partial_fit_ipca_model(
                            model_list[c], feat_flat, true_flat, prds_flat, c)
                        toc = time.time()
                        ipca_training_time[c] += (toc - tic)

        for c in range(num_known_classes):
            print('Time spent fitting model %d: %.2f' % (c, ipca_training_time[c]))

    model_full = {'generative': model_list}

    # Saving model on disk.
    model_path = os.path.join(outp_path, exp_name, 'model_pca.pkl')
    print('Saving model at "%s"...' % (model_path))
    sys.stdout.flush()
    joblib.dump(model_full, model_path)

    return model_full
f_pca = f_pca.get_data()
f_pca = f_all[f_pca == 1]
f_pca = f_pca.reshape(-1)
f_part = f_all[label_img == 1]
# f = np.array(f)
# d_std = StandardScaler().fit_transform(d)
print(f_part.shape)
X[c, idx, :f_part.shape[0]] = (f_part - f_part.min()) / (f_part.max() - f_part.min())
# f_pca_std = StandardScaler().fit_transform(f_pca)
X_pca[c, idx, :f_pca.shape[0]] = (f_pca - f_pca.min()) / (f_pca.max() - f_pca.min())

### fit patient data ####
# PCA on the indicator features
print('pca starting')
# Save the PCA model
zhibiao_pca = decomposition.IncrementalPCA(n_components=3)
zhibiao_pca.fit(X_pca.reshape(classnum, -1).T)
# joblib.dump(zhibiao, os.getcwd()+'/modelsave/zhibiao_pca_5000.pkl')
# zhibiao_pca = joblib.load(os.getcwd()+'/modelsave/zhibiao_pca_5000.pkl')
X_outpca = zhibiao_pca.transform(X.reshape(classnum, -1).T)  # midbrain or whole brain

###############################################################################
# Build the X and Y arrays for learning
X_learning = np.zeros([all_num, data_shape * 2])  # (n_samples, n_features)
for i in range(all_num):
    X_learning[i, :data_shape] = X_outpca[i * data_shape:(i + 1) * data_shape, 0]
    X_learning[i, data_shape:] = X_outpca[i * data_shape:(i + 1) * data_shape, 1]
Y_learning = np.zeros(all_num, int)
Y_learning[data_num:] = np.ones(pat_num, int)
def _eval_search_params(params_builder):
    search_params = {}

    for p in params_builder['param_set']:
        search_list = p['sp_list'].strip()
        if search_list == '':
            continue

        param_name = p['sp_name']
        if param_name.lower().endswith(NON_SEARCHABLE):
            print("Warning: `%s` is not eligible for search and was "
                  "omitted!" % param_name)
            continue

        if not search_list.startswith(':'):
            safe_eval = SafeEval(load_scipy=True, load_numpy=True)
            ev = safe_eval(search_list)
            search_params[param_name] = ev
        else:
            # A leading `:` before the search list asks for estimator evaluation
            safe_eval_es = SafeEval(load_estimators=True)
            search_list = search_list[1:].strip()
            # TODO maybe add a regular expression check
            ev = safe_eval_es(search_list)

            preprocessings = (
                preprocessing.StandardScaler(),
                preprocessing.Binarizer(),
                preprocessing.MaxAbsScaler(),
                preprocessing.Normalizer(),
                preprocessing.MinMaxScaler(),
                preprocessing.PolynomialFeatures(),
                preprocessing.RobustScaler(),
                feature_selection.SelectKBest(),
                feature_selection.GenericUnivariateSelect(),
                feature_selection.SelectPercentile(),
                feature_selection.SelectFpr(),
                feature_selection.SelectFdr(),
                feature_selection.SelectFwe(),
                feature_selection.VarianceThreshold(),
                decomposition.FactorAnalysis(random_state=0),
                decomposition.FastICA(random_state=0),
                decomposition.IncrementalPCA(),
                decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                decomposition.LatentDirichletAllocation(random_state=0, n_jobs=N_JOBS),
                decomposition.MiniBatchDictionaryLearning(random_state=0, n_jobs=N_JOBS),
                decomposition.MiniBatchSparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.NMF(random_state=0),
                decomposition.PCA(random_state=0),
                decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.TruncatedSVD(random_state=0),
                kernel_approximation.Nystroem(random_state=0),
                kernel_approximation.RBFSampler(random_state=0),
                kernel_approximation.AdditiveChi2Sampler(),
                kernel_approximation.SkewedChi2Sampler(random_state=0),
                cluster.FeatureAgglomeration(),
                skrebate.ReliefF(n_jobs=N_JOBS),
                skrebate.SURF(n_jobs=N_JOBS),
                skrebate.SURFstar(n_jobs=N_JOBS),
                skrebate.MultiSURF(n_jobs=N_JOBS),
                skrebate.MultiSURFstar(n_jobs=N_JOBS),
                imblearn.under_sampling.ClusterCentroids(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.CondensedNearestNeighbour(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.EditedNearestNeighbours(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.RepeatedEditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.InstanceHardnessThreshold(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NearMiss(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NeighbourhoodCleaningRule(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.OneSidedSelection(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.RandomUnderSampler(random_state=0),
                imblearn.under_sampling.TomekLinks(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.RandomOverSampler(random_state=0),
                imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SVMSMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.BorderlineSMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SMOTENC(categorical_features=[], random_state=0, n_jobs=N_JOBS),
                imblearn.combine.SMOTEENN(random_state=0),
                imblearn.combine.SMOTETomek(random_state=0))

            newlist = []
            for obj in ev:
                if obj is None:
                    newlist.append(None)
                elif obj == 'all_0':
                    newlist.extend(preprocessings[0:35])
                elif obj == 'sk_prep_all':  # no KernelCenterer()
                    newlist.extend(preprocessings[0:7])
                elif obj == 'fs_all':
                    newlist.extend(preprocessings[7:14])
                elif obj == 'decomp_all':
                    newlist.extend(preprocessings[14:25])
                elif obj == 'k_appr_all':
                    newlist.extend(preprocessings[25:29])
                elif obj == 'reb_all':
                    newlist.extend(preprocessings[30:35])
                elif obj == 'imb_all':
                    newlist.extend(preprocessings[35:54])
                elif type(obj) is int and -1 < obj < len(preprocessings):
                    newlist.append(preprocessings[obj])
                elif hasattr(obj, 'get_params'):  # user-uploaded object
                    if 'n_jobs' in obj.get_params():
                        newlist.append(obj.set_params(n_jobs=N_JOBS))
                    else:
                        newlist.append(obj)
                else:
                    sys.exit("Unsupported estimator type: %r" % (obj))

            search_params[param_name] = newlist

    return search_params
# print(df_na_analyse)
fill_na(df)
numerize(df)
# print(len(df))
df = df.dropna()
df = shuffle(df).reset_index()
# print(len(df))

# Get the inputs and targets
X, y = get_x_y(df, u'VARIABLE_CIBLE')

# Principal component analysis
# X -= X.mean()
pca = decomposition.IncrementalPCA(n_components=25)
pca.fit(X)
X = pca.transform(X)
X = preprocessing.scale(X)
# print(X.shape)

# Create the train, validation and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y.values, test_size=.4)
X_test, X_valid, y_test, y_valid = train_test_split(X_test, y_test, test_size=.5)
# print(X_train.shape)
# print(X_valid.shape)
# print(X_test.shape)

# Iterate over classifiers
for name, clf in zip(names, classifiers):
def streaming_PCA(samples, n_components=2, batch_size=100):
    ipca = decomposition.IncrementalPCA(n_components=n_components,
                                        batch_size=batch_size)
    tz.pipe(samples,                    # iterator of sample vectors
            cur.partition(batch_size),  # group into tuples of `batch_size` samples
            cur.map(np.array),          # convert each batch to an ndarray
            cur.map(ipca.partial_fit),  # update the PCA model with each batch
            tz.last)                    # force evaluation of the lazy pipeline
    return ipca
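# Hypothetical usage of streaming_PCA above: fit an IncrementalPCA from a
# generator of samples without materialising the full dataset. Assumes the
# enclosing module imports `toolz as tz` and `toolz.curried as cur`, as the
# function body does; the sample stream itself is made up for illustration.
import numpy as np

def sample_stream(n_samples=10_000, n_features=50, seed=0):
    rng = np.random.default_rng(seed)
    for _ in range(n_samples):
        yield rng.normal(size=n_features)

ipca = streaming_PCA(sample_stream(), n_components=2, batch_size=100)
print(ipca.explained_variance_ratio_)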
iris = myML.DataPre.load_datasets("iris")  # use the iris dataset bundled with scikit-learn
X, y = iris.data, iris.target

# Test basic PCA usage (note: this PCA computes the SVD via scipy.linalg, so it
# cannot be applied to sparse matrices and does not scale to very large datasets).
pca = decomposition.PCA(n_components=None)  # use the default n_components
pca.fit(X)
print('explained variance ratio : %s' % str(pca.explained_variance_ratio_))

# Plot the samples after PCA reduction to two dimensions
myML.DimReduce.plotparam_decomposition(X, y, "decomposition.PCA()", n_components=[2])

# IncrementalPCA for dimensionality reduction on very large datasets
pca = decomposition.IncrementalPCA(n_components=None)  # use the default n_components
pca.fit(X)
print('explained variance ratio : %s' % str(pca.explained_variance_ratio_))

# --- Kernelized linear dimensionality reduction: KernelPCA
from sklearn import decomposition

iris = myML.DataPre.load_datasets("iris")  # use the iris dataset bundled with scikit-learn
X, y = iris.data, iris.target

# Test KernelPCA usage
kernels = ['linear', 'poly', 'rbf']
for kernel in kernels:
    kpca = decomposition.KernelPCA(n_components=None, kernel=kernel).fit(X)  # test each kernel in turn
    print('kernel=%s --> lambdas: %s' % (kernel, kpca.lambdas_))
def Bassan(wavenumbers, App, m0, n_components=8, iterations=1, w_regions=None):
    """
    Correct scattered spectra using Bassan's algorithm.
    :param wavenumbers: array of wavenumbers
    :param App: apparent spectrum
    :param m0: reference spectrum
    :param n_components: number of principal components to be calculated for the extinction matrix
    :param iterations: number of iterations of the algorithm
    :param w_regions: the regions to be taken into account for the fitting
    :return: corrected apparent spectrum
    """
    # Copy the input data
    wn = np.copy(wavenumbers)
    A_app = np.copy(App)
    m_0 = np.copy(m0)
    ii = np.argsort(wn)  # Sort the wavenumbers
    # Apply the sorting to the input variables
    wn = wn[ii]
    A_app = A_app[ii]
    m_0 = m_0[ii]

    # Define the weighted regions:
    if w_regions is not None:
        # Correct the reference spectrum as in the Kohler method
        m_0 = correct_reference(np.copy(m_0), wn, a, d, w_regions)
        w_indexes = []
        # Get the indexes of the regions to be taken into account
        for pair in w_regions:
            min_pair = min(pair)
            max_pair = max(pair)
            ii1 = find_nearest_number_index(wn, min_pair)
            ii2 = find_nearest_number_index(wn, max_pair)
            w_indexes.extend(np.arange(ii1, ii2))
        # Take the weighted regions of wavenumbers, apparent and reference spectrum
        wn_w = np.copy(wn[w_indexes])
        A_app_w = np.copy(A_app[w_indexes])
        m_0_w = np.copy(m_0[w_indexes])

    n_loadings = 10  # Number of values to be computed for each parameter (a, b, d)
    a = np.linspace(1.1, 1.5, n_loadings)  # Average refractive index
    d = np.linspace(2.0, 8.0, n_loadings) * 1.0e-4  # Cell diameter
    Q = np.zeros((n_loadings**3, len(wn)))  # Initialize the extinction matrix

    # Initialize the reference spectrum, which will be updated after each iteration
    m_n = np.copy(m_0)
    for iteration in range(iterations):
        # Compute the scaled real part of the refractive index by Kramers-Kronig transform:
        nkk = -1.0 * np.imag(hilbert(m_n))

        # Build the extinction matrix
        n_row = 0
        for i in range(n_loadings):
            b = np.linspace(0.0, a[i] - 1.0, 10)  # Range of amplification factors of nkk
            for j in range(n_loadings):
                for k in range(n_loadings):
                    n = a[i] + b[j] * nkk  # Compute the real refractive index
                    alpha = 2.0 * np.pi * d[k] * (n - 1.0)
                    rho = alpha * wn
                    # Compute the extinction coefficients for each combination of a, b and d:
                    Q[n_row] = 2.0 - np.divide(4.0, rho) * np.sin(rho) + \
                        np.divide(4.0, rho ** 2.0) * (1.0 - np.cos(rho))
                    n_row += 1

        # Orthogonalization of the extinction matrix with respect to the reference spectrum:
        for i in range(n_loadings**3):
            Q[i] -= np.dot(Q[i], m_0) / np.linalg.norm(m_0)**2.0 * m_0

        # Perform PCA of the extinction matrix
        pca = skl_decomposition.IncrementalPCA(n_components=n_components)
        pca.fit(Q)
        p_i = pca.components_  # Get the principal components

        if w_regions is None:  # If all regions have to be taken into account:
            def min_fun(x):
                """
                Function to be minimized for the fitting
                :param x: fitting parameters (offset, baseline, reference's linear factor, PCA scores)
                :return: squared norm of the difference between the apparent spectrum and its fitting
                """
                cc, mm, hh, g = x[0], x[1], x[2], x[3:]
                return np.linalg.norm(A_app - apparent_spectrum_fit_function_Bassan(
                    wn, m_0, p_i, cc, mm, hh, g))**2.0
        else:  # If only the specified regions have to be taken into account:
            # Take the indexes of the specified regions
            w_indexes = []
            for pair in w_regions:
                min_pair = min(pair)
                max_pair = max(pair)
                ii1 = find_nearest_number_index(wn, min_pair)
                ii2 = find_nearest_number_index(wn, max_pair)
                w_indexes.extend(np.arange(ii1, ii2))
            # Get the principal components of the extinction matrix at the specified regions
            p_i_w = np.copy(p_i[:, w_indexes])

            def min_fun(x):
                """
                Function to be minimized for the fitting
                :param x: fitting parameters (offset, baseline, reference's linear factor, PCA scores)
                :return: squared norm of the difference between the apparent spectrum and its fitting
                """
                cc, mm, hh, g = x[0], x[1], x[2], x[3:]
                return np.linalg.norm(A_app_w - apparent_spectrum_fit_function_Bassan(
                    wn_w, m_0_w, p_i_w, cc, mm, hh, g))**2.0

        p0 = np.append([1.0, 0.0005, 0.9], np.ones(n_components))  # Initial guess for the fitting
        res = scipy.optimize.minimize(min_fun, p0, method='Powell')  # Perform the fitting
        # print(res)  # Print the result of the minimization
        # assert(res.success)  # Raise AssertionError if res.success == False

        # Take the fitted parameters
        c, m, h, g_i = res.x[0], res.x[1], res.x[2], res.x[3:]
        Z_corr = (A_app - c - m * wn - np.dot(g_i, p_i)) / h  # Apply the correction

        # Take the corrected spectrum as the reference for the next iteration
        m_n = np.copy(Z_corr)

    # Return the corrected spectrum in inverted order for compatibility
    return np.copy(Z_corr[::-1])
def Konevskikh(wavenumbers, App, m0, n_components=8, iterations=1):
    """
    Correct scattered spectra using Konevskikh's algorithm
    :param wavenumbers: array of wavenumbers
    :param App: apparent spectrum
    :param m0: reference spectrum
    :param n_components: number of components
    :param iterations: number of iterations
    :return: corrected spectrum
    """
    # Copy the input variables
    wn = np.copy(wavenumbers)
    A_app = np.copy(App)
    m_0 = np.copy(m0)
    ii = np.argsort(wn)  # Sort the wavenumbers
    wn = wn[ii]
    A_app = A_app[ii]
    m_0 = m_0[ii]

    # Initialize the parameter ranges:
    alpha_0, gamma = np.array([
        np.logspace(np.log10(0.1), np.log10(2.2), num=10) * 4.0e-4 * np.pi,
        np.logspace(np.log10(0.05e4), np.log10(0.05e5), num=10) * 1.0e-2
    ])
    p0 = np.ones(2 + n_components)
    Q_ext = np.zeros((len(alpha_0) * len(gamma), len(wn)))  # Initialize the extinction matrix

    m_n = np.copy(m_0)  # Copy the reference spectrum
    for n_iteration in range(iterations):
        ns_im = np.divide(m_n, wn)  # Compute the imaginary part of the refractive index
        # Compute the real part of the refractive index by Kramers-Kronig transform
        ns_re = -1.0 * np.imag(hilbert(ns_im))

        # Compute the extinction matrix
        n_index = 0
        for i in range(len(alpha_0)):
            for j in range(len(gamma)):
                for k in range(len(A_app)):
                    rho = alpha_0[i] * (1.0 + gamma[j] * ns_re[k]) * wn[k]
                    beta = np.arctan(ns_im[k] / (1.0 / gamma[j] + ns_re[k]))
                    Q_ext[n_index][k] = 2.0 - 4.0 * np.exp(-1.0 * rho * np.tan(beta)) * (np.cos(beta) / rho) * \
                        np.sin(rho - beta) - 4.0 * np.exp(-1.0 * rho * np.tan(beta)) * (np.cos(beta) / rho) ** 2.0 * \
                        np.cos(rho - 2.0 * beta) + 4.0 * (np.cos(beta) / rho) ** 2.0 * np.cos(2.0 * beta)
                    # TODO: rewrite this in a simpler way
                n_index += 1

        # Orthogonalize the extinction matrix with respect to the reference:
        for i in range(n_index):
            Q_ext[i][:] -= np.dot(Q_ext[i][:], m_0) / np.linalg.norm(m_0)**2.0 * m_0
        # Q_ext = GramSchmidt(np.copy(Q_ext))  # Apply Gram-Schmidt orthogonalization to Q_ext (don't uncomment this)

        # Compute PCA of the extinction matrix
        pca = skl_decomposition.IncrementalPCA(n_components=n_components)
        pca.fit(Q_ext)
        p_i = pca.components_  # Get the principal components

        def min_fun(x):
            bb, cc, g = x[0], x[1], x[2:]
            return np.linalg.norm(
                A_app - apparent_spectrum_fit_function(wn, m_0, p_i, bb, cc, g))**2.0

        res = scipy.optimize.minimize(min_fun, p0, method='Powell')
        # print(res)  # Print the minimization results
        # assert(res.success)  # Raise AssertionError if res.success == False

        b, c, g_i = res.x[0], res.x[1], res.x[2:]  # Get the fitted parameters
        Z_corr = (A_app - c - np.dot(g_i, p_i)) / b  # Apply the correction
        m_n = np.copy(Z_corr)  # Update the reference with the correction

    return Z_corr[::-1]  # Return the corrected spectrum
rp = random_projection.SparseRandomProjection(n_components=2, random_state=42)
X_projected = rp.fit_transform(X)
plot_embedding(X_projected, "Random Projection of the digits")

# ----------------------------------------------------------------------
# Projection onto the first 2 principal components
# PCA subtracts the mean, unlike SVD. If your features are least sensitive (informative)
# towards the mean of the distribution, then it makes sense to subtract the mean.
# If the features are most sensitive towards the high values, then subtracting the mean does not make sense.
print("Computing PCA projection")
t0 = time()
X_svd = decomposition.TruncatedSVD(n_components=2).fit_transform(X)
X_pca = decomposition.PCA(n_components=2).fit_transform(X)    # centers but does not scale the data before applying SVD
X_ipca = decomposition.IncrementalPCA(n_components=2).fit_transform(X)  # for large datasets that do not fit in memory
plot_embedding(X_svd, "Singular value decomposition projection of the digits")
plot_embedding(X_pca, "Principal Components projection of the digits")
plot_embedding(X_ipca, "Incremental Principal Components projection of the digits")

# ----------------------------------------------------------------------
# Projection onto the first 2 linear discriminant components
print("Computing Linear Discriminant Analysis projection")
X2 = X.copy()
X2.flat[::X.shape[1] + 1] += 0.01  # Make X invertible (probably not necessary)
t0 = time()
X_lda = discriminant_analysis.LinearDiscriminantAnalysis(
    n_components=2).fit_transform(X2, y)
def normaliseDataset(path_to_raw_trainingset, path_to_save_loc, feature_type, chunksize):
    """
    PCA + Normalise a provided dataset using pandas and SKLearn normalisation
    for rapid processing that scales linearly with input size. Pickles the
    normalisation object for future external test sets.

    --args
    collection_iterable (iterable): pandas DF object containing an index and column names
    feature_type (str): describes which feature type is being processed

    --returns
    normalised_collection (pandas dataframe): normalised training set containing an index and column names
    labels (list of series): vectors with target labels
    collection.columns (list of strings): list of column names, including labels
    collection_features.index (list of strings): list of index names, i.e. perturbations
    """
    scaler = preprocessing.StandardScaler()

    if feature_type == "1DCNN":
        # all parameters here were found manually:
        n_components = 200
        # print("This function takes ~10s to complete on 15K datapoints.\n")
    elif feature_type == "MOLPROPS":
        n_components = 750
        # print("This function takes ~10m to complete on 15K datapoints.\n")
    elif feature_type == "PFP":
        n_components = 200
        # print("This function takes ~10s to complete on 15K datapoints.\n")

    pca = decomposition.IncrementalPCA(n_components=n_components)

    ###########################################################################################
    # we need to perform incremental standardization because of the large dataset:
    print("Making first pass (partial fitting)..")
    collection_iterable = readHDF5Iterable(
        path_to_raw_trainingset, chunksize=chunksize)

    for collection in collection_iterable:
        # omit labels from normalisation:
        labels, collection_features = dropLabels(collection, feature_type)
        # fit the normalisation:
        scaler.partial_fit(collection_features)

    ###########################################################################################
    # Now with fully updated means + variances, make a second pass through the iterable and transform:
    print("Making second pass (partial transform + partial PCA fit)..")
    collection_iterable = readHDF5Iterable(
        path_to_raw_trainingset, chunksize=chunksize)

    for collection in collection_iterable:
        # omit labels from normalisation:
        labels, collection_features = dropLabels(collection, feature_type)
        # transform:
        normalised_collection = pd.DataFrame(scaler.transform(collection_features))
        # now fit an incremental PCA to this chunk:
        pca.partial_fit(normalised_collection)

    # # uncomment to figure out ~ how many dims to retain for 95% VE.
    # # can't use n_components=0.95 in our case because we process in chunks :(
    # ve_ratios = pca.explained_variance_ratio_
    # ve_counter = 0
    # ve_cumulative = 0
    # for ve in ve_ratios:
    #     if not ve_cumulative >= 0.95:
    #         ve_cumulative += ve
    #         ve_counter += 1
    # print("Keep", ve_counter, "to retain", ve_cumulative*100, "of variance explained.")

    ###########################################################################################
    # now with the completed PCA object; go over the iterable one last time;
    # apply normalisation and transform by PCA and save to individual files:
    print("Making third pass (normalise and PCA transform)..")
    collection_iterable = readHDF5Iterable(
        path_to_raw_trainingset, chunksize=chunksize)

    if os.path.exists(path_to_save_loc + feature_type + "/data.h5"):
        os.remove(path_to_save_loc + feature_type + "/data.h5")
    store = pd.HDFStore(path_to_save_loc + feature_type + "/data.h5")

    for collection in collection_iterable:
        # this is our final transform; save perturbation names:
        perturbation_indeces = collection.index

        # omit labels from normalisation:
        labels, collection_features = dropLabels(collection, feature_type)

        # normalise transform:
        normalised_collection = pd.DataFrame(scaler.transform(collection_features))

        # PCA transform to finish preprocessing:
        processed_collection = pca.transform(normalised_collection)

        # prettify the np matrix back into a DF and append to HDF:
        num_PCA_dims = len(processed_collection[0])
        PCA_column_headers = ["PC" + str(dim) for dim in range(num_PCA_dims)]

        pca_data_df = pd.DataFrame(
            processed_collection,
            index=perturbation_indeces,
            columns=PCA_column_headers
        )
        complete_preprocessed_df = pd.concat([pca_data_df, labels], axis=1, sort=False)

        store.append(
            path_to_save_loc + feature_type + "/data.h5",
            complete_preprocessed_df,
            format="table",
            index=False,
        )
    store.close()

    # finally, save both the StandardScaler and PCA object for transforming test datasets:
    pickle.dump(scaler, open(path_to_save_loc + "PICKLES/" + feature_type + "_scaler.pkl", "wb"))
    pickle.dump(pca, open(path_to_save_loc + "PICKLES/" + feature_type + "_pca.pkl", "wb"))
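# A minimal, self-contained sketch of the chunked pattern used in normaliseDataset
# above; file handling is dropped and the data, chunk size and component count are
# made up for illustration. Pass 1 partial_fit()s the scaler, pass 2 transforms each
# chunk and partial_fit()s the IncrementalPCA, pass 3 applies both fitted transforms.
import numpy as np
from sklearn import decomposition, preprocessing

def chunks(X, size):
    for start in range(0, len(X), size):
        yield X[start:start + size]

X = np.random.randn(5_000, 100)             # stand-in for a dataset too large for memory
scaler = preprocessing.StandardScaler()
ipca = decomposition.IncrementalPCA(n_components=10)

for chunk in chunks(X, 500):                # pass 1: accumulate means/variances
    scaler.partial_fit(chunk)
for chunk in chunks(X, 500):                # pass 2: scale each chunk, update the PCA
    ipca.partial_fit(scaler.transform(chunk))
transformed = np.vstack(                    # pass 3: apply both transforms chunk-wise
    [ipca.transform(scaler.transform(c)) for c in chunks(X, 500)])
print(transformed.shape)                    # (5000, 10)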
def correct_reference(m, wn, a, d, w_regions):
    """
    Correct reference spectrum as in Kohler's method
    :param m: reference spectrum
    :param wn: wavenumbers
    :param a: Average refractive index range
    :param d: Cell diameter range
    :param w_regions: Weighted regions
    :return: corrected reference spectrum
    """
    n_components = 6  # Set the number of principal components

    # Copy the input variables
    m = np.copy(m)
    wn = np.copy(wn)

    # Compute the alpha range:
    alpha = 4.0 * np.pi * 0.5 * np.linspace(
        np.min(d) * (np.min(a) - 1.0), np.max(d) * (np.max(a) - 1.0), 150)

    p0 = np.ones(1 + n_components)  # Initial guess for the fitting

    # Compute the extinction matrix
    Q_ext = np.zeros((np.size(alpha), np.size(wn)))
    for i in range(np.size(alpha)):
        Q_ext[i][:] = Q_ext_kohler(wn, alpha=alpha[i])

    # Perform PCA on Q_ext
    pca = skl_decomposition.IncrementalPCA(n_components=n_components)
    pca.fit(Q_ext)
    p_i = pca.components_  # Get the principal components of the extinction matrix

    # Get the weighted regions of the wavenumbers, the reference spectrum and the principal components
    w_indexes = []
    for pair in w_regions:
        min_pair = min(pair)
        max_pair = max(pair)
        ii1 = find_nearest_number_index(wn, min_pair)
        ii2 = find_nearest_number_index(wn, max_pair)
        w_indexes.extend(np.arange(ii1, ii2))
    wn_w = np.copy(wn[w_indexes])
    m_w = np.copy(m[w_indexes])
    p_i_w = np.copy(p_i[:, w_indexes])

    def min_fun(x):
        """
        Function to be minimized for the fitting
        :param x: offset and PCA scores
        :return: difference between the spectrum and its fitting
        """
        cc, g = x[0], x[1:]
        # Return the squared norm of the difference between the reference spectrum and its fitting:
        return np.linalg.norm(
            m_w - reference_spectrum_fit_function(wn_w, p_i_w, cc, g))**2.0

    # Perform the minimization using the Powell method
    res = scipy.optimize.minimize(min_fun, p0, bounds=None, method='Powell')
    c, g_i = res.x[0], res.x[1:]  # Obtain the fitted parameters

    # Apply the correction:
    m_corr = np.zeros(np.shape(m))
    for i in range(len(wn)):
        sum1 = 0
        for j in range(len(g_i)):
            sum1 += g_i[j] * p_i[j][i]
        m_corr[i] = (m[i] - c - sum1)

    return m_corr  # Return the corrected spectrum
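# Q_ext_kohler() is called in several routines here but not defined in this
# collection. A plausible minimal sketch, assuming it uses the same van de Hulst
# extinction approximation that the Bassan routine above writes out inline
# (rho = alpha * wavenumber); treat it as an illustration, not the original helper.
def Q_ext_kohler(wn, alpha):
    """Approximate extinction efficiency for rho = alpha * wn (assumed form)."""
    rho = alpha * wn
    return 2.0 - (4.0 / rho) * np.sin(rho) + (4.0 / rho ** 2.0) * (1.0 - np.cos(rho))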
print(meta_df['pert_id'].nunique())
print(meta_df['perturbation'].nunique())

# meta_df.to_csv('data/metadata.tsv', sep='\t')
meta_df.to_csv('data/metadata-sig-only.tsv', sep='\t')

mask_shared = np.in1d(sig_ids, meta_df.index.tolist())
mat = mat[mask_shared, :]
# mat = pairwise_distances(mat, metric='cosine')

# z-score norm
print(mat.shape)

batch_size = 400
scl = preprocessing.StandardScaler()
ipca = decomposition.IncrementalPCA(n_components=3, batch_size=None)

n_batchs = int(math.ceil(mat.shape[0] / float(batch_size)))

# for i in range(n_batchs):
#     start_idx = i * batch_size
#     end_idx = (i+1) * batch_size
#     scl.partial_fit(mat[start_idx:end_idx])

for i in range(n_batchs):
    start_idx = i * batch_size
    end_idx = (i + 1) * batch_size
    # scaled_sub_mat = scl.transform(mat[start_idx:end_idx])
    # ipca.partial_fit(scaled_sub_mat)
    ipca.partial_fit(mat[start_idx:end_idx])
def find_components(input_data, n_components=2, method='pca', **kwargs):
    '''
    Extract components from an array of data

    input_data: np.array
        The input data matrix
    n_comps : int
        The number of components to extract
    method : str
        The dimensionality reduction technique to use
    kwargs : optional arguments to pass to the construction of the estimator

    Note: this function is basically a wrapper for a bunch of the standard
    estimators found in sklearn. Please refer to the sklearn documentation
    for the keyword arguments to pass to the various estimators.
    http://scikit-learn.org/stable/modules/decomposition.html

    Examples
    --------
    >>> components = find_components(data, method='k-means', tol=1e-3, batch_size=100, max_iter=50)
    >>> plot(components.T)

    >>> components = find_components(copy(all_traces.T), method='incremental pca', batch_size=100)
    >>> plot(components.T)

    DEV: Automatically compute the batch sizes for incremental and minibatch PCA
    "The computational overhead of each SVD is O(batch_size * n_features ** 2),
    but only 2 * batch_size samples remain in memory at a time. There will be
    n_samples / batch_size SVD computations to get the principal components,
    versus 1 large SVD of complexity O(n_samples * n_features ** 2) for PCA."

    Issues
    ------
    LDA returns components that are not orthogonal; need to apply Gram-Schmidt
    Kernel PCA is currently broken, also LDA is erratic
    http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.KernelPCA.html#sklearn.decomposition.KernelPCA
    '''
    if not has_sklearn:
        warnings.warn(
            'scikit-learn not found. This function will not work correctly.')
        return None

    n_samples, n_features = input_data.shape
    rng = RandomState(0)

    if n_samples < n_features:
        warnings.warn(
            'More features than samples; assuming input data matrix was transposed.'
        )
        input_data = input_data.T
        n_samples, n_features = n_features, n_samples

    if method in [
            'pca', 'ica', 'k-means', 'incremental pca', 'kernel pca', 'random pca'
    ]:
        data = input_data - input_data.mean(axis=0)
    else:
        data = input_data

    if method == 'pca':
        estimator = decomposition.PCA(n_components=n_components, **kwargs)
    elif method == 'k-means':
        estimator = MiniBatchKMeans(n_clusters=n_components, random_state=rng, **kwargs)
    elif method == 'ica':
        estimator = decomposition.FastICA(n_components=n_components, whiten=True, **kwargs)
    elif method == 'incremental pca':
        estimator = decomposition.IncrementalPCA(n_components=n_components, whiten=True)
    elif method == 'svd':
        estimator = decomposition.TruncatedSVD(n_components=n_components, random_state=rng, **kwargs)
    elif method == 'kernel pca':
        estimator = decomposition.KernelPCA(n_components=n_components, **kwargs)
    elif method == 'lda':
        estimator = decomposition.LatentDirichletAllocation(
            n_topics=n_components, random_state=rng, **kwargs)
        data = data + abs(min(ravel(data)))
    elif method == 'random pca':
        estimator = decomposition.RandomizedPCA(n_components=n_components, whiten=True, **kwargs)
    else:
        warnings.warn('Unknown \'method\' argument given; falling back to PCA')
        estimator = decomposition.PCA(n_components=n_components)

    estimator.fit(data)

    if hasattr(estimator, 'cluster_centers_'):
        components = estimator.cluster_centers_
    else:
        components = estimator.components_

    return components
def Kohler(wavenumbers, App, m0, n_components=8):
    """
    Correct scattered spectra using Kohler's algorithm
    :param wavenumbers: array of wavenumbers
    :param App: apparent spectrum
    :param m0: reference spectrum
    :param n_components: number of principal components to be calculated
    :return: corrected data
    """
    # Make copies of all input data:
    wn = np.copy(wavenumbers)
    A_app = np.copy(App)
    m_0 = np.copy(m0)

    ii = np.argsort(wn)  # Sort the wavenumbers from smallest to largest
    # Sort all the input variables accordingly
    wn = wn[ii]
    A_app = A_app[ii]
    m_0 = m_0[ii]

    # Initialize the alpha parameter:
    alpha = np.linspace(3.14, 49.95, 150) * 1.0e-4  # alpha = 2 * pi * d * (n - 1) * wavenumber
    p0 = np.ones(2 + n_components)  # Initialize the initial guess for the fitting

    # Initialize the extinction matrix:
    Q_ext = np.zeros((np.size(alpha), np.size(wn)))
    for i in range(np.size(alpha)):
        Q_ext[i][:] = Q_ext_kohler(wn, alpha=alpha[i])

    # Perform PCA of Q_ext:
    pca = skl_decomposition.IncrementalPCA(n_components=n_components)
    pca.fit(Q_ext)
    p_i = pca.components_  # Extract the principal components
    # print(np.sum(pca.explained_variance_ratio_)*100)  # Print the explained variance ratio in percentage

    def min_fun(x):
        """
        Function to be minimized by the fitting
        :param x: array containing the reference linear factor, the offset, and the PCA scores
        :return: function to be minimized
        """
        bb, cc, g = x[0], x[1], x[2:]
        # Return the squared norm of the difference between the apparent spectrum and the fit
        return np.linalg.norm(
            A_app - apparent_spectrum_fit_function(wn, m_0, p_i, bb, cc, g))**2.0

    # Minimize the function using the Powell method
    res = scipy.optimize.minimize(min_fun, p0, bounds=None, method='Powell')
    # print(res)  # Print the minimization result
    # assert(res.success)  # Raise AssertionError if res.success == False

    b, c, g_i = res.x[0], res.x[1], res.x[2:]  # Obtain the fitted parameters

    # Apply the correction to the apparent spectrum
    Z_corr = np.zeros(np.shape(m_0))
    for i in range(len(wavenumbers)):
        sum1 = 0
        for j in range(len(g_i)):
            sum1 += g_i[j] * p_i[j][i]
        Z_corr[i] = (A_app[i] - c - sum1) / b

    return Z_corr[::-1]  # Return the correction in reverse order for compatibility
def Kohler_zero(wavenumbers, App, w_regions, n_components=8):
    """
    Correct scattered spectra using Kohler's algorithm with a zero reference spectrum
    :param wavenumbers: array of wavenumbers
    :param App: apparent spectrum
    :param w_regions: the regions to be taken into account for the fitting
    :param n_components: number of principal components to be calculated
    :return: corrected data
    """
    # Make copies of all input data:
    wn = np.copy(wavenumbers)
    A_app = np.copy(App)
    m_0 = np.zeros(len(wn))  # Zero reference spectrum

    ii = np.argsort(wn)  # Sort the wavenumbers from smallest to largest
    # Sort all the input variables accordingly
    wn = wn[ii]
    A_app = A_app[ii]
    m_0 = m_0[ii]

    # Initialize the alpha parameter:
    alpha = np.linspace(1.25, 49.95, 150) * 1.0e-4  # alpha = 2 * pi * d * (n - 1) * wavenumber
    p0 = np.ones(2 + n_components)  # Initialize the initial guess for the fitting

    # Initialize the extinction matrix:
    Q_ext = np.zeros((np.size(alpha), np.size(wn)))
    for i in range(np.size(alpha)):
        Q_ext[i][:] = Q_ext_kohler(wn, alpha=alpha[i])

    # Perform PCA of Q_ext:
    pca = skl_decomposition.IncrementalPCA(n_components=n_components)
    pca.fit(Q_ext)
    p_i = pca.components_  # Extract the principal components
    # print(np.sum(pca.explained_variance_ratio_)*100)  # Print the explained variance ratio in percentage

    # Restrict the fit to the weighted regions:
    w_indexes = []
    for pair in w_regions:
        min_pair = min(pair)
        max_pair = max(pair)
        ii1 = find_nearest_number_index(wn, min_pair)
        ii2 = find_nearest_number_index(wn, max_pair)
        w_indexes.extend(np.arange(ii1, ii2))
    wn_w = np.copy(wn[w_indexes])
    A_app_w = np.copy(A_app[w_indexes])
    m_w = np.copy(m_0[w_indexes])
    p_i_w = np.copy(p_i[:, w_indexes])

    def min_fun(x):
        """
        Function to be minimized by the fitting
        :param x: array containing the reference linear factor, the offset, and the PCA scores
        :return: function to be minimized
        """
        bb, cc, g = x[0], x[1], x[2:]
        # Return the squared norm of the difference between the apparent spectrum and the fit
        return np.linalg.norm(
            A_app_w - apparent_spectrum_fit_function(wn_w, m_w, p_i_w, bb, cc, g))**2.0

    # Minimize the function using the Powell method
    res = scipy.optimize.minimize(min_fun, p0, bounds=None, method='Powell')
    # print(res)  # Print the minimization result
    # assert(res.success)  # Raise AssertionError if res.success == False

    b, c, g_i = res.x[0], res.x[1], res.x[2:]  # Obtain the fitted parameters

    # Apply the correction to the apparent spectrum
    Z_corr = (A_app - c - np.dot(g_i, p_i))
    base = np.dot(g_i, p_i)

    return Z_corr, base