def TFIDF_feature(df, min_PCA=5000):
    # IS_GPU is a module-level flag selecting the cuDF/cuML or pandas/sklearn stack
    if IS_GPU:
        df_cu = cudf.DataFrame(df)
    else:
        df_cu = df
    max_features = 15000
    nlp_model = TfidfVectorizer(stop_words='english', binary=True,
                                max_features=max_features)
    text_embeddings = nlp_model.fit_transform(df_cu['title']).toarray()
    # n_components cannot exceed the row count or the fitted TF-IDF feature count
    n_components = min(min_PCA, *text_embeddings.shape)
    pca = PCA(n_components=n_components)
    if IS_GPU:
        # cuML returns a CuPy array; .get() copies it back to host memory
        text_embeddings = pca.fit_transform(text_embeddings).get()
    else:
        text_embeddings = pca.fit_transform(text_embeddings)
    print(f'Our title text embedding shape is {text_embeddings.shape}')
    return text_embeddings
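# Usage sketch for TFIDF_feature -- hedged: the toy DataFrame, its 'title' column
# contents, and IS_GPU=False (pandas/sklearn path) are illustrative assumptions,
# not part of the original snippet.
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer

IS_GPU = False
titles = pd.DataFrame({'title': ['red cotton shirt', 'blue denim jeans',
                                 'faded red shirt', 'black denim jacket']})
emb = TFIDF_feature(titles, min_PCA=3)  # n_components is capped at 3 here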
def test_model_func_call_gpu():
    X, y = make_regression(n_samples=81, n_features=10, noise=0.1,
                           random_state=42, dtype=np.float32)
    model = reg().fit(X, y)

    z = model_func_call(X=X, model_func=model.predict, gpu_model=True)
    assert isinstance(z, cp.ndarray)

    z = model_func_call(X=cp.asnumpy(X), model_func=dummy_func, gpu_model=False)
    assert isinstance(z, cp.ndarray)

    # a host-only model function declared as gpu_model=True should fail
    with pytest.raises(TypeError):
        z = model_func_call(X=X, model_func=dummy_func, gpu_model=True)

    model = PCA(n_components=10).fit(X)
    z = model_func_call(X=X, model_func=model.transform, gpu_model=True)
    assert isinstance(z, cp.ndarray)
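# Hedged stand-ins for names the test above assumes are defined elsewhere in its
# module: the imports and the regressor choice below are illustrative guesses, not
# the original fixtures (model_func_call itself is assumed to be imported from
# cuML's explainer utilities).
import numpy as np
import cupy as cp
import pytest
from cuml import PCA
from cuml.datasets import make_regression
from cuml.linear_model import LinearRegression as reg

def dummy_func(X):
    # host-only "model": accepts and returns NumPy, so it cannot honor gpu_model=True
    return np.mean(np.asarray(X), axis=1)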
def run_pca(data, device, n_components=300, var_explained=0.85):
    """Run PCA

    :param data: DataFrame of cells x genes, typically multiscale-space diffusion components
    :param device: "gpu" (cuML) or "cpu" (scikit-learn)
    :param n_components: Maximum number of principal components
    :param var_explained: Keep the components needed to explain this fraction of
        variance. Note the final count is min(n_components, components explaining var_explained)
    :return: PCA projections of the data and the explained variance ratios
    """
    import numpy as np
    import pandas as pd

    init_components = min(n_components, data.shape[0])
    if device == "gpu":
        from cuml import PCA
        pca = PCA(n_components=init_components)
    elif device == "cpu":
        from sklearn.decomposition import PCA
        pca = PCA(n_components=init_components, svd_solver='randomized')
    pca.fit(data)
    if pca.explained_variance_ratio_.sum() >= var_explained:  # was a hard-coded 0.85
        # index of the first cumulative ratio >= var_explained, +1 to turn it into a count
        n_components = int(np.where(
            np.cumsum(pca.explained_variance_ratio_) >= var_explained)[0][0]) + 1
    print(f'Running PCA with {n_components} components')
    pca_projections = pca.transform(data)  # already fitted above, no need to refit
    pca_projections = pd.DataFrame(pca_projections, index=data.index)
    # keep only the selected components (the fit retained init_components of them)
    return pca_projections.iloc[:, :n_components], pca.explained_variance_ratio_
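# Usage sketch for run_pca -- hedged: the random frame and the CPU path are
# assumptions for illustration; real inputs would be diffusion components.
import numpy as np
import pandas as pd

dm = pd.DataFrame(np.random.rand(500, 40).astype(np.float32))
proj, var_ratio = run_pca(dm, device="cpu", n_components=30, var_explained=0.85)
print(proj.shape, var_ratio.sum())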
def PCA_concat(df, n_components=2):
    # project the float32 columns and append the named scores to the original frame
    float_cols = df.columns[df.dtypes == np.float32]
    pca_float = PCA(n_components=n_components)
    pca_float.fit(df[float_cols])
    scores = pca_float.transform(df[float_cols])
    scores.columns = ['PC' + str(x) for x in range(n_components)]
    return cudf.concat([df, scores], axis=1)
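# Usage sketch for PCA_concat -- hedged: requires a CUDA GPU with cuDF/cuML
# installed; the toy frame below is hypothetical.
import cudf
import numpy as np
from cuml import PCA

gdf = cudf.DataFrame({
    'a': np.random.rand(100).astype(np.float32),
    'b': np.random.rand(100).astype(np.float32),
    'c': np.random.rand(100).astype(np.float32),
    'label': np.arange(100),
})
out = PCA_concat(gdf, n_components=2)  # appends 'PC0' and 'PC1'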
def PCA_concat(df, components=100):
    # variant of PCA_concat above that keeps cuML's default score column names
    float_cols = df.columns[df.dtypes == np.float32]
    pca_float = PCA(n_components=components)  # the parameter was previously ignored
    pca_float.fit(df[float_cols])
    scores = pca_float.transform(df[float_cols])
    return cudf.concat([df, scores], axis=1)
parser.add_argument('--whiten', action='store_true', default=False,
                    help='Perform whitening')
params = bench.parse_args(parser)

# Load random data
X_train, X_test, _, _ = bench.load_data(params, generated_data=['X_train'])

if params.n_components is None:
    p, n = X_train.shape
    params.n_components = min((n, (2 + min((n, p))) // 3))

# Create our PCA object
pca = PCA(svd_solver=params.svd_solver, whiten=params.whiten,
          n_components=params.n_components)

# Time fit
fit_time, _ = bench.measure_function_time(pca.fit, X_train, params=params)

# Time transform
transform_time, _ = bench.measure_function_time(pca.transform, X_train,
                                                params=params)

bench.print_output(library='cuml', algorithm='pca',
                   stages=['training', 'transformation'], params=params,
                   functions=['PCA.fit', 'PCA.transform'],
                   times=[fit_time, transform_time])  # assumed completion of the truncated call
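# Worked example of the n_components heuristic above (hypothetical shape): for
# X_train of shape (10000, 50), p=10000 rows and n=50 columns, so
# n_components = min(50, (2 + min(50, 10000)) // 3) = min(50, 52 // 3) = 17.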
import os
import numpy as np

#data = data[0:10000]
indices = [x for x in range(len(data))]
#random.shuffle(indices)
data2 = data
#data2 = np.copy(data)
#for i1, i2 in zip(range(len(data)), indices):
#    data2[i1] = data[i2]

pcaFile = 'pca.np'
perComp = 100000          # rows per chunk
comps = len(data) // perComp + 1
pcaComps = 20

if 0:  # manual switch: set to 1 to enable the cached chunked-PCA path below
    if not os.path.exists(pcaFile):
        # reduce each chunk of rows to pcaComps dimensions and cache the result;
        # note the PCA is refit on every chunk, so each chunk gets its own basis
        data3 = np.zeros((len(data), pcaComps), np.float32)
        pca = PCA(n_components=pcaComps)
        for i in range(comps):
            data2 = data[i * perComp:(i + 1) * perComp]
            data2 = pca.fit_transform(data2)
            data3[i * perComp:(i + 1) * perComp, :] = data2
        data2 = data3
        del pca
        with open(pcaFile, 'wb') as fd:
            fd.write(data2.flatten().tobytes())
    else:
        # load the cached projection instead of recomputing it
        l = len(data)
        del data
        del data2
        data2 = np.fromfile(pcaFile, np.float32).reshape((l, pcaComps))
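# A caveat on the chunked loop above: refitting PCA per chunk means projected rows
# from different chunks live in different coordinate systems. A minimal sketch of
# a single-basis alternative using scikit-learn's IncrementalPCA (assumption:
# `data` is an (n, d) float32 array as above):
from sklearn.decomposition import IncrementalPCA

ipca = IncrementalPCA(n_components=pcaComps, batch_size=perComp)
for i in range(comps):
    chunk = data[i * perComp:(i + 1) * perComp]
    if len(chunk) >= pcaComps:  # partial_fit needs at least n_components rows
        ipca.partial_fit(chunk)
data3 = ipca.transform(data)  # one shared basis for every row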
# Extract embedding vectors
load_kwargs = {'batch_size': 128, 'num_workers': os.cpu_count()}
# test_embs, _ = extract_embeddings(emb_net, DataLoader(test_ds, **load_kwargs))
embs, labels = extract_embeddings(emb_net, DataLoader(train_ds, **load_kwargs))

# move them to cpu + numpy
embs = embs.cpu().numpy()
labels = labels.cpu().numpy()

# -----------------------------------------------------------------------------
print("Plotting t-SNE....")
from cuml.manifold import TSNE

tsne = TSNE(n_iter=1000, metric="euclidean")
projected_emb = tsne.fit_transform(embs)

fig = plot_embeddings(projected_emb, labels)
png_fname = join(exp_folder, 't-sne.png')
fig.savefig(png_fname, bbox_inches='tight')
pdf_fname = join(exp_folder, 't-sne.pdf')
fig.savefig(pdf_fname, bbox_inches='tight')

# -----------------------------------------------------------------------------
print("Plotting PCA....")
from cuml import PCA

pca_float = PCA(n_components=2)
# avoid shadowing the cudf module name; with cuML's default output type
# ('input'), the NumPy input yields a NumPy projection directly
projected_emb = pca_float.fit_transform(embs)

fig = plot_embeddings(projected_emb, labels)
png_fname = join(exp_folder, 'pca.png')
fig.savefig(png_fname, bbox_inches='tight')
pdf_fname = join(exp_folder, 'pca.pdf')
fig.savefig(pdf_fname, bbox_inches='tight')