Example #1
0
def TFIDF_feateure(df, min_PCA = 5000, max_features = 15000):
    """Embed the ``title`` column with binary TF-IDF and compress it with PCA.

    :param df: table holding a ``title`` text column. When the module-level
        ``IS_GPU`` flag is truthy it is wrapped in ``cudf.DataFrame`` first.
    :param min_PCA: upper bound on the number of principal components kept.
    :param max_features: vocabulary size limit passed to ``TfidfVectorizer``.
    :return: the PCA-reduced title embeddings (one row per input row).
    """
    df_cu = cudf.DataFrame(df) if IS_GPU else df

    nlp_model = TfidfVectorizer(stop_words = 'english', binary = True, max_features = max_features)
    text_embeddings = nlp_model.fit_transform(df_cu['title']).toarray()

    # PCA cannot return more components than min(n_samples, n_features). The
    # fitted vocabulary may be smaller than both min_PCA and the row count, so
    # the cap is taken from the actual embedding matrix rather than len(df).
    n_components = min(min_PCA, *text_embeddings.shape)
    pca = PCA(n_components = n_components)
    text_embeddings = pca.fit_transform(text_embeddings)
    if IS_GPU:
        # NOTE(review): presumably a CuPy array here, with .get() copying it
        # back to host memory - confirm against the PCA implementation in use.
        text_embeddings = text_embeddings.get()

    print(f'Our title text embedding shape is {text_embeddings.shape}')
    return text_embeddings
Example #2
0
def test_model_func_call_gpu():
    """Check the return type of ``model_func_call`` for several call patterns.

    Every successful call is expected to yield a ``cp.ndarray``; passing
    ``dummy_func`` together with ``gpu_model=True`` is expected to raise
    ``TypeError``.
    """
    X, y = make_regression(n_samples=81, n_features=10, noise=0.1,
                           random_state=42, dtype=np.float32)

    # Fitted regressor, called through its predict method as a GPU model.
    regressor = reg().fit(X, y)
    predicted = model_func_call(X=X,
                                model_func=regressor.predict,
                                gpu_model=True)
    assert isinstance(predicted, cp.ndarray)

    # Host copy of the data with the dummy function and gpu_model disabled.
    host_result = model_func_call(X=cp.asnumpy(X),
                                  model_func=dummy_func,
                                  gpu_model=False)
    assert isinstance(host_result, cp.ndarray)

    # The dummy function flagged as a GPU model must be rejected.
    with pytest.raises(TypeError):
        model_func_call(X=X,
                        model_func=dummy_func,
                        gpu_model=True)

    # Fitted PCA, called through its transform method as a GPU model.
    reducer = PCA(n_components=10).fit(X)
    projected = model_func_call(X=X,
                                model_func=reducer.transform,
                                gpu_model=True)
    assert isinstance(projected, cp.ndarray)
Example #3
0
def run_pca(data, device, n_components=300, var_explained=0.85):
    """Run PCA and keep only as many components as needed.

    An initial PCA with up to ``n_components`` components is fitted. If those
    components together explain at least ``var_explained`` of the variance,
    the component count is reduced to the smallest number whose cumulative
    explained variance reaches ``var_explained``.

    :param data: Dataframe of cells X genes. Typically multiscale space
        diffusion components. Must expose ``.shape`` and ``.index``.
    :param device: ``"gpu"`` to use ``cuml.PCA`` or ``"cpu"`` to use
        ``sklearn.decomposition.PCA`` with the randomized SVD solver.
    :param n_components: Maximum number of principal components
    :param var_explained: Include components that explain this fraction of the
        variance. Note number of components =
        min(n_components, components explaining var_explained)
    :return: PCA projections of the data (indexed like ``data``) and the
        explained variance ratio of the retained components
    :raises ValueError: if ``device`` is neither ``"gpu"`` nor ``"cpu"``
    """
    if device == "gpu":
        from cuml import PCA
        solver_kwargs = {}
    elif device == "cpu":
        from sklearn.decomposition import PCA
        solver_kwargs = {'svd_solver': 'randomized'}
    else:
        # Previously this fell through and failed later on an unbound `pca`.
        raise ValueError(f'Unknown device {device!r}; expected "gpu" or "cpu"')

    # PCA cannot produce more components than samples or features.
    init_components = min([n_components, *data.shape])
    pca = PCA(n_components=init_components, **solver_kwargs)
    pca.fit(data)

    if pca.explained_variance_ratio_.sum() >= var_explained:
        # np.where yields the zero-based index of the first component at which
        # the cumulative ratio reaches the threshold; the count is index + 1.
        first_idx = np.where(np.cumsum(pca.explained_variance_ratio_) >= var_explained)[0][0]
        n_components = int(first_idx) + 1
    else:
        n_components = init_components

    print(f'Running PCA with {n_components} components')
    if n_components == init_components:
        # The model fitted above already has the wanted size; reuse it.
        pca_projections = pca.transform(data)
    else:
        # Refit with the reduced size so the returned projections and variance
        # ratios match the component count reported above.
        pca = PCA(n_components=n_components, **solver_kwargs)
        pca_projections = pca.fit_transform(data)
    pca_projections = pd.DataFrame(pca_projections, index=data.index)
    return pca_projections, pca.explained_variance_ratio_
Example #4
0
def PCA_concat(df, n_components=2):
    """Append PCA scores of the float32 columns of ``df`` as new columns.

    A PCA with ``n_components`` components is fitted on the columns whose
    dtype is ``np.float32``; the resulting scores are labelled ``PC0``,
    ``PC1``, ... and concatenated column-wise onto ``df``.
    """
    float_cols = df.columns[df.dtypes == np.float32]

    pca_float = PCA(n_components=n_components)
    pca_float.fit(df[float_cols])
    scores = pca_float.transform(df[float_cols])

    scores.columns = [f'PC{idx}' for idx in range(n_components)]
    return cudf.concat([df, scores], axis=1)
Example #5
0
def PCA_concat(df, components=100):
    """Append a two-component PCA of the float32 columns of ``df``.

    The PCA is fitted on the columns whose dtype is ``np.float32`` and the
    transformed scores are concatenated column-wise onto ``df``.

    NOTE(review): ``components`` is accepted but not used; the PCA is always
    built with ``n_components=2``. Confirm which of the two is intended before
    wiring the argument through, since that would change the output width.
    """
    float_cols = df.columns[df.dtypes == np.float32]

    pca_float = PCA(n_components=2)
    pca_float.fit(df[float_cols])
    scores = pca_float.transform(df[float_cols])

    return cudf.concat([df, scores], axis=1)
Example #6
0
# Boolean command-line switch: whitening is off unless --whiten is passed.
parser.add_argument('--whiten',
                    action='store_true',
                    default=False,
                    help='Perform whitening')
# Parse the command line through the shared benchmark helper.
params = bench.parse_args(parser)

# Load the benchmark data; only the first two returned values (X_train and
# X_test) are kept, the remaining two are discarded.
X_train, X_test, _, _ = bench.load_data(params, generated_data=['X_train'])

# When no component count was supplied, derive one from the training shape:
# with (p, n) = X_train.shape, use min(n, (2 + min(n, p)) // 3).
if params.n_components is None:
    p, n = X_train.shape
    params.n_components = min((n, (2 + min((n, p))) // 3))

# Create our PCA object from the parsed solver, whitening and size settings
pca = PCA(svd_solver=params.svd_solver,
          whiten=params.whiten,
          n_components=params.n_components)

# Time fit on the training data; only the first returned value is kept
fit_time, _ = bench.measure_function_time(pca.fit, X_train, params=params)

# Time transform on the same training data; only the first returned value
# is kept
transform_time, _ = bench.measure_function_time(pca.transform,
                                                X_train,
                                                params=params)

bench.print_output(library='cuml',
                   algorithm='pca',
                   stages=['training', 'transformation'],
                   params=params,
                   functions=['PCA.fit', 'PCA.transform'],
Example #7
0
# Optional chunked PCA of `data`, cached on disk as raw float32 bytes.
# `data2` is the array the rest of the script works on: the untouched input
# by default, or the PCA-reduced copy when the block below is enabled.

#data = data[0:10000]
indices = list(range(len(data)))
# Shuffling is currently disabled, so `indices` stays in natural order.
#random.shuffle(indices)
data2 = data
#data2 = np.copy(data)
#for i1, i2 in zip(range(len(data)), indices):
#    data2[i1] = data[i2]

pcaFile = 'pca.np'   # cache file holding the reduced data
perComp = 100000     # number of rows handed to PCA at a time
comps = len(data) // perComp + 1
pcaComps = 20        # number of principal components kept
if 0:  # PCA reduction is switched off; change to 1 to enable it
    if not os.path.exists(pcaFile):
        data3 = np.zeros((len(data), pcaComps), np.float32)
        pca = PCA(n_components=pcaComps)
        # Step through the rows directly so that no empty trailing chunk is
        # produced when len(data) is an exact multiple of perComp.
        # NOTE(review): fit_transform refits the PCA on every chunk, so each
        # chunk is projected onto its own components - confirm this is wanted.
        for start in range(0, len(data), perComp):
            chunk = data[start:start + perComp]
            data3[start:start + perComp, :] = pca.fit_transform(chunk)
        data2 = data3
        del pca
        # Context manager closes the cache file even if the write fails.
        with open(pcaFile, 'wb') as fd:
            fd.write(data2.flatten().tobytes())
    else:
        # Drop the raw input before loading the cached reduction.
        l = len(data)
        del data
        del data2
        data2 = np.fromfile(pcaFile, np.float32).reshape((l, pcaComps))
Example #8
0
# Extract embedding vectors for the training set
load_kwargs = {'batch_size': 128, 'num_workers': os.cpu_count()}

# test_embs, _ = extract_embeddings(emb_net, DataLoader(test_ds, **load_kwargs))
embs, labels = extract_embeddings(emb_net, DataLoader(train_ds, **load_kwargs))

# translate them to cpu + numpy
embs = embs.cpu().numpy()
labels = labels.cpu().numpy()
# -----------------------------------------------------------------------------
# 2-D t-SNE projection of the embeddings, saved as PNG and PDF
print("Plotting T-sne....")
from cuml.manifold import TSNE
tsne = TSNE(n_iter=1000, metric="euclidean")
projected_emb = tsne.fit_transform(embs)
fig = plot_embeddings(projected_emb, labels)
png_fname = join(exp_folder, 't-sne.png')
fig.savefig(png_fname, bbox_inches='tight')
pdf_fname = join(exp_folder, 't-sne.pdf')
fig.savefig(pdf_fname, bbox_inches='tight')
# -----------------------------------------------------------------------------
# 2-D PCA projection of the same embeddings, saved as PNG and PDF
print("Plotting PCA....")
from cuml import PCA
pca_float = PCA(n_components=2)
# NOTE(review): this variable shares its name with the cudf library and the
# next line assumes the result exposes .to_pandas() - confirm both against
# the cuML version and output type in use.
cudf = pca_float.fit_transform(embs)
projected_emb = cudf.to_pandas().to_numpy()
fig = plot_embeddings(projected_emb, labels)
png_fname = join(exp_folder, 'pca.png')
fig.savefig(png_fname, bbox_inches='tight')
# Save under pca.pdf; the previous 't-sne.pdf' name overwrote the t-SNE plot.
pdf_fname = join(exp_folder, 'pca.pdf')
fig.savefig(pdf_fname, bbox_inches='tight')