Esempio n. 1
0
def run_gpu(X_train, X_test, y_train, y_test, scaled=True):
    """Fit a scale -> PCA(2) -> k-NN pipeline and predict labels for X_test.

    Scaling (when enabled) uses scikit-learn's ``StandardScaler``; the data
    is then copied into CuPy arrays and handed to cuML's ``PCA`` and
    ``KNeighborsClassifier``.  Wall-clock durations of the fit phase
    (including model construction) and of the predict phase are printed.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Accepted for signature symmetry; not used by this function.
        scaled: When truthy, standardize features before the PCA step.

    Returns:
        The test-set predictions after ``.to_pandas().to_numpy()``.
    """
    import cupy as cp

    # ---- fit phase (model construction is part of the timed region) ----
    fit_started = time.time()
    scaler = sklearn.preprocessing.StandardScaler() if scaled else None
    reducer = cuml.PCA(n_components=2)
    classifier = cuml.neighbors.KNeighborsClassifier()

    train_features = scaler.fit_transform(X_train) if scaled else X_train
    train_projected = reducer.fit_transform(cp.array(train_features))
    classifier.fit(train_projected, cp.array(y_train))
    print('Fit(gpu):', time.time() - fit_started)

    # ---- predict phase ----
    predict_started = time.time()
    test_features = scaler.transform(X_test) if scaled else X_test
    test_projected = reducer.transform(cp.array(test_features))
    predictions = classifier.predict(test_projected).to_pandas().to_numpy()
    print('Predict(gpu):', time.time() - predict_started)
    return predictions
Esempio n. 2
0
def get_PCA_prjs(X, cpu=False, **kwargs):
    r"""
    Return the PCA projections of X.

    Only the cuML path exists: ``cpu=True`` raises ``NotImplementedError``.
    Any extra keyword arguments are passed straight to ``cuml.PCA``.
    """
    # Guard first: there is no CPU implementation to dispatch to.
    if cpu:
        raise NotImplementedError
    return cuml.PCA(**kwargs).fit_transform(X)
Esempio n. 3
0
def cuml_pca(config, feature, components=10):
    """Run cuML PCA (Jacobi solver) on ``feature`` and return host arrays.

    Args:
        config: Not used by this function.
        feature: 2-D feature table; must expose ``.shape`` and be accepted
            by ``cudf.DataFrame``.
        components: Number of principal components to keep.

    Returns:
        ``(embed, exp_var)`` where ``embed`` is a ``(rows, components)``
        NumPy array of projections and ``exp_var`` is the explained
        variance ratio per component as a NumPy array.
    """
    # RAPIDS is imported inside the function rather than at module level.
    import cudf
    import cuml

    n_rows = feature.shape[0]
    # Host-side output buffer; filled from the GPU result further down.
    embed = np.zeros((n_rows, components))

    gpu_frame = cudf.DataFrame(feature)
    model = cuml.PCA(n_components=components, svd_solver='jacobi')
    model.fit(gpu_frame)
    gpu_projection = model.transform(gpu_frame)

    # Bring the results back to the host.
    exp_var = model.explained_variance_ratio_.to_pandas().to_numpy()
    embed[:, :components] = gpu_projection.to_pandas().to_numpy()

    print("*** PCA ***")
    print(exp_var)
    print(f"Sum of Explained Variance: {np.sum(exp_var)}")

    return embed, exp_var
Esempio n. 4
0
def reduce_to_3D(data, labels, dimReductionMethod, trainedEmbeddingModel=None):
    """Embed ``data`` into three components and report the elapsed time.

    If ``trainedEmbeddingModel`` is given it is used as-is; otherwise a new
    cuML model is fitted on ``data`` according to ``dimReductionMethod``.
    Either way the model's ``transform`` is then applied to ``data``.

    Args:
        data: Samples to embed.
        labels: Target values passed as ``y`` when fitting UMAP; ignored
            for PCA and when a pre-trained model is supplied.
        dimReductionMethod: ``'PCA'`` or ``'UMAP'``.  Only consulted for
            model selection when no pre-trained model is supplied, but
            always echoed in the progress message.
        trainedEmbeddingModel: Optional already-fitted model exposing
            ``transform(X=...)``.

    Returns:
        ``(embeddedData, embeddingModel)`` - the transformed data and the
        model that produced it (new or pre-trained).

    Raises:
        ValueError: If no pre-trained model is supplied and
            ``dimReductionMethod`` is neither ``'PCA'`` nor ``'UMAP'``.
    """
    startTime = time.time()

    if trainedEmbeddingModel is not None:
        preTrainedStr = 'pre-trained '
        embeddingModel = trainedEmbeddingModel
    else:
        preTrainedStr = ''
        if dimReductionMethod == 'PCA':
            embeddingModel = cuml.PCA(copy=True,
                                      n_components=3,
                                      random_state=0,
                                      svd_solver='full',
                                      verbose=True,
                                      whiten=False).fit(X=data)
        elif dimReductionMethod == 'UMAP':
            embeddingModel = cuml.UMAP(n_components=3).fit(X=data, y=labels)
        else:
            # The previous `assert ('...')` asserted a non-empty string, which
            # is always true, so execution fell through to an unbound
            # `embeddingModel`.  Fail explicitly instead.
            raise ValueError(
                'unable to find embedding model match to user query: '
                f'{dimReductionMethod!r}')

    embeddedData = embeddingModel.transform(X=data)

    elapsedTime = time.time() - startTime
    print(
        f'{embeddedData.shape} via {preTrainedStr}{dimReductionMethod} '
        f'-- completed in: {elapsedTime:.3f} seconds'
    )

    return embeddedData, embeddingModel
Esempio n. 5
0
from sklearn.datasets import load_iris
from sklearn.datasets import make_regression
import pickle
from sklearn.manifold.t_sne import trustworthiness

# Registries of default-constructed cuML estimators, grouped by family and
# keyed by class name.  Every estimator below is instantiated once, when this
# module is imported.
regression_models = {
    "LinearRegression": cuml.LinearRegression(),
    "Lasso": cuml.Lasso(),
    "Ridge": cuml.Ridge(),
    "ElasticNet": cuml.ElasticNet(),
}

# SGD is the only entry built with a non-default argument (eta0).
solver_models = {
    "CD": cuml.CD(),
    "SGD": cuml.SGD(eta0=0.005),
}

cluster_models = {"KMeans": cuml.KMeans()}

decomposition_models = {
    "PCA": cuml.PCA(),
    "TruncatedSVD": cuml.TruncatedSVD(),
}

# NOTE(review): the "_xfail" suffix suggests these are expected to fail in
# the tests that consume this registry - confirm against the test code.
decomposition_models_xfail = {
    "GaussianRandomProjection": cuml.GaussianRandomProjection(),
    "SparseRandomProjection": cuml.SparseRandomProjection(),
}

neighbor_models = {"NearestNeighbors": cuml.NearestNeighbors()}

dbscan_model = {"DBSCAN": cuml.DBSCAN()}

umap_model = {"UMAP": cuml.UMAP()}


def unit_param(*args, **kwargs):
Esempio n. 6
0
    "Ridge":
    lambda fit_intercept=True: cuml.Ridge(fit_intercept=fit_intercept),
    "ElasticNet":
    lambda fit_intercept=True: cuml.ElasticNet(fit_intercept=fit_intercept)
}

# Registries mapping an estimator name to a zero-argument factory.  Each
# lambda looks up and constructs its cuML estimator only when called, so
# nothing is instantiated while these dicts are being built and every call
# yields a fresh instance.

# Solvers: SGD and QN are the only entries built with explicit arguments.
solver_models = {
    "CD": lambda: cuml.CD(),
    "SGD": lambda: cuml.SGD(eta0=0.005),
    "QN": lambda: cuml.QN(loss="softmax")
}

# Clustering estimators.
cluster_models = {"KMeans": lambda: cuml.KMeans()}

# Decomposition estimators.
decomposition_models = {
    "PCA": lambda: cuml.PCA(),
    "TruncatedSVD": lambda: cuml.TruncatedSVD(),
}

# NOTE(review): the "_xfail" suffix suggests these are expected to fail in
# the tests that consume this registry - confirm against the test code.
decomposition_models_xfail = {
    "GaussianRandomProjection": lambda: cuml.GaussianRandomProjection(),
    "SparseRandomProjection": lambda: cuml.SparseRandomProjection()
}

# Neighbor-search estimators.
neighbor_models = {"NearestNeighbors": lambda: cuml.NearestNeighbors()}

# DBSCAN and UMAP are kept in their own single-entry registries.
dbscan_model = {"DBSCAN": lambda: cuml.DBSCAN()}

umap_model = {"UMAP": lambda: cuml.UMAP()}

rf_models = {
Esempio n. 7
0
            if count > max_molecule:
                break

    logger.info('Initializing Morgan fingerprints...')
    results = db.from_sequence(smiles_list).map(MorganFromSmiles).compute()

    np_fingerprints = np.stack(results).astype(np.float32)

    # take np.array shape (n_mols, nBits) for GPU DataFrame
    df_fingerprints = np2dataframe(np_fingerprints, enable_gpu)

    # prepare one set of clusters
    if pca_components:
        task_start_time = datetime.now()
        if enable_gpu:
            pca = cuml.PCA(n_components=pca_components)
        else:
            pca = sklearn.decomposition.PCA(n_components=pca_components)

        df_fingerprints = pca.fit_transform(df_fingerprints)
        print('Runtime PCA time (hh:mm:ss.ms) {}'.format(datetime.now() -
                                                         task_start_time))
    else:
        pca = False
        print('PCA has been skipped')

    task_start_time = datetime.now()
    n_clusters = 7
    if enable_gpu:
        kmeans_float = cuml.KMeans(n_clusters=n_clusters)
    else:
Esempio n. 8
0
    def _cluster(self, embedding, n_pca):
        """
        Cluster an embedding with KMeans and compute a UMAP layout for it.

        Steps, in order:
          1. Materialize ``embedding`` via ``.compute()`` when it has one,
             reset its index, and strip the columns that
             ``_remove_non_numerics`` separates out.
          2. When ``n_pca`` is truthy and the embedding has more than
             ``n_pca`` columns, reduce it with PCA.  The PCA model is
             fitted only if ``self.pca`` is still ``None`` and is then kept
             on ``self.pca`` for later calls.
          3. Fit KMeans with ``self.n_clusters`` clusters and predict a
             label per row.
          4. Fit UMAP on the (possibly PCA-reduced) embedding.
          5. Attach ``cluster``, ``x`` and ``y`` columns, then restore the
             columns removed in step 1.

        In benchmark mode (``self.context.is_benchmark``) random samples are
        also drawn and handed to the metrics loggers as metric arguments.

        Args:
            embedding: Table of molecular fingerprints / embedding values.
            n_pca: Number of PCA components; falsy disables the PCA step.

        Returns:
            The embedding with ``cluster``, ``x`` and ``y`` columns plus the
            restored property columns.

        Raises:
            Exception: If there are fewer than ``MIN_RECLUSTER_SIZE`` rows.
        """
        if hasattr(embedding, 'compute'):
            embedding = embedding.compute()

        embedding = embedding.reset_index()

        # Before reclustering remove all columns that may interfere
        embedding, prop_series = self._remove_non_numerics(embedding)
        self.n_molecules, n_obs = embedding.shape

        # Sample the pre-PCA embedding now; the same index is reused after
        # UMAP so both samples refer to the same rows.
        if self.context.is_benchmark:
            molecular_embedding_sample, spearman_index = self._random_sample_from_arrays(
                embedding, n_samples=self.n_spearman)

        if n_pca and n_obs > n_pca:
            with MetricsLogger('pca', self.n_molecules):
                # Identity check: fit only when no PCA model is cached yet.
                if self.pca is None:
                    self.pca = cuml.PCA(n_components=n_pca)
                    self.pca.fit(embedding)
                embedding = self.pca.transform(embedding)

        with MetricsLogger('kmeans', self.n_molecules) as ml:
            if self.n_molecules < MIN_RECLUSTER_SIZE:
                raise Exception(
                    'Reclustering less than %d molecules is not supported.' %
                    MIN_RECLUSTER_SIZE)

            kmeans_cuml = cuml.KMeans(n_clusters=self.n_clusters)
            kmeans_cuml.fit(embedding)
            kmeans_labels = kmeans_cuml.predict(embedding)

            # Metric settings are stored on the logger; placeholder args
            # are replaced with real samples only in benchmark mode.
            ml.metric_name = 'silhouette_score'
            ml.metric_func = batched_silhouette_scores
            ml.metric_func_kwargs = {}
            ml.metric_func_args = (None, None)

            if self.context.is_benchmark:
                (embedding_sample,
                 kmeans_labels_sample), _ = self._random_sample_from_arrays(
                     embedding, kmeans_labels, n_samples=self.n_silhouette)
                ml.metric_func_args = (embedding_sample, kmeans_labels_sample)

        with MetricsLogger('umap', self.n_molecules) as ml:
            umap = cuml.manifold.UMAP()
            Xt = umap.fit_transform(embedding)

            ml.metric_name = 'spearman_rho'
            ml.metric_func = self._compute_spearman_rho
            ml.metric_func_args = (None, None)
            if self.context.is_benchmark:
                X_train_sample, _ = self._random_sample_from_arrays(
                    embedding, index=spearman_index)
                ml.metric_func_args = (molecular_embedding_sample,
                                       X_train_sample)

        # Add back the columns required for plotting and for correlating data
        # between re-clustering runs
        embedding['cluster'] = kmeans_labels
        embedding['x'] = Xt[0]
        embedding['y'] = Xt[1]

        # Add back the prop columns
        for col in prop_series.keys():
            embedding[col] = prop_series[col]

        return embedding