def run_gpu(X_train, X_test, y_train, y_test, scaled=True):
    import time
    import cupy as cp
    import cuml
    import cuml.neighbors
    import sklearn.preprocessing

    # Initialize models
    t0 = time.time()
    if scaled:
        ss = sklearn.preprocessing.StandardScaler()
    pca = cuml.PCA(n_components=2)
    knc = cuml.neighbors.KNeighborsClassifier()

    # Train models
    if scaled:
        X_train_ = ss.fit_transform(X_train)
    else:
        X_train_ = X_train
    X_train_ = cp.array(X_train_)
    X_train_ = pca.fit_transform(X_train_)
    y_train_ = cp.array(y_train)
    knc.fit(X_train_, y_train_)
    print('Fit(gpu):', time.time() - t0)

    # Test models
    t0 = time.time()
    if scaled:
        X_test_ = ss.transform(X_test)
    else:
        X_test_ = X_test
    X_test_ = cp.array(X_test_)
    X_test_ = pca.transform(X_test_)
    pred_test = knc.predict(X_test_)
    # convert the cuDF predictions back to NumPy
    pred_test = pred_test.to_pandas().to_numpy()
    print('Predict(gpu):', time.time() - t0)
    return pred_test
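# Hedged usage sketch (mine, not from the source): exercising run_gpu on the
# iris dataset. The dataset choice and the 80/20 split are illustrative
# assumptions; a GPU with RAPIDS installed is required.
if __name__ == '__main__':
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split

    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)
    preds = run_gpu(X_train, X_test, y_train, y_test, scaled=True)
    print('accuracy:', (preds == y_test).mean())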
def get_PCA_prjs(X, cpu=False, **kwargs):
    r"""
    Computes PCA projections of X
    """
    if cpu:
        raise NotImplementedError
    else:
        reducer = cuml.PCA(**kwargs)
        projections = reducer.fit_transform(X)
    return projections
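# Hedged usage sketch (not from the source): projecting a random CuPy matrix to
# two components. Assumes cuml is imported at module level, as the function
# body implies; the matrix shape is an illustrative assumption.
import cupy as cp

X_demo = cp.random.random((1000, 64), dtype=cp.float32)
prjs = get_PCA_prjs(X_demo, cpu=False, n_components=2)
print(prjs.shape)  # (1000, 2)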
def cuml_pca(config, feature, components=10):
    # Import RAPIDS
    import cudf, cuml
    import numpy as np

    num_fr = feature.shape[0]
    embed = np.zeros((num_fr, components))
    df = cudf.DataFrame(feature)
    pca = cuml.PCA(n_components=components, svd_solver='jacobi')
    pca.fit(df)
    cu_embed = pca.transform(df)
    exp_var = pca.explained_variance_ratio_.to_pandas().to_numpy()
    embed[:, 0:components] = cu_embed.to_pandas().to_numpy()
    print("*** PCA ***")
    print(exp_var)
    print(f"Sum of Explained Variance: {np.sum(exp_var)}")
    return embed, exp_var
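# Hedged usage sketch (added here, not in the source): running cuml_pca on
# random float32 features. config is unused by the function body, so None is
# passed; the feature shape is an illustrative assumption.
import numpy as np

feature = np.random.rand(5000, 128).astype(np.float32)
embed, exp_var = cuml_pca(None, feature, components=10)
print(embed.shape)  # (5000, 10)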
def reduce_to_3D(data, labels, dimReductionMethod, trainedEmbeddingModel=None):
    startTime = time.time()
    preTrainedStr = ''
    '''
    if dimReductionMethod == 'TSNE':
        embeddingModel = None
        embeddedData = cuml.TSNE(n_components=2).fit_transform(X=data)
        embeddedData.add_column('3', cudf.Series(np.zeros((data.shape[0]))))
    else:
    '''
    if trainedEmbeddingModel is not None:
        preTrainedStr = 'pre-trained '
        embeddingModel = trainedEmbeddingModel
    else:
        if dimReductionMethod == 'PCA':
            embeddingModel = cuml.PCA(copy=True, n_components=3, random_state=0,
                                      svd_solver='full', verbose=True,
                                      whiten=False).fit(X=data)
        elif dimReductionMethod == 'UMAP':
            embeddingModel = cuml.UMAP(n_components=3).fit(X=data, y=labels)
        else:
            # an assert on a non-empty string is always true; raise instead
            raise ValueError('unable to find embedding model matching user query')

    embeddedData = embeddingModel.transform(X=data)

    elapsedTime = time.time() - startTime
    print(f'{embeddedData.shape} via {preTrainedStr}{dimReductionMethod} '
          f'-- completed in: {elapsedTime:.3f} seconds')
    return embeddedData, embeddingModel
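# Hedged sketch (mine, not the source): fit once on training data, then reuse
# the returned model on held-out data via trainedEmbeddingModel. Assumes
# time/cuml are imported at module level; data shapes are illustrative.
import cudf
import numpy as np

trainData = cudf.DataFrame(np.random.rand(1000, 20).astype(np.float32))
heldOutData = cudf.DataFrame(np.random.rand(200, 20).astype(np.float32))
train3D, pcaModel = reduce_to_3D(trainData, None, 'PCA')
heldOut3D, _ = reduce_to_3D(heldOutData, None, 'PCA',
                            trainedEmbeddingModel=pcaModel)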
from sklearn.datasets import load_iris
from sklearn.datasets import make_regression
import pickle
from sklearn.manifold import trustworthiness  # sklearn.manifold.t_sne is a private module

regression_models = dict(LinearRegression=cuml.LinearRegression(),
                         Lasso=cuml.Lasso(),
                         Ridge=cuml.Ridge(),
                         ElasticNet=cuml.ElasticNet())

solver_models = dict(CD=cuml.CD(),
                     SGD=cuml.SGD(eta0=0.005))

cluster_models = dict(KMeans=cuml.KMeans())

decomposition_models = dict(
    PCA=cuml.PCA(),
    TruncatedSVD=cuml.TruncatedSVD(),
)

decomposition_models_xfail = dict(
    GaussianRandomProjection=cuml.GaussianRandomProjection(),
    SparseRandomProjection=cuml.SparseRandomProjection())

neighbor_models = dict(NearestNeighbors=cuml.NearestNeighbors())

dbscan_model = dict(DBSCAN=cuml.DBSCAN())

umap_model = dict(UMAP=cuml.UMAP())


def unit_param(*args, **kwargs):
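# Hedged sketch (mine, not the source; the snippet above truncates
# mid-definition, so this stands alone): the pickle import suggests these
# dictionaries feed serialization tests. A minimal round-trip for one
# estimator, assuming a GPU is available, might look like:
def _pca_pickle_roundtrip_example():
    import numpy as np
    X, _ = load_iris(return_X_y=True)
    model = cuml.PCA(n_components=2).fit(X)
    restored = pickle.loads(pickle.dumps(model))
    # the restored model should reproduce the original projections
    assert np.allclose(model.transform(X), restored.transform(X))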
"Ridge": lambda fit_intercept=True: cuml.Ridge(fit_intercept=fit_intercept), "ElasticNet": lambda fit_intercept=True: cuml.ElasticNet(fit_intercept=fit_intercept) } solver_models = { "CD": lambda: cuml.CD(), "SGD": lambda: cuml.SGD(eta0=0.005), "QN": lambda: cuml.QN(loss="softmax") } cluster_models = {"KMeans": lambda: cuml.KMeans()} decomposition_models = { "PCA": lambda: cuml.PCA(), "TruncatedSVD": lambda: cuml.TruncatedSVD(), } decomposition_models_xfail = { "GaussianRandomProjection": lambda: cuml.GaussianRandomProjection(), "SparseRandomProjection": lambda: cuml.SparseRandomProjection() } neighbor_models = {"NearestNeighbors": lambda: cuml.NearestNeighbors()} dbscan_model = {"DBSCAN": lambda: cuml.DBSCAN()} umap_model = {"UMAP": lambda: cuml.UMAP()} rf_models = {
        if count > max_molecule:
            break

    logger.info('Initializing Morgan fingerprints...')
    results = db.from_sequence(smiles_list).map(MorganFromSmiles).compute()
    np_fingerprints = np.stack(results).astype(np.float32)

    # convert np.array of shape (n_mols, nBits) into a GPU DataFrame
    df_fingerprints = np2dataframe(np_fingerprints, enable_gpu)

    # prepare one set of clusters
    if pca_components:
        task_start_time = datetime.now()
        if enable_gpu:
            pca = cuml.PCA(n_components=pca_components)
        else:
            pca = sklearn.decomposition.PCA(n_components=pca_components)
        df_fingerprints = pca.fit_transform(df_fingerprints)
        print('Runtime PCA time (hh:mm:ss.ms) {}'.format(datetime.now() - task_start_time))
    else:
        pca = False
        print('PCA has been skipped')

    task_start_time = datetime.now()
    n_clusters = 7
    if enable_gpu:
        kmeans_float = cuml.KMeans(n_clusters=n_clusters)
    else:
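# Hedged standalone sketch (mine, not the source): the GPU/CPU dispatch pattern
# above in self-contained form. MorganFromSmiles and np2dataframe are project
# helpers not shown here, so random data stands in for real fingerprints; the
# truncated else-branch above presumably builds a CPU KMeans, as sketched here.
import numpy as np
import sklearn.cluster
import cuml

fingerprints = np.random.rand(256, 512).astype(np.float32)
enable_gpu = True  # illustrative flag
if enable_gpu:
    kmeans_float = cuml.KMeans(n_clusters=7)
else:
    kmeans_float = sklearn.cluster.KMeans(n_clusters=7)
labels = kmeans_float.fit_predict(fingerprints)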
def _cluster(self, embedding, n_pca):
    """
    Generates UMAP transformation on KMeans labels generated from
    molecular fingerprints.
    """
    if hasattr(embedding, 'compute'):
        embedding = embedding.compute()

    embedding = embedding.reset_index()

    # Before reclustering, remove all columns that may interfere
    embedding, prop_series = self._remove_non_numerics(embedding)
    self.n_molecules, n_obs = embedding.shape

    if self.context.is_benchmark:
        molecular_embedding_sample, spearman_index = self._random_sample_from_arrays(
            embedding, n_samples=self.n_spearman)

    if n_pca and n_obs > n_pca:
        with MetricsLogger('pca', self.n_molecules) as ml:
            if self.pca is None:
                self.pca = cuml.PCA(n_components=n_pca)
                self.pca.fit(embedding)
            embedding = self.pca.transform(embedding)

    with MetricsLogger('kmeans', self.n_molecules) as ml:
        if self.n_molecules < MIN_RECLUSTER_SIZE:
            raise Exception('Reclustering less than %d molecules is not supported.'
                            % MIN_RECLUSTER_SIZE)

        kmeans_cuml = cuml.KMeans(n_clusters=self.n_clusters)
        kmeans_cuml.fit(embedding)
        kmeans_labels = kmeans_cuml.predict(embedding)

        ml.metric_name = 'silhouette_score'
        ml.metric_func = batched_silhouette_scores
        ml.metric_func_kwargs = {}
        ml.metric_func_args = (None, None)

        if self.context.is_benchmark:
            (embedding_sample, kmeans_labels_sample), _ = self._random_sample_from_arrays(
                embedding, kmeans_labels, n_samples=self.n_silhouette)
            ml.metric_func_args = (embedding_sample, kmeans_labels_sample)

    with MetricsLogger('umap', self.n_molecules) as ml:
        umap = cuml.manifold.UMAP()
        Xt = umap.fit_transform(embedding)

        ml.metric_name = 'spearman_rho'
        ml.metric_func = self._compute_spearman_rho
        ml.metric_func_args = (None, None)
        if self.context.is_benchmark:
            X_train_sample, _ = self._random_sample_from_arrays(
                embedding, index=spearman_index)
            ml.metric_func_args = (molecular_embedding_sample, X_train_sample)

    # Add back the columns required for plotting and for correlating data
    # between re-clusterings
    embedding['cluster'] = kmeans_labels
    embedding['x'] = Xt[0]
    embedding['y'] = Xt[1]

    # Add back the prop columns
    for col in prop_series.keys():
        embedding[col] = prop_series[col]

    return embedding
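# Hedged standalone sketch (mine, not the source): the PCA -> KMeans -> UMAP
# pipeline that _cluster implements, on random data and without the
# MetricsLogger/benchmark machinery. Shapes, n_components, and n_clusters are
# illustrative assumptions.
import cudf
import cuml
import numpy as np

emb = cudf.DataFrame(np.random.rand(2000, 64).astype(np.float32))
emb_pca = cuml.PCA(n_components=16).fit_transform(emb)    # reduce noise first
labels = cuml.KMeans(n_clusters=7).fit_predict(emb_pca)   # cluster in PCA space
xy = cuml.manifold.UMAP().fit_transform(emb_pca)          # 2-D layout for plotting
emb['cluster'] = labels
emb['x'], emb['y'] = xy[0], xy[1]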