def re_cluster(self, gdf, new_fingerprints=None, new_chembl_ids=None):
    if gdf.shape[0] == 0:
        return None

    # Before re-clustering, remove all columns that may interfere
    ids = gdf['id']
    chembl_ids = gdf['chembl_id']
    gdf.drop(['x', 'y', 'cluster', 'id', 'chembl_id'], axis=1, inplace=True)

    if new_fingerprints is not None and new_chembl_ids is not None:
        # Add new fingerprints and ChEMBL ids before re-clustering
        fp_df = cudf.DataFrame(new_fingerprints, columns=gdf.columns)
        gdf = gdf.append(fp_df, ignore_index=True)
        chembl_ids = chembl_ids.append(
            cudf.Series(new_chembl_ids), ignore_index=True)

    kmeans_float = KMeans(n_clusters=self.n_clusters)
    kmeans_float.fit(gdf)

    Xt = self.umap.fit_transform(gdf)

    # Add back the columns required for plotting and for correlating data
    # between re-clusterings
    gdf.add_column('x', Xt[0].to_array())
    gdf.add_column('y', Xt[1].to_array())
    gdf.add_column('id', gdf.index)
    gdf.add_column('chembl_id', chembl_ids)
    gdf.add_column('cluster', kmeans_float.labels_.to_array())

    return gdf
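# A minimal, hypothetical driver for re_cluster. The Workflow class below and
# its n_clusters/umap attributes are inferred from the method body; the input
# columns match the ones re_cluster drops and re-adds. This is a sketch that
# assumes the legacy cuDF API the method itself targets (add_column, to_array).
import numpy as np
import cudf
from cuml import KMeans, UMAP

class Workflow:                              # hypothetical container
    def __init__(self, n_clusters=7):
        self.n_clusters = n_clusters
        self.umap = UMAP(n_neighbors=100)

    re_cluster = re_cluster                  # bind the method defined above

n_mols, n_bits = 64, 128
gdf = cudf.DataFrame(
    np.random.randint(0, 2, (n_mols, n_bits)).astype(np.float32))
gdf['x'] = 0.0                               # placeholder plot coordinates
gdf['y'] = 0.0
gdf['cluster'] = 0
gdf['id'] = gdf.index
gdf['chembl_id'] = cudf.Series(['CHEMBL%d' % i for i in range(n_mols)])

gdf = Workflow().re_cluster(gdf)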
def kmeans_fit(X):
    alg = KMeans(n_clusters=params.n_clusters,
                 tol=params.tol,
                 max_iter=params.maxiter,
                 init=X_init,
                 max_samples_per_batch=params.samples_per_batch)
    alg.fit(X)
    return alg
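# Hedged usage sketch: kmeans_fit closes over module-level `params` and
# `X_init`, so this stubs both with illustrative values. cuML's KMeans
# accepts an array of seed centroids via `init`.
import types
import numpy as np

params = types.SimpleNamespace(n_clusters=8, tol=1e-4, maxiter=300,
                               samples_per_batch=32768)
X = np.random.rand(10000, 16).astype(np.float32)
X_init = X[:params.n_clusters].copy()        # seed centroids from the data

model = kmeans_fit(X)
print(model.cluster_centers_.shape)          # (8, 16)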
def kmeans(X, k, round_values=True):
    """ Summarize a dataset with k mean samples weighted by the number of
    data points they each represent.

    Parameters
    ----------
    X : numpy.array or pandas.DataFrame or any scipy.sparse matrix
        Matrix of data samples to summarize (# samples x # features)

    k : int
        Number of means to use for approximation.

    round_values : bool
        For all i, round the ith dimension of each mean sample to match the
        nearest value from X[:,i]. This ensures discrete features always get
        a valid value.

    Returns
    -------
    DenseData object.
    """
    if not rapids_installed:
        raise RuntimeError(
            "cuML is required to use GPU explainers. Check "
            "https://rapids.ai/start.html for more information on how to "
            "install it.")

    if cuml.__version__ >= '21.08':
        from cuml.explainer.sampling import kmeans_sampling
        summary, group_names, labels = kmeans_sampling(X, k, round_values,
                                                       detailed=True)
        return DenseData(summary, group_names, None,
                         1.0 * np.bincount(labels))

    # For backward compatibility
    group_names = [str(i) for i in range(X.shape[1])]
    if str(type(X)).endswith("'pandas.core.frame.DataFrame'>"):
        group_names = X.columns
        X = X.values

    # in case there are any missing values in the data, impute them
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    X = imp.fit_transform(X)

    kmeans = KMeans(n_clusters=k, random_state=0).fit(X)

    if round_values:
        for i in range(k):
            for j in range(X.shape[1]):
                # sparse support courtesy of @PrimozGodec
                xj = X[:, j].toarray().flatten() if issparse(X) else X[:, j]
                ind = np.argmin(np.abs(xj - kmeans.cluster_centers_[i, j]))
                kmeans.cluster_centers_[i, j] = X[ind, j]

    return DenseData(kmeans.cluster_centers_, group_names, None,
                     1.0 * np.bincount(kmeans.labels_))
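# Hedged usage sketch for the summarizer above (assumes a RAPIDS install so
# the rapids_installed gate passes). DenseData comes from SHAP's legacy
# utilities; its .data field holds the k weighted means.
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.rand(1000, 5),
                  columns=['f%d' % i for i in range(5)])
background = kmeans(df, k=10)     # summarize 1,000 rows into 10 means
print(background.data.shape)      # (10, 5)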
def _cluster(self, embedding):
    logger.info('Computing cluster...')
    embedding = embedding.reset_index()
    n_molecules = embedding.shape[0]

    # Before re-clustering, remove all columns that may interfere
    embedding, prop_series = self._remove_non_numerics(embedding)

    with MetricsLogger('random_proj', n_molecules) as ml:
        srp = self.srp_embedding.fit_transform(embedding.values)

        ml.metric_name = 'spearman_rho'
        ml.metric_func = self._compute_spearman_rho
        ml.metric_func_args = (embedding, embedding, srp)

    with MetricsLogger('kmeans', n_molecules) as ml:
        kmeans_cuml = KMeans(n_clusters=self.n_clusters)
        kmeans_cuml.fit(srp)
        kmeans_labels = kmeans_cuml.predict(srp)

        ml.metric_name = 'silhouette_score'
        ml.metric_func = batched_silhouette_scores
        ml.metric_func_kwargs = {}
        ml.metric_func_args = (None, None)
        if self.context.is_benchmark:
            (srp_sample, kmeans_labels_sample), _ = \
                self._random_sample_from_arrays(
                    srp, kmeans_labels, n_samples=self.n_silhouette)
            ml.metric_func_args = (srp_sample, kmeans_labels_sample)

    # Add back the columns required for plotting and for correlating data
    # between re-clusterings
    srp = self.rand_jitter(srp)
    embedding['cluster'] = kmeans_labels
    embedding['x'] = srp[:, 0]
    embedding['y'] = srp[:, 1]

    # Add back the prop columns
    for col in prop_series.keys():
        embedding[col] = prop_series[col]

    return embedding
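# _remove_non_numerics is not shown in this snippet. A plausible sketch,
# purely hypothetical, based on how _cluster uses its return values: it
# splits off non-numeric property columns and returns them for re-attachment.
def _remove_non_numerics(self, df):
    """Sketch: keep only numeric columns for the projection/KMeans steps;
    return the removed series so _cluster can add them back afterwards."""
    prop_series = {}
    for col in df.columns:
        if df[col].dtype == 'object':       # string/property columns
            prop_series[col] = df[col]
    df = df.drop(columns=list(prop_series.keys()))
    return df, prop_series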
def kmeans_sampling(X, k, round_values=True, detailed=False, random_state=0):
    """
    Adapted from :
    https://github.com/slundberg/shap/blob/9411b68e8057a6c6f3621765b89b24d82bee13d4/shap/utils/_legacy.py

    Summarize a dataset (X) using weighted k-means.

    Parameters
    ----------
    X : cuDF or Pandas DataFrame/Series, numpy arrays or
        cuda_array_interface compliant device array.
        Data to be summarized, shape (n_samples, n_features)
    k : int
        Number of means to use for approximation.
    round_values : bool; default=True
        For all i, round the ith dimension of each mean sample to match the
        nearest value from X[:,i]. This ensures discrete features always get
        a valid value.
    detailed : bool; default=False
        To return details of group names and cluster labels of all data
        points.
    random_state : int; default=0
        Sets the random state.

    Returns
    -------
    summary : Summary of the data, shape (k, n_features)
    group_names : Names of the features
    labels : Cluster labels of the data points in the original dataset,
        shape (n_samples, 1)
    """
    output_dtype = get_supported_input_type(X)
    _output_dtype_str = determine_array_type(X)
    cuml.internals.set_api_output_type(_output_dtype_str)
    if output_dtype is None:
        raise TypeError(f"Type of input {type(X)} is not supported. Supported"
                        " dtypes: cuDF DataFrame, cuDF Series, cupy, numba,"
                        " numpy, pandas DataFrame, pandas Series")

    if "DataFrame" in str(output_dtype):
        group_names = X.columns
        X = cp.array(X.values, copy=False)
    elif "Series" in str(output_dtype):
        group_names = X.name
        X = cp.array(X.values.reshape(-1, 1), copy=False)
    else:
        # it's either numpy, cupy or numba
        X = cp.array(X, copy=False)
        try:
            # more than one column
            group_names = [str(i) for i in range(X.shape[1])]
        except IndexError:
            # one column
            X = X.reshape(-1, 1)
            group_names = ['0']

    # in case there are any missing values in the data, impute them
    imp = SimpleImputer(missing_values=cp.nan, strategy='mean',
                        output_type=_output_dtype_str)
    X = imp.fit_transform(X)

    kmeans = KMeans(n_clusters=k, random_state=random_state,
                    output_type=_output_dtype_str).fit(X)

    if round_values:
        for i in range(k):
            for j in range(X.shape[1]):
                # sparse support courtesy of @PrimozGodec
                xj = X[:, j].toarray().flatten() if issparse(X) else X[:, j]
                ind = cp.argmin(cp.abs(xj - kmeans.cluster_centers_[i, j]))
                kmeans.cluster_centers_[i, j] = X[ind, j]

    summary = kmeans.cluster_centers_
    labels = kmeans.labels_
    if detailed:
        return summary, group_names, labels
    else:
        return summary
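# Hedged usage sketch: kmeans_sampling accepts device arrays directly, so a
# CuPy matrix keeps the whole summarization on the GPU (shapes illustrative).
import cupy as cp

X = cp.random.rand(5000, 8, dtype=cp.float32)
summary, group_names, labels = kmeans_sampling(X, k=32, detailed=True)
print(summary.shape)              # (32, 8)
print(labels.shape)               # (5000,)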
y = np.asarray(
    [[1.0, 2.0], [1.0, 4.0], [1.0, 0.0],
     [4.0, 2.0], [4.0, 4.0], [4.0, 0.0]],
    dtype=np.float32)
x = np2cudf(y)

q = np.asarray([[0, 0], [4, 4]], dtype=np.float32)
p = np2cudf(q)

a = np.asarray([[1.0, 1.0], [1.0, 2.0], [3.0, 2.0], [4.0, 3.0]],
               dtype=np.float32)
b = np2cudf(a)
print("input:")
print(b)

print("\nCalling fit")
kmeans_float = KMeans(n_clusters=2, n_gpu=-1)
kmeans_float.fit(b)

print("labels:")
print(kmeans_float.labels_)
print("cluster_centers:")
print(kmeans_float.cluster_centers_)

'''
print("\nCalling Predict")
print("labels:")
print(kmeans_float.predict(p))
print("cluster_centers:")
print(kmeans_float.cluster_centers_)
'''

print("\nCalling fit_predict")
kmeans_float2 = KMeans(n_clusters=2, n_gpu=-1)
print("labels:")
print(kmeans_float2.fit_predict(b))
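# np2cudf is used above but not defined in this snippet. A minimal sketch of
# such a helper (the column naming scheme is an assumption):
import numpy as np
import cudf

def np2cudf(arr):
    """Sketch: wrap a 2-D numpy array as a cuDF DataFrame, one column per
    feature."""
    return cudf.DataFrame({str(i): arr[:, i] for i in range(arr.shape[1])})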
    smiles_list.append(fields[1].decode("utf-8"))
    count += 1
    if count > max:
        break

logger.info('Initializing Morgan fingerprints...')
results = db.from_sequence(smiles_list).map(MorganFromSmiles).compute()
np_array_fingerprints = np.stack(results).astype(np.float32)

# take np.array shape (n_mols, nBits) for GPU DataFrame
gdf = np2cudf(np_array_fingerprints)

# prepare one set of clusters
n_clusters = 7
kmeans_float = KMeans(n_clusters=n_clusters)
kmeans_float.fit(gdf)

# UMAP
umap = UMAP(n_neighbors=100, a=1.0, b=1.0, learning_rate=1.0)
Xt = umap.fit_transform(gdf)
gdf.add_column('x', Xt[0].to_array())
gdf.add_column('y', Xt[1].to_array())
gdf.add_column('cluster', kmeans_float.labels_)

# start dash
v = chemvisualize.ChemVisualization(
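# MorganFromSmiles is referenced above but not defined in this snippet. A
# plausible RDKit-based sketch (radius and bit count are assumptions chosen
# to match the (n_mols, nBits) comment above):
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem

def MorganFromSmiles(smiles, radius=2, nBits=512):
    """Sketch: SMILES string -> fixed-length Morgan fingerprint (float32)."""
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return np.zeros(nBits, dtype=np.float32)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)
    return np.asarray(fp, dtype=np.float32)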