def phenograph_clustering(data: pd.DataFrame,
                          features: list,
                          verbose: bool,
                          global_clustering: bool = False,
                          print_performance_metrics: bool = True,
                          **kwargs):
    """
    Perform high-dimensional clustering of single cell data using the popular
    PhenoGraph algorithm (https://github.com/dpeerlab/PhenoGraph)

    Clustering is performed either on the entire dataframe (if global_clustering is True)
    or on each biological sample, in which case a column should be provided called 'sample_id'
    which this function will group on and perform clustering in turn. In both cases, the
    clustering labels are assigned to a new column named 'cluster_label'.

    Parameters
    ----------
    data: Pandas.DataFrame
    features: list
        Columns to perform clustering on
    verbose: bool
        If True, provides a progress bar when global_clustering is False
    global_clustering: bool (default=False)
        Whether to cluster the whole dataframe or group on 'sample_id' and cluster groups
    print_performance_metrics: bool (default=True)
        Print Calinski-Harabasz Index, Silhouette Coefficient, and Davies-Bouldin Index
        (see https://scikit-learn.org/stable/modules/clustering.html#clustering-performance-evaluation)
    kwargs:
        Additional keyword arguments passed when calling phenograph.cluster

    Returns
    -------
    Pandas.DataFrame, scipy.sparse.base.spmatrix, float
        Modified dataframe with clustering IDs assigned to the column 'cluster_label',
        sparse graph matrix, and modularity score for communities (Q)
    """
    _print = vprint(verbose=verbose)
    data["cluster_label"] = None
    if global_clustering:
        communities, graph, q = phenograph.cluster(data[features], **kwargs)
        data["cluster_label"] = communities
        if print_performance_metrics:
            clustering_performance(data[features], data["cluster_label"].values)
        return data, graph, q
    graphs = dict()
    q = dict()
    for _id, df in data.groupby("sample_id"):
        _print(f"----- Clustering {_id} -----")
        communities, graph, q_ = phenograph.cluster(df[features], **kwargs)
        graphs[_id], q[_id] = graph, q_
        df["cluster_label"] = communities
        data.loc[df.index, ["cluster_label"]] = df.cluster_label
        if print_performance_metrics:
            clustering_performance(df[features], df["cluster_label"].values)
        _print("-----------------------------")
        _print("\n")
    return data, graphs, q
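# Illustrative usage sketch for phenograph_clustering above (not part of the original
# module): it assumes the module's helpers (vprint, clustering_performance) and phenograph
# are importable, and uses a toy DataFrame with a 'sample_id' column plus two hypothetical
# marker columns. The extra k=30 is forwarded to phenograph.cluster via **kwargs.
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
toy = pd.DataFrame({
    "sample_id": ["sample_a"] * 500 + ["sample_b"] * 500,
    "CD4": rng.normal(size=1000),
    "CD8": rng.normal(size=1000),
})
clustered, graphs, modularity = phenograph_clustering(
    toy,
    features=["CD4", "CD8"],
    verbose=True,
    global_clustering=False,
    print_performance_metrics=False,
    k=30,
)
print(clustered["cluster_label"].value_counts())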
def run_phenograph(self, k=30, directed=False, prune=False, min_cluster_size=10, jaccard=True, dis_metric='euclidean', n_jobs=-1, q_tol=1e-3, louvain_time_limit=2000, nn_method='kdtree'): communities, graph, Q = phenograph.cluster( self.data, k=k, directed=directed, prune=prune, min_cluster_size=min_cluster_size, jaccard=jaccard, primary_metric=dis_metric, n_jobs=n_jobs, q_tol=q_tol, louvain_time_limit=louvain_time_limit, nn_method=nn_method) communities = np.add(communities, 1) cluster_df = pd.DataFrame({'cluster': communities}, index=self.data.index) self.clusterinfo = ClusterInfo(cluster_df, graph, Q, 'phenograph') if self.parent is not None: self.parent.clusterinfo = ClusterInfo(cluster_df, graph, Q, 'phenograph') return communities, Q
def apply(self):
    communities, graph, Q = phenograph.cluster(data=self.matrix,
                                               k=self.n_neighbours,
                                               min_cluster_size=self.min_size,
                                               n_jobs=self.threads)
    # add 1 to the cluster labels to shift -1 values (outliers) to zero
    communities = communities + 1
    self.results = communities
    # symmetrise the sparse kNN graph and turn edge weights into distances
    arr = graph.toarray()
    arr_full = arr + arr.T
    np.fill_diagonal(arr_full, 1)
    dist = (arr_full - arr_full.max()) * (-1)
    np.fill_diagonal(dist, 0)
    self.distance_matrix = dist
    self.modularity = Q
    set_c = set(communities)
    logging.debug('set of communities found by phenograph: ' + str(set_c))
    # make sure at least 2 communities are found
    if len(set_c) < 2:
        logging.error('Fewer than 2 communities were found')
        sys.exit(1)
def run_phenograph(distance, k=20, outdir='', prefix='', **kwargs): """ Runs Phenograph on an expression- or PCA-based distance matrix. Parameters ---------- distance: ndarray cell x cell distance matrix k: int (default 20) number of nearest neighbors to use outdir: str (default '') prefix: str (default '') label: str (default '') Returns ------- communities graph Q : float """ knn = get_knn(distance, k) communities, graph, Q = phenograph.cluster(knn, **kwargs) if outdir is not None and len(outdir)>0: fileprefix = '{}/{}'.format(outdir, prefix) clusterfile = fileprefix + '.pg.txt' np.savetxt(clusterfile, communities, fmt='%i') logfile = fileprefix + '.pg.info.txt' with open(logfile, 'w') as f: f.write('k:{}\nQ:{}'.format(k, Q)) return communities, graph, Q
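# Minimal usage sketch for run_phenograph above (illustrative, not from the original code):
# it assumes get_knn is available from the same module. The distance matrix is a synthetic
# cell x cell Euclidean distance matrix; passing outdir='' skips writing output files.
import numpy as np
from scipy.spatial.distance import pdist, squareform

X = np.random.rand(300, 20)                        # 300 cells x 20 components (toy data)
distance = squareform(pdist(X, metric='euclidean'))
communities, graph, Q = run_phenograph(distance, k=20, outdir='', prefix='')
print('clusters:', len(np.unique(communities)), 'modularity:', Q)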
def runclustering(self, markertoexclude, adata): """ Function for execution of phenograph analysis :param markertoexclude: :param adata: :return: """ marker = adata.var_names.to_list() markertoinclude = [i for i in marker if i not in markertoexclude] data = adata[:, markertoinclude].to_df() self.log.info("Markers used for Phenograph clustering:") self.log.info(data.columns) if (self.scale == True): min_max_scaler = preprocessing.MinMaxScaler((1, 100)) x_scaled = min_max_scaler.fit_transform(data.values) data = pd.DataFrame(x_scaled, columns=data.columns) self.new_head = [] self.new_head.append([column.split("::")[-1] for column in data]) data.columns = self.new_head data.diff().hist(color="k", alpha=0.5, bins=50, grid=False, xlabelsize=8, ylabelsize=8) plt.tight_layout() plt.savefig("/".join([ self.output_folder, ".".join(["_".join([self.analysis_name]), "pdf"]) ])) communities, graph, Q = pg.cluster(data.values, k=int(self.k_coef), directed=False, prune=False, min_cluster_size=1, n_jobs=int(self.thread)) # create dataframe with Phenograph output self.dfPheno = pd.DataFrame(communities) # shift of one unit the name of cluster self.dfPheno["Phenograph"] = self.dfPheno[0] + 1 # remove first column self.dfPheno = self.dfPheno.drop(columns=[0], axis=1) self.dfPheno.set_index(adata.obs.index, inplace=True) adata.obs['cluster'] = self.dfPheno adata.obs['Phenograph_cluster'] = self.dfPheno reducer = umap.UMAP(random_state=42, n_neighbors=10, min_dist=0.001) embedding = reducer.fit_transform(data.values) adata.obsm['X_umap'] = embedding self.tmp_df = self.tmp_df.astype(int) self.tmp_df['UMAP_1'] = embedding[:, 0] self.tmp_df['UMAP_2'] = embedding[:, 1] self.tmp_df['Cluster_Phenograph'] = self.dfPheno self.tmp_df.to_csv("/".join([ self.output_folder, ".".join(["_".join([self.analysis_name]), "csv"]) ]), header=True, index=False) return adata
def run_phenograph(data): print(">>> Running PhenoGraph") tic = time.time() communities, _, _ = phenograph.cluster(data) toc = time.time() print(" PhenoGraph found {} clusters".format(len(np.unique(communities)))) print(" PhenoGraph took {:.2f} s".format(toc - tic)) return communities
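# Tiny usage sketch for the timing wrapper above (illustrative only); in practice the input
# would typically be a PCA-reduced expression matrix rather than random values.
import numpy as np

communities = run_phenograph(np.random.rand(2000, 30))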
def adjusted_rand_score_vector(normalized_matrices): PCA_model = PCA(n_components=1000, svd_solver='randomized') PC_column_names = ['PC' + str(i) for i in list(range(1, 1001))] components_normed_data_full = pd.DataFrame(data=PCA_model.fit_transform( normalized_matrices[1]), columns=PC_column_names) full_communities, full_graph, full_Q = phenograph.cluster( components_normed_data_full) adj_rand_scores = [] for split in list(np.array(range(1, 10)) / 10): components_normed_data_downsample = pd.DataFrame( data=PCA_model.fit_transform(normalized_matrices[split]), columns=PC_column_names) downsample_communities, downsample_graph, downsample_Q = phenograph.cluster( components_normed_data_downsample) adj_rand_scores.append( adjusted_rand_score(full_communities, downsample_communities)) return adj_rand_scores
def determine_cell_clusters(data, k=50): """Run phenograph for clustering cells :param data: Principal components of the data. :param k: Number of neighbors for kNN graph construction :return: Clusters """ # Cluster and cluster centrolds communities, _, _ = phenograph.cluster(data, k=k) communities = pd.Series(communities, index=data.index) return communities
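# Usage sketch for determine_cell_clusters above (illustrative): the function expects a
# pandas DataFrame because the returned Series reuses data.index, e.g. principal components
# indexed by cell barcode. Data values and index names here are hypothetical.
import numpy as np
import pandas as pd

pcs = pd.DataFrame(np.random.rand(1000, 30),
                   index=[f"cell_{i}" for i in range(1000)])
clusters = determine_cell_clusters(pcs, k=50)
print(clusters.value_counts())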
def phenograph_metaclustering(data: pd.DataFrame,
                              features: list,
                              verbose: bool = True,
                              summary_method: str = "median",
                              scale_method: str or None = None,
                              scale_kwargs: dict or None = None,
                              print_performance_metrics: bool = True,
                              **kwargs):
    """
    Meta-clustering with the PhenoGraph algorithm. This function will summarise the clusters
    in 'data' (where cluster IDs should be contained in a column named 'cluster_label') and
    then 'cluster the clusters' using PhenoGraph.

    Parameters
    ----------
    data: Pandas.DataFrame
        Clustered data with columns for sample_id and cluster_label
    features: list
        Columns clustering is performed on
    summary_method: str (default="median")
        How to summarise the clusters for meta-clustering
    print_performance_metrics: bool (default=True)
        Print Calinski-Harabasz Index, Silhouette Coefficient, and Davies-Bouldin Index
        (see https://scikit-learn.org/stable/modules/clustering.html#clustering-performance-evaluation)
    verbose: bool (default=True)
        Whether to provide feedback to stdout
    scale_method: str, optional
        Perform scaling of centroids; see cytopy.transform.Scaler
    scale_kwargs: dict, optional
        Additional keyword arguments passed to Scaler
    kwargs:
        Keyword arguments passed to phenograph.cluster

    Returns
    -------
    Pandas.DataFrame, scipy.sparse.base.spmatrix, float
        Updated dataframe with a new column named 'meta_label' containing the meta-clustering
        associations, the sparse graph matrix, and the modularity score (Q)
    """
    vprint_ = vprint(verbose)
    vprint_("----- Phenograph meta-clustering ------")
    metadata = summarise_clusters(data, features, scale_method, scale_kwargs, summary_method)
    vprint_("...summarising clusters")
    vprint_("...clustering the clusters")
    communities, graph, q = phenograph.cluster(metadata[features].values, **kwargs)
    metadata["meta_label"] = communities
    if print_performance_metrics:
        clustering_performance(metadata[features], metadata["meta_label"].values)
    vprint_("...assigning meta-labels")
    data = _assign_metalabels(data, metadata)
    vprint_("------ Complete ------")
    return data, graph, q
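# The core of the meta-clustering step above, sketched directly with pandas + phenograph
# (illustrative, not the library's implementation): summarise each sample_id/cluster_label
# group by its median marker expression, then run PhenoGraph on the resulting centroids.
# Column names and the small k are toy choices.
import numpy as np
import pandas as pd
import phenograph

rng = np.random.default_rng(0)
df = pd.DataFrame({
    "sample_id": rng.choice(["a", "b", "c"], size=3000),
    "cluster_label": rng.integers(0, 8, size=3000),
    "CD4": rng.normal(size=3000),
    "CD8": rng.normal(size=3000),
})
centroids = (df.groupby(["sample_id", "cluster_label"])[["CD4", "CD8"]]
               .median()
               .reset_index())
meta_labels, graph, q = phenograph.cluster(centroids[["CD4", "CD8"]].values, k=5)
centroids["meta_label"] = meta_labels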
def _one_fit(self): print("\nCreating downsampled doublets...") self._createDoublets() # Normalize combined augmented set print("Normalizing...") aug_counts = self.normalizer( np.append(self._raw_counts, self._raw_synthetics, axis=0)) self._norm_counts = aug_counts[:self._num_cells] self._synthetics = aug_counts[self._num_cells:] print("Running PCA...") # Get phenograph results pca = PCA(n_components=self.n_components) print("Clustering augmented data set with Phenograph...\n") reduced_counts = pca.fit_transform(aug_counts) fullcommunities, _, _ = phenograph.cluster( reduced_counts, **self.phenograph_parameters) min_ID = min(fullcommunities) self.communities_ = fullcommunities[:self._num_cells] self.synth_communities_ = fullcommunities[self._num_cells:] community_sizes = [ np.count_nonzero(fullcommunities == i) for i in np.unique(fullcommunities) ] print("Found communities [{0}, ... {2}], with sizes: {1}\n".format( min(fullcommunities), community_sizes, max(fullcommunities))) # Count number of fake doublets in each community and assign score # Number of synth/orig cells in each cluster. synth_cells_per_comm = collections.Counter(self.synth_communities_) orig_cells_per_comm = collections.Counter(self.communities_) community_IDs = orig_cells_per_comm.keys() community_scores = { i: float(synth_cells_per_comm[i]) / (synth_cells_per_comm[i] + orig_cells_per_comm[i]) for i in community_IDs } scores = np.array([community_scores[i] for i in self.communities_]) community_p_values = { i: hypergeom.cdf(synth_cells_per_comm[i], aug_counts.shape[0], self._synthetics.shape[0], synth_cells_per_comm[i] + orig_cells_per_comm[i]) for i in community_IDs } p_values = np.array([community_p_values[i] for i in self.communities_]) if min_ID < 0: scores[self.communities_ == -1] = np.nan p_values[self.communities_ == -1] = np.nan return scores, p_values
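# Standalone sketch of the doublet scoring used at the end of _one_fit above (illustrative,
# toy community labels): each community's score is its synthetic-cell fraction, and
# significance comes from a hypergeometric test, mirroring the hypergeom.cdf call in the method.
import collections
import numpy as np
from scipy.stats import hypergeom

orig_comms = np.array([0, 0, 1, 1, 1, 2])     # communities of observed cells
synth_comms = np.array([0, 1, 1, 2, 2, 2])    # communities of synthetic doublets

synth_per = collections.Counter(synth_comms)
orig_per = collections.Counter(orig_comms)
n_total = len(orig_comms) + len(synth_comms)
n_synth = len(synth_comms)
for comm in sorted(orig_per):
    members = synth_per[comm] + orig_per[comm]
    score = synth_per[comm] / members
    p_value = hypergeom.cdf(synth_per[comm], n_total, n_synth, members)
    print(comm, round(score, 2), round(p_value, 3))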
def phenograph_cluster(data, f_save=None, max_samps=None): import phenograph if max_samps is not None and data.shape[0] > max_samps: print("Subsampling") #full_data = data.copy() data = data[np.random.choice(data.shape[0], max_samps, replace=False), :] communities, graph, Q = phenograph.cluster(data, k=100) print(communities) if f_save is not None: f_save = f_save.replace(".p", "") + ".p" pickle.dump(communities, open(f_save, "wb")) return communities
def condense_segmented_clusters(self, segmented_data, min_cluster_size=1): # Cluster the segmented counts n_cells = segmented_data.shape[0] n_regions = segmented_data.shape[1] n_neighbours = max(int(n_cells / 10), 2) # avoid errors print(f"n_neighbours to be used: {str(n_neighbours)}") if min_cluster_size < 1 and min_cluster_size > 0: min_cluster_size = min_cluster_size * n_cells min_cluster_size = int(min_cluster_size) print(f"Setting min_cluster_size to {min_cluster_size}") # Cluster the normalised segmented data normalised_segmented_data = segmented_data / np.sum( segmented_data, axis=1)[:, np.newaxis] communities, graph, Q = phenograph.cluster( data=normalised_segmented_data, k=n_neighbours, n_jobs=1, jaccard=True, min_cluster_size=min_cluster_size) communities_df = pd.DataFrame(communities, columns=["cluster"]) communities_df["cell_barcode"] = communities_df.index communities_df = communities_df[["cell_barcode", "cluster"]] community_dict = dict((Counter(communities))) community_ids = sorted(list(community_dict)) # Compute (unnormalised) average counts of each cluster avg_segmented_counts = np.empty(segmented_data.shape) condensed_avg_segmented_counts = np.empty( (len(community_ids), n_regions)) cluster_sizes = np.zeros((len(community_ids), )) # Offset -1 if there is one if np.min(community_ids) == -1: communities = np.array(communities) + 1 community_ids = np.array(community_ids) + 1 for id in community_ids: # Use robust mean? avg_segmented_counts[np.where(communities == id)[0]] = np.mean( segmented_data[np.where(communities == id)[0], :], axis=0) condensed_avg_segmented_counts[id] = avg_segmented_counts[np.where( communities == id)[0][0], :] cluster_sizes[id] = np.where(communities == id)[0].shape[0] print(f"Found {len(community_ids)} clusters.") print(f"Cluster sizes: {cluster_sizes}") self.cluster_assignments = communities return condensed_avg_segmented_counts, cluster_sizes, communities, Q
def phenograph_metaclustering(data: pd.DataFrame,
                              features: list,
                              verbose: bool = True,
                              summary_method: callable = np.median,
                              norm_method: str or None = "norm",
                              norm_kwargs: dict or None = None,
                              **kwargs):
    """
    Meta-clustering with the PhenoGraph algorithm. This function will summarise the clusters
    in 'data' (where cluster IDs should be contained in a column named 'cluster_id') and
    then 'cluster the clusters' using PhenoGraph.

    Parameters
    ----------
    data: Pandas.DataFrame
        Clustered data with columns for sample_id and cluster_id
    features: list
        Columns clustering is performed on
    summary_method: callable (default=np.median)
        Function to apply to each sample_id/cluster_id group to summarise the clusters
        for meta-clustering
    norm_method: str or None
        If provided, method used to normalise data prior to summarising
    norm_kwargs: dict, optional
        Additional keyword arguments passed to CytoPy.flow.transform.scaler
    verbose: bool (default=True)
        Whether to provide feedback to stdout
    kwargs:
        Keyword arguments passed to phenograph.cluster

    Returns
    -------
    Pandas.DataFrame, scipy.sparse.base.spmatrix, float
        Updated dataframe with a new column named 'meta_label' containing the meta-clustering
        associations, the sparse graph matrix, and the modularity score (Q)
    """
    vprint_ = vprint(verbose)
    vprint_("----- Phenograph meta-clustering ------")
    norm_kwargs = norm_kwargs or {}
    metadata = _meta_preprocess(data, features, summary_method, norm_method, **norm_kwargs)
    vprint_("...summarising clusters")
    vprint_("...clustering the clusters")
    communities, graph, q = phenograph.cluster(metadata[features].values, **kwargs)
    metadata["meta_label"] = communities
    vprint_("...assigning meta-labels")
    data = _asign_metalabels(data, metadata)
    vprint_("------ Complete ------")
    return data, graph, q
def cluster_gene_trends(trends, k=150, n_jobs=-1): """Function to cluster gene trends :param trends: Matrix of gene expression trends :param k: K for nearest neighbor construction :param n_jobs: Number of jobs for parallel processing :return: Clustering of gene trends """ # Standardize the trends trends = pd.DataFrame(StandardScaler().fit_transform(trends.T).T, index=trends.index, columns=trends.columns) # Cluster clusters, _, _ = phenograph.cluster(trends, k=k, n_jobs=n_jobs) clusters = pd.Series(clusters, index=trends.index) return clusters
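# Usage sketch for cluster_gene_trends above (illustrative): rows are genes, columns are
# pseudotime bins, and each trend is z-scored before clustering. Toy values and a smaller k
# are used because the example matrix is small.
import numpy as np
import pandas as pd

trends = pd.DataFrame(np.random.rand(500, 100),
                      index=[f"gene_{i}" for i in range(500)])
gene_clusters = cluster_gene_trends(trends, k=30)
print(gene_clusters.value_counts())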
def run_phenograph_approx_knn(X, k=20, outdir='', prefix='', **kwargs): """ Runs Phenograph on an expression- or PCA-based distance matrix. Parameters ---------- X: ndarray cell x feature data matrix k: int (default 20) number of nearest neighbors to use outdir: str (default '') prefix: str (default '') label: str (default '') Returns ------- communities graph Q : float """ assert X.shape[0] > X.shape[1] fileprefix = '{}/{}'.format(outdir, prefix) knn_file = f'{fileprefix}.knn{k}_approx.mtx' if os.path.exists(knn_file): knn = mmread(knn_file) else: knn = get_approx_knn(X, k) #.tolil() mmwrite(knn_file, knn) print(83, knn.shape) communities, graph, Q = phenograph.cluster(knn, **kwargs) if outdir is not None and len(outdir) > 0: fileprefix = '{}/{}'.format(outdir, prefix) clusterfile = fileprefix + '.pg.txt' np.savetxt(clusterfile, communities, fmt='%i') logfile = fileprefix + '.pg.info.txt' with open(logfile, 'w') as f: f.write('k:{}\nQ:{}'.format(k, Q)) return communities, graph, Q
def plot_phenograph(dataset='big clusters', primary_metric='euclidean', lowrank=False, k=30, min_cluster_size=10): key = dataset + ' lowrank' if lowrank else dataset corr = correls[key] if dataset == 'big clusters': metadata = big_clusters_cells palette = 'Set2' cluster_col = 'cluster_id' elif dataset == 'amacrine': metadata = amacrine_cells palette = 'husl' cluster_col = 'cluster_id' community_col = 'community' communities, graph, Q = phenograph.cluster( corr, k=k, primary_metric=primary_metric, min_cluster_size=min_cluster_size) network = networkx.from_scipy_sparse_matrix(graph) positions = networkx.spring_layout(network) nodes_source = ColumnDataSource(get_nodes_specs( positions, metadata, corr.index, communities, other_cluster_col=cluster_col, community_col=community_col, palette=palette)) edges_source = ColumnDataSource(get_edges_specs(network, positions)) # --- First tab: KNN clustering --- # tab1 = plot_graph(nodes_source, edges_source, legend_col=community_col, color_col=f'{community_col}_color', tab=True, title='KNN Clustering') # --- Second tab: Clusters from paper --- # tab2 = plot_graph(nodes_source, edges_source, legend_col='cluster_n_celltype', tab=True, color_col='other_cluster_color', title="Clusters from paper") tabs = Tabs(tabs=[tab1, tab2]) show(tabs)
def pg_cluster(file_name, k=30, min_cluster_size=10): ''' Run PhenoGraph clustering :param file_name: file base name :param k: kNN's K :param min_cluster_size: minimal number of grouped points to form cluster ''' file_path_single_normalized = build_file_path(file_name, suffix='normalized') file_path_single_labeled = build_file_path(file_name, suffix='labeled') file_path_single_cluster = build_file_path(file_name, suffix=COL_NAME_CLUSTER) markers = read_markers() data = load_csv(file_path_single_normalized).filter( items=markers).as_matrix() assert data.shape[1] == len(markers) print('\tCluster {} points in {}'.format(data.shape[0], file_name), flush=True) communities, graph, Q = cluster(data, k=k, nn_method='kdtree', min_cluster_size=min_cluster_size) data, graph = None, None print('Found {} clusters'.format(len(unique(communities))), flush=True) frame = load_csv(file_path_single_normalized) frame[COL_NAME_CLUSTER] = communities save_csv(frame, file_path_single_labeled) # medians & counts cluster_frame = frame.groupby(COL_NAME_CLUSTER, as_index=False).median() cluster_frame = cluster_frame[ cluster_frame[COL_NAME_CLUSTER] != -1] # skip -1 which means under min cluster size cluster_frame[COL_NAME_COUNT_CELL] = frame.groupby( COL_NAME_CLUSTER)[COL_NAME_CLUSTER].count() save_csv(cluster_frame, file_path_single_cluster) print('Clustering successful', flush=True)
def tsne(raw_counts, labels, n_components=30, n_jobs=-1, show=False, save=None): """Produce a tsne plot of the data with doublets in black Args: raw_counts (ndarray): cells by genes count matrix labels (ndarray): predicted doublets from predict method n_components (int, optional): number of PCs to use prior to TSNE n_jobs (int, optional): number of cores to use for TSNE, -1 for all show (bool, optional): If True, runs plt.show() save (str, optional): filename for saved figure, figure not saved by default Returns: matplotlib figure ndarray: tsne reduction """ norm_counts = normalize_counts(raw_counts) reduced_counts = PCA(n_components=n_components, svd_solver='randomized').fit_transform(norm_counts) communities, _, _ = phenograph.cluster(reduced_counts) tsne_counts = TSNE(n_jobs=-1).fit_transform(reduced_counts) fig, axes = plt.subplots(1, 1, figsize=(3, 3), dpi=200) axes.scatter(tsne_counts[:, 0], tsne_counts[:, 1], c=communities, cmap=plt.cm.tab20, s=1) axes.scatter(tsne_counts[:, 0][labels], tsne_counts[:, 1] [labels], s=3, edgecolor='k', facecolor='k') axes.set_title('Cells with Detected\n Doublets in Black') plt.xticks([]) plt.yticks([]) axes.set_xlabel('{} doublets out of {} cells.\n {}% across-type doublet rate.'.format( np.sum(labels), raw_counts.shape[0], np.round(100 * np.sum(labels) / raw_counts.shape[0], 2))) if show is True: plt.show() if isinstance(save, str): fig.savefig(save, format='pdf', bbox_inches='tight') return fig, tsne_counts
def main():
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='author: {0} mail: {1}'.format(__author__, __mail__))
    # define -s/--seq once; repeating add_argument with the same option string raises an argparse error
    parser.add_argument('-s', '--seq', help='fasta file', dest='seq', required=True)
    args = parser.parse_args()
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s - %(message)s")
    file = open(
        "/asnas/wangqf_group/suyx/Project_scRNA_seq/Analysis/PublicData/cell/GSE116256/suppl/GSM3587923_AML1012-D0.dem.txt"
    )
    data = file.readlines()
    file.close()
    temp = data[:]
    cellID = data[0].strip().split("\t")[1:]
    gene = [i.strip().split("\t")[0] for i in data[1:]]
    counts = [i.strip().split("\t")[1:] for i in data[1:]]
    counts = [[int(j) for j in i] for i in counts]
    counts_np = numpy.array(counts)
    counts_np = counts_np.T
    communities, graph, Q = phenograph.cluster(counts_np)
PATH_M = 'output/CD/Euclidean/' FILENAMES = sorted(listdir(PATH_M)) k_ = 30 ari = np.zeros((14)) v = np.zeros((14)) n_clusters = np.zeros((14)) index = [] for i in np.arange(1, 15): df = pd.read_table(PATH_M + str(i) + '.' + str(i), sep=',', index_col=None, header=None) features = list(df.columns)[:-1] target = df.iloc[:, -1] communities, graph, Q = phenograph.cluster(df.loc[:, features], k=k_, primary_metric='Euclidean') ari[i - 1] = adjusted_rand_score(target, communities) v[i - 1] = v_measure_score(target, communities) n_clusters[i - 1] = len(np.unique(communities)) print(i, v[i - 1], n_clusters[i - 1]) ari = pd.DataFrame(ari, index=np.arange(1, 15)) v = pd.DataFrame(v, index=np.arange(1, 15)) n_clusters = pd.DataFrame(n_clusters, index=np.arange(1, 15)) ari.to_csv('output/CD/ARI_k=' + str(k_) + '_E_rerun.csv') v.to_csv('output/CD/V_k=' + str(k_) + '_E_rerun.csv') n_clusters.to_csv('output/CD/N_clusters_k=' + str(k_) + '_E_rerun.csv')
import argparse import pandas as pd import phenograph # This script runs Phenograph # Get input arguments parser = argparse.ArgumentParser() parser.add_argument('--data', help='Infile (rows = cells, columns = features)') parser.add_argument('--out', help='Outfile (clusters)') parser.add_argument('-k', help='Number of neighbors for kNN graph', type=int, default=50) parser.add_argument('--metric', help='Distance metric to use', choices=['manhattan', 'euclidean', 'cosine', 'correlation'], default='cosine') parser.add_argument('--ncores', help='Number of cores to use', type=int, default=-1) args = parser.parse_args() # Run phenograph data = pd.read_table(args.data) communities, graph, Q = phenograph.cluster(data, k=args.k, primary_metric=args.metric, n_jobs=args.ncores) # Write output out = open(args.out, 'w') for xi in communities: out.write('%s\n' %(xi)) out.close()
normalise = args.normalise n_neighbours = args.n_neighbours input_data = np.loadtxt(input_data_file, delimiter=',') # Cluster the segmented counts N, P = input_data.shape K = max(int(N / 10), 2) # avoid errors if n_neighbours: K = int(n_neighbours) print(f"n_neighbours to be used: {str(K)}") # Cluster the normalised segmented data if normalise: print("Will cluster normalised data.") input_data = input_data / np.sum(input_data, axis=1)[:, np.newaxis] * P communities, graph, Q = phenograph.cluster(data=input_data, k=K, n_jobs=1, jaccard=True, min_cluster_size=1, seed=42) print(f"Found {len(np.unique(communities))} clusters.") input_data_file = os.path.splitext(input_data_file)[0] out_file = input_data_file + '_cluster_assignments.txt' np.savetxt(out_file, communities, delimiter=",") print(f"Saved the cluster assignments into {out_file}.")
def scalable_cluster(latent_code, kmeans_num=500, cluster_num=400, display_step=50, phenograh_neighbor=30 ): ''' Scalable cluster: To perform graph clustering on large-scale data, we designed a scalable clustering strategy by combining k-means and PhenoGraph. Briefly, we divide cells into M (kmeans_num) groups of equal size and perform K-means (cluster_num) clustering on each group independently. The whole dataset is split to M×K clusters and we only input the cluster centroids into PhenoGraph for graph clustering. Finally, each cell is assigned to graph clusters according to the cluster labels of its nearest centroids. Parameters: latent_code: n*m matrix; n = number of cells, m = dimension of feature representation. kmeans_num: number of independent K-means clusterings used. This is also the subset number. cluster_num: cluster number for each K-means clustering. This is also the "n_clusters" in KMeans function in sklearn package. display_step: displaying the process of K-means clustering. phenograh_neighbor: "k" parameter in PhenoGraph package. Output: Cluster labels for input cells. Altschuler & Wu Lab 2018. Software provided as is under Apache License 2.0. ''' print('Scalable clustering:') print('Use %d subsets of cells for initially clustering...' % kmeans_num) stamp = np.floor(np.linspace(0, latent_code.shape[0], kmeans_num + 1)) stamp = stamp.astype(int) cluster_ceter = np.zeros([kmeans_num * cluster_num, latent_code.shape[1]]) mapping_sample_kmeans = np.zeros(latent_code.shape[0]) for i in range(kmeans_num): low_bound = stamp[i] upp_bound = stamp[i + 1] sample_range = np.arange(low_bound, upp_bound) select_sample = latent_code[sample_range, :] kmeans = KMeans(n_clusters=cluster_num, random_state=0).fit(select_sample) label = kmeans.labels_ for j in range(cluster_num): cluster_sample_idx = np.nonzero(label == j)[0] cluster_sample = select_sample[cluster_sample_idx, :] cluster_ceter[i * cluster_num + j, :] = np.mean(cluster_sample, axis=0) mapping_sample_kmeans[sample_range[cluster_sample_idx] ] = i * cluster_num + j if i % display_step == 0: print('\tK-means clustering for %d subset.' % i) print('Finish intially clustering by K-means.') print('Start PhenoGraph clustering...\n') label_pheno, graph, Q = phenograph.cluster( cluster_ceter, k=phenograh_neighbor, n_jobs=1) label = np.zeros(latent_code.shape[0]) for i in range(label_pheno.max() + 1): center_index = np.nonzero(label_pheno == i)[0] for j in center_index: sample_index = np.nonzero(mapping_sample_kmeans == j)[ 0] # samples belong to this center label[sample_index] = i print('Finish density down-sampling clustering.') return label
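# Usage sketch for scalable_cluster above (illustrative): kmeans_num and cluster_num are
# reduced from the defaults purely so the toy example runs quickly; the keyword
# phenograh_neighbor follows the spelling in the function signature.
import numpy as np

latent_code = np.random.rand(20000, 50)
labels = scalable_cluster(latent_code,
                          kmeans_num=20,
                          cluster_num=50,
                          display_step=5,
                          phenograh_neighbor=30)
print('found %d clusters' % len(np.unique(labels)))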
import numpy as np
import pandas as pd
import phenograph
from scipy import stats
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import v_measure_score, accuracy_score
from sklearn.model_selection import LeaveOneOut
from statsmodels.stats.multitest import multipletests

patient_id = [1, 9, 14, 19, 20, 22][0]
df_E = pd.read_csv('Output_TDMLMJ/Euclidean/Euclidean_' + str(patient_id),
                   index_col=None, header=None)
labels = pd.read_csv('Labels_test_CD_CTRL_HMIS2.csv', index_col=0, header=0)
# assign a sample ID to each block of 10,000 cells (.loc replaces the removed .ix API)
for i in np.arange(0, 28):
    labels.loc[int(i * 10000):int((i + 1) * 10000), 'Sample ID'] = i + 1
features_E = list(df_E.columns[:-1])
communities_E, graph_E, Q_E = phenograph.cluster(df_E.loc[:, features_E], k=30)
labels['PhenoGraph_E'] = communities_E
ct_E = pd.crosstab(labels.loc[:, 'Sample ID'], labels.loc[:, 'PhenoGraph_E'],
                   normalize=True)
wrs_E = np.zeros(len(ct_E.columns))
p_wrs_E = np.zeros(len(ct_E.columns))
for cluster in ct_E.columns:
    wrs_E[cluster], p_wrs_E[cluster] = stats.mannwhitneyu(
        ct_E.iloc[0:14, cluster], ct_E.iloc[14:, cluster])
q_wrs_E = multipletests(p_wrs_E, method='fdr_bh')[1]
results_E = pd.DataFrame({'WRS': wrs_E, 'P': p_wrs_E, 'Q': q_wrs_E})
print(term_df.head()) gc.collect() # Vectorize vectorizer = TfidfVectorizer(stop_words="english", strip_accents="ascii", max_features=2**12) X = vectorizer.fit_transform(tqdm(term_df["body_text"].values)) gc.collect() print(X.shape) # In[5]: #%% # Louvain clustering of text communities, graph, Q = phenograph.cluster(X, k=100) num_lclusters = len(set(communities)) print("Louvain clustering classified the papers into {} clusters".format( num_lclusters)) #K-means clustering of text k = num_lclusters # equate the number of clusters for both for comparison kmeans = MiniBatchKMeans(n_clusters=k) y_pred = kmeans.fit_predict(X) num_kclusters = len(set(y_pred)) print("K-means clustering classified the papers into {} clusters".format( num_kclusters)) # In[6]: # Dimensionality reduction with UMAP
scdata = magic.mg.SCData.from_csv( "/Users/vincentliu/Desktop/Pe'er Lab/Summer 2017/Data/pbmc_4k_short.csv") # log transform the data with default pseudocount 0.1 print("log-transforming the data...") scdata.log_transform_scseq_data() """ # run tsne on the processed data and store the tsne values into a pd dataframe print("running tSNE on the data...") scdata.run_tsne() tsne = scdata.tsne""" # run phenograph on the processed data print("starting PhenoGraph...") processed = scdata.data communities, graph, Q = phenograph.cluster(processed, k=15) print(len(communities)) communities = ['0' if x == 3 else '1' for x in communities] print(len(communities)) """ toPlot = tsne.assign(com=pd.Series(communities).values) clusterRec = {} for index, row in toPlot.iterrows(): if row['com'] in clusterRec: count = clusterRec[row['com']][2] new1 = (clusterRec[row['com']][0] * count + row['tSNE1']) / (count+1) new2 = (clusterRec[row['com']][1] * count + row['tSNE2']) / (count+1) clusterRec[row['com']] = [new1, new2, count+1] else: clfusterRec[row['com']] = [row['tSNE1'], row['tSNE2'], 1]
#!/usr/bin/python import phenograph import numpy ## Read data. data = numpy.loadtxt( 'intersect_all_first_1m.tsv.gz') #, dtype=<class 'integer'>) ## Run phenograph communities, graph, Q = phenograph.cluster(data) ## Save. numpy.savetxt('communities.txt.gz', communities, fmt='%.d') #numpy.savetxt('graph.txt.gz', graph)
def muse_fit_predict(data_x, data_y, label_x, label_y, latent_dim=100, n_epochs=500, lambda_regul=5, lambda_super=5): """ MUSE model fitting and predicting: This function is used to train the MUSE model on multi-modality data Parameters: data_x: input for transcript modality; matrix of n * p, where n = number of cells, p = number of genes. data_y: input for morphological modality; matrix of n * q, where n = number of cells, q is the feature dimension. label_x: initial reference cluster label for transcriptional modality. label_y: inital reference cluster label for morphological modality. latent_dim: feature dimension of joint latent representation. n_epochs: maximal epoch used in training. lambda_regul: weight for regularization term in the loss function. lambda_super: weight for supervised learning loss in the loss function. Output: latent: joint latent representation learned by MUSE. reconstruct_x:reconstructed feature matrix corresponding to input data_x. reconstruct_y:reconstructed feature matrix corresponding to input data_y. latent_x: modality-specific latent representation corresponding to data_x. latent_y: modality-specific latent representation corresponding to data_y. Feng Bao @ Altschuler & Wu Lab @ UCSF 2022. Software provided as is under MIT License. """ """ initial parameter setting """ # parameter setting for neural network n_hidden = 128 # number of hidden node in neural network learn_rate = 1e-4 # learning rate in the optimization batch_size = 64 # number of cells in the training batch n_epochs_init = 200 # number of training epoch in model initialization print_epochs = 50 # epoch interval to display the current training loss cluster_update_epoch = 200 # epoch interval to update modality-specific clusters # read data-specific parameters from inputs feature_dim_x = data_x.shape[1] feature_dim_y = data_y.shape[1] n_sample = data_x.shape[0] # GPU configuration # config = tf.ConfigProto() # config.gpu_options.allow_growth = True """ construct computation graph using TensorFlow """ tf.reset_default_graph() # raw data from two modalities x = tf.placeholder(tf.float32, shape=[None, feature_dim_x], name='input_x') y = tf.placeholder(tf.float32, shape=[None, feature_dim_y], name='input_y') # labels inputted for references ref_label_x = tf.placeholder(tf.float32, shape=[None], name='ref_label_x') ref_label_y = tf.placeholder(tf.float32, shape=[None], name='ref_label_y') # hyperparameter in triplet loss triplet_lambda = tf.placeholder(tf.float32, name='triplet_lambda') triplet_margin = tf.placeholder(tf.float32, name='triplet_margin') # network architecture z, x_hat, y_hat, encode_x, encode_y, loss, \ reconstruction_error, weight_penalty, \ trip_loss_x, trip_loss_y = structured_embedding(x, y, ref_label_x, ref_label_y, latent_dim, triplet_margin, n_hidden, lambda_regul, triplet_lambda) # optimization operator train_op = tf.train.AdamOptimizer(learn_rate).minimize(loss) print('++++++++++ MUSE for multi-modality single-cell analysis ++++++++++') """ MUSE optimization """ total_batch = int(n_sample / batch_size) with tf.Session() as sess: """ initialization of autoencoder architecture for MUSE """ print('MUSE initialization') # global parameter initialization sess.run(tf.global_variables_initializer(), feed_dict={triplet_lambda: 0, triplet_margin: 0}) for epoch in range(n_epochs_init): # randomly permute samples random_idx = np.random.permutation(n_sample) data_train_x = data_x[random_idx, :] data_train_y = data_y[random_idx, :] for i in range(total_batch): # input data batches offset = 
(i * batch_size) % (n_sample) batch_x_input = data_train_x[offset:(offset + batch_size), :] batch_y_input = data_train_y[offset:(offset + batch_size), :] # initialize parameters without self-supervised loss (triplet_lambda=0) sess.run(train_op, feed_dict={x: batch_x_input, y: batch_y_input, ref_label_x: np.zeros(batch_x_input.shape[0]), ref_label_y: np.zeros(batch_y_input.shape[0]), triplet_lambda: 0, triplet_margin: 0}) # calculate and print loss terms for current epoch if epoch % print_epochs == 0: L_total, L_reconstruction, L_weight = \ sess.run((loss, reconstruction_error, weight_penalty), feed_dict={x: data_train_x, y: data_train_y, ref_label_x: np.zeros(data_train_x.shape[0]), # no use as triplet_lambda=0 ref_label_y: np.zeros(data_train_y.shape[0]), # no use as triplet_lambda=0 triplet_lambda: 0, triplet_margin: 0}) print( "epoch: %d, \t total loss: %03.5f,\t reconstruction loss: %03.5f,\t sparse penalty: %03.5f" % (epoch, L_total, L_reconstruction, L_weight)) # estimate the margin for the triplet loss latent, reconstruct_x, reconstruct_y = \ sess.run((z, x_hat, y_hat), feed_dict={x: data_x, y: data_y, ref_label_x: np.zeros(data_x.shape[0]), ref_label_y: np.zeros(data_y.shape[0]), triplet_lambda: 0, triplet_margin: 0}) latent_pd_matrix = pdist(latent, 'euclidean') latent_pd_sort = np.sort(latent_pd_matrix) select_top_n = np.int(latent_pd_sort.size * 0.2) margin_estimate = np.median(latent_pd_sort[-select_top_n:]) - np.median(latent_pd_sort[:select_top_n]) # refine MUSE parameters with reference labels and triplet losses for epoch in range(n_epochs_init): # randomly permute samples random_idx = np.random.permutation(n_sample) data_train_x = data_x[random_idx, :] data_train_y = data_y[random_idx, :] label_train_x = label_x[random_idx] label_train_y = label_y[random_idx] for i in range(total_batch): # data batches offset = (i * batch_size) % (n_sample) batch_x_input = data_train_x[offset:(offset + batch_size), :] batch_y_input = data_train_y[offset:(offset + batch_size), :] label_x_input = label_train_x[offset:(offset + batch_size)] label_y_input = label_train_y[offset:(offset + batch_size)] # refine parameters sess.run(train_op, feed_dict={x: batch_x_input, y: batch_y_input, ref_label_x: label_x_input, ref_label_y: label_y_input, triplet_lambda: lambda_super, triplet_margin: margin_estimate}) # calculate loss on all input data for current epoch if epoch % print_epochs == 0: L_total, L_reconstruction, L_weight, L_trip_x, L_trip_y = \ sess.run((loss, reconstruction_error, weight_penalty, trip_loss_x, trip_loss_y), feed_dict={x: data_train_x, y: data_train_y, ref_label_x: label_train_x, ref_label_y: label_train_y, triplet_lambda: lambda_super, triplet_margin: margin_estimate}) print( "epoch: %d, \t total loss: %03.5f,\t reconstruction loss: %03.5f,\t sparse penalty: %03.5f,\t x triplet: %03.5f,\t y triplet: %03.5f" % (epoch, L_total, L_reconstruction, L_weight, L_trip_x, L_trip_y)) # update cluster labels based modality-specific latents latent_x, latent_y = \ sess.run((encode_x, encode_y), feed_dict={x: data_x, y: data_y, ref_label_x: label_x, ref_label_y: label_y, triplet_lambda: lambda_super, triplet_margin: margin_estimate}) # update cluster labels using PhenoGraph label_x_update, _, _ = phenograph.cluster(latent_x) label_y_update, _, _ = phenograph.cluster(latent_y) print('Finish initialization of MUSE') ''' Training of MUSE ''' for epoch in range(n_epochs): # randomly permute samples random_idx = np.random.permutation(n_sample) data_train_x = data_x[random_idx, :] data_train_y = 
data_y[random_idx, :] label_train_x = label_x_update[random_idx] label_train_y = label_y_update[random_idx] # loop over all batches for i in range(total_batch): # batch data offset = (i * batch_size) % (n_sample) batch_x_input = data_train_x[offset:(offset + batch_size), :] batch_y_input = data_train_y[offset:(offset + batch_size), :] batch_label_x_input = label_train_x[offset:(offset + batch_size)] batch_label_y_input = label_train_y[offset:(offset + batch_size)] sess.run(train_op, feed_dict={x: batch_x_input, y: batch_y_input, ref_label_x: batch_label_x_input, ref_label_y: batch_label_y_input, triplet_lambda: lambda_super, triplet_margin: margin_estimate}) # calculate and print losses on whole training dataset if epoch % print_epochs == 0: L_total, L_reconstruction, L_weight, L_trip_x, L_trip_y = \ sess.run((loss, reconstruction_error, weight_penalty, trip_loss_x, trip_loss_y), feed_dict={x: data_train_x, y: data_train_y, ref_label_x: label_train_x, ref_label_y: label_train_y, triplet_lambda: lambda_super, triplet_margin: margin_estimate}) # print cost every epoch print( "epoch: %d, \t total loss: %03.5f,\t reconstruction loss: %03.5f,\t sparse penalty: %03.5f,\t x triplet loss: %03.5f,\t y triplet loss: %03.5f" % (epoch, L_total, L_reconstruction, L_weight, L_trip_x, L_trip_y)) # update cluster labels based on new modality-specific latent representations if epoch % cluster_update_epoch == 0: latent_x, latent_y = \ sess.run((encode_x, encode_y), feed_dict={x: data_x, y: data_y, ref_label_x: label_x, ref_label_y: label_y, triplet_lambda: lambda_super, triplet_margin: margin_estimate}) # use PhenoGraph to obtain cluster label label_x_update, _, _ = phenograph.cluster(latent_x) label_y_update, _, _ = phenograph.cluster(latent_y) """ MUSE output """ latent, reconstruct_x, reconstruct_y, latent_x, latent_y = \ sess.run((z, x_hat, y_hat, encode_x, encode_y), feed_dict={x: data_x, y: data_y, ref_label_x: label_x, # no effects to representations ref_label_y: label_y, # no effects to representations triplet_lambda: lambda_super, triplet_margin: margin_estimate}) print('++++++++++ MUSE completed ++++++++++') return latent, reconstruct_x, reconstruct_y, latent_x, latent_y
q = np.zeros((1, n_randstart)) t = np.zeros((1, n_randstart)) # nmi = np.zeros((1,n_randstart)) # ari = np.zeros((1,n_randstart)) # fm = np.zeros((1,n_randstart)) for j in range(n_randstart): print( "=======================================================================" ) start = clock() communities, graph, Q = pg.cluster(centroids, k=15, directed=False, prune=True, min_cluster_size=2, jaccard=True, primary_metric='euclidean', n_jobs=-1, q_tol=1e-4) # kmeans = KMeans(n_clusters=2, random_state=0).fit(suface_marker_data_normalized) stop = clock() # communities = kmeans.labels_ # c[:,j] = communities #labels c[:, j] = communities q[:, j] = Q #modularity t[:, j] = stop - start #running time # calculate other validation parameters # nmi[:,j] = normalized_mutual_info_score(ground_truth,communities)
import phenograph import numpy import networkx from scipy.sparse import coo_matrix from scipy.io import mmread, mmwrite from numpy import genfromtxt data = genfromtxt('xxCsvPathxx', delimiter=',') communities, graph, Q = phenograph.cluster(data) numpy.savetxt('xxSaveCsvPathxx', communities, delimiter=",") mmwrite('xxSaveGraphPathxx', graph) text_file = open('xxSaveQPathxx', "w") text_file.write(str(Q)) text_file.close() G = networkx.Graph(graph) networkx.write_pajek(G, path = 'xxPajekPathxx', encoding='UTF-8') networkx.write_gml(G, path = 'xxGmlPathxx') networkx.write_edgelist(G, path = 'xxTabPathxx', delimiter = '\t')
def phenograph( data: Union[np.ndarray, spmatrix], *, k: int = 30, directed: bool = False, prune: bool = False, min_cluster_size: int = 10, jaccard: bool = True, primary_metric: str = 'euclidean', n_jobs: int = -1, q_tol: float = 1e-3, louvain_time_limit: int = 2000, nn_method: str = 'kdtree', ) -> Tuple[np.ndarray, spmatrix, float]: """\ PhenoGraph clustering [Levine15]_. Parameters ---------- data Array of data to cluster or sparse matrix of k-nearest neighbor graph. If ndarray, n-by-d array of n cells in d dimensions, if sparse matrix, n-by-n adjacency matrix. k Number of nearest neighbors to use in first step of graph construction. directed Whether to use a symmetric (default) or asymmetric (“directed”) graph. The graph construction process produces a directed graph, which is symmetrized by one of two methods (see below). prune Whether to symmetrize by taking the average (`prune=False`) or product (`prune=True`) between the graph and its transpose. min_cluster_size Cells that end up in a cluster smaller than min_cluster_size are considered outliers and are assigned to -1 in the cluster labels. jaccard If `True`, use Jaccard metric between k-neighborhoods to build graph. If `False`, use a Gaussian kernel. primary_metric : {`'euclidean'`, `'manhattan'`, `'correlation'`, `'cosine'`} Distance metric to define nearest neighbors. Note that performance will be slower for correlation and cosine. n_jobs Nearest Neighbors and Jaccard coefficients will be computed in parallel using `n_jobs`. If `n_jobs=-1`, it is determined automatically. q_tol Tolerance (i.e., precision) for monitoring modularity optimization. louvain_time_limit Maximum number of seconds to run modularity optimization. If exceeded the best result so far is returned. nn_method : {`'kdtree'`, `'brute'`} Whether to use brute force or kdtree for nearest neighbor search. For very large high-dimensional data sets, brute force (with parallel computation) performs faster than kdtree. Returns ------- communities : numpy.ndarray Integer array of community assignments for each row in data. graph : scipy.sparse.spmatrix The graph that was used for clustering. Q : float The modularity score for communities on graph. Example ------- >>> from anndata import AnnData >>> import scanpy as sc >>> import scanpy.external as sce >>> import numpy as np >>> import pandas as pd Assume adata is your annotated data which has the normalized data. Then do PCA: >>> sc.tl.pca(adata, n_comps = 100) Compute phenograph clusters: >>> result = sce.tl.phenograph(adata.obsm['X_pca'], k = 30) Embed the phenograph result into adata as a *categorical* variable (this helps in plotting): >>> adata.obs['pheno'] = pd.Categorical(result[0]) Check by typing "adata" and you should see under obs a key called 'pheno'. 
Now to show phenograph on tSNE (for example): Compute tSNE: >>> sc.tl.tsne(adata, random_state = 7) Plot phenograph clusters on tSNE: >>> sc.pl.tsne(adata, color = ['pheno'], s = 100, palette = sc.pl.palettes.vega_20_scanpy, legend_fontsize = 10) Cluster and cluster centroids for input Numpy ndarray >>> df = np.random.rand(1000,40) >>> df.shape (1000, 40) >>> result = sce.tl.phenograph(df, k=50) Finding 50 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.16141605377197266 seconds Jaccard graph constructed in 0.7866239547729492 seconds Wrote graph to binary file in 0.42542195320129395 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.223536 After 2 runs, maximum modularity is Q = 0.235874 Louvain completed 22 runs in 1.5609488487243652 seconds PhenoGraph complete in 2.9466471672058105 seconds New results can be pushed into adata object: >>> dframe = pd.DataFrame(data=df, columns=range(df.shape[1]),index=range(df.shape[0]) ) >>> adata = AnnData( X=dframe, obs=dframe, var=dframe) >>> adata.obs['pheno'] = pd.Categorical(result[0]) """ start = logg.info('PhenoGraph clustering') try: import phenograph except ImportError: raise ImportError( 'please install phenograph: ' 'pip3 install git+https://github.com/jacoblevine/phenograph.git') communities, graph, Q = phenograph.cluster( data=data, k=k, directed=directed, prune=prune, min_cluster_size=min_cluster_size, jaccard=jaccard, primary_metric=primary_metric, n_jobs=n_jobs, q_tol=q_tol, louvain_time_limit=louvain_time_limit, nn_method=nn_method, ) logg.info(' finished', time=start) return communities, graph, Q
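# Follow-up sketch for the scanpy wrapper above (illustrative): run PhenoGraph on a PCA
# embedding and keep all three outputs on the AnnData object. The storage keys
# ('pheno', 'pheno_graph', 'pheno_Q') are arbitrary choices, not a scanpy convention.
import numpy as np
import pandas as pd
import scanpy as sc
import scanpy.external as sce
from anndata import AnnData

adata = AnnData(np.random.rand(500, 60))
sc.tl.pca(adata, n_comps=30)
communities, graph, Q = sce.tl.phenograph(adata.obsm['X_pca'], k=30)
adata.obs['pheno'] = pd.Categorical(communities)
adata.uns['pheno_graph'] = graph
adata.uns['pheno_Q'] = Q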
def RUN_MAIN():
    # 1. Load gene expression matrix of simulated data
    # gene expression with simulated dropouts
    counts_drop = pd.read_csv('counts_1.csv', header=0, index_col=0)
    # ground truth subpopulation assignment
    cellinfo = pd.read_csv('cellinfo_1.csv', header=0, index_col=0)
    group = cellinfo.Group
    label_ground_truth = []
    for g in group:
        g = int(g.split('Group')[1])
        label_ground_truth.append(g)

    # 2. Normalize gene expression based on scanpy (normalize each cell to have same library size)
    # matrix of cells x genes
    gene_expression = sc.AnnData(counts_drop.values)
    # normalize each cell to have same count number
    sc.pp.normalize_per_cell(gene_expression)
    # update datastructure to use normalized data
    gene_expression = gene_expression.X
    latent_dim = 50

    # 3. scScope learning
    if gene_expression.shape[0] >= 100000:
        DI_model = DeepImpute.train(gene_expression, latent_dim, T=2,
                                    batch_size=512, max_epoch=10, num_gpus=4)
    else:
        DI_model = DeepImpute.train(gene_expression, latent_dim, T=2,
                                    batch_size=64, max_epoch=300, num_gpus=4)

    # 4. latent representations and imputed expressions
    latent_code, imputed_val, _ = DeepImpute.predict(gene_expression, DI_model)

    # 5. graph clustering
    if latent_code.shape[0] <= 10000:
        label, _, _ = phenograph.cluster(latent_code)
    else:
        label = DeepImpute.scalable_cluster(latent_code)

    # evaluate
    ARI = adjusted_rand_score(label, label_ground_truth)
    print(ARI)

    # visualization of the subpopulations using tSNE
    X_embedded = TSNE(n_components=2).fit_transform(latent_code)
    plt.figure()
    for i in range(5):
        idx = np.nonzero(label == i)[0]
        plt.scatter(X_embedded[idx, 0], X_embedded[idx, 1])
    plt.show()