import os
import subprocess

import parc

# clusteringFunctionDict (method name -> clustering callable) and UHAL are assumed to be
# defined elsewhere in this module.


def clusterData(scaledDf, clusteringMethod, clusteringParameters):
    clusterFunc = clusteringFunctionDict[clusteringMethod]
    if clusteringMethod == 'phenograph':
        cluster_labels, graph, Q = clusterFunc(scaledDf, **clusteringParameters)
    elif clusteringMethod == 'halx':
        # HAL-x clustering: run inside the clustered-data output folder
        n_cells = scaledDf.shape[0]
        dir_name = os.getcwd().split('/')[-1]
        os.chdir('outputData/analysisFiles/clusteredData')
        if 'halxoutput' not in os.listdir():
            subprocess.run(['mkdir', 'halxoutput'])
        cluster_labels = UHAL(scaledDf, n_cells, name=dir_name, arcsinh=False, scaling=False).run(
            list(scaledDf.columns), clusteringParameters['cv'], unused=[], output_dir='halxoutput')[0]
        os.chdir('../../..')
    elif clusteringMethod == 'parc':
        Parc1 = parc.PARC(scaledDf.values, **clusteringParameters)
        Parc1.run_PARC()
        cluster_labels = Parc1.labels
    else:
        # scikit-learn style estimators: n_clusters and distance_threshold are mutually exclusive
        if 'n_clusters' in clusteringParameters and 'distance_threshold' in clusteringParameters:
            clusteringParameters['n_clusters'] = None
        cluster_labels = clusterFunc(**clusteringParameters).fit_predict(scaledDf)
    # Append the cluster label as an extra index level
    clusteredDf = scaledDf.assign(Cluster=cluster_labels).set_index('Cluster', append=True)
    return clusteredDf
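# Usage sketch (not part of the original module): runs the 'parc' branch of clusterData
# on a small synthetic cell x marker DataFrame. Assumes the module-level
# clusteringFunctionDict already contains a 'parc' entry (it is looked up before the
# branch is chosen, even though its value is not used for PARC). Parameter values are
# illustrative only.
import numpy as np
import pandas as pd

_rng = np.random.default_rng(0)
_demo_df = pd.DataFrame(_rng.normal(size=(300, 5)),
                        columns=['marker%d' % i for i in range(5)])
_demo_clustered = clusterData(_demo_df, 'parc',
                              {'jac_std_global': 'median', 'small_pop': 10, 'random_seed': 0})
print(_demo_clustered.groupby(level='Cluster').size())  # number of cells per cluster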
def runparc(self):
    """
    Run PARC clustering on the selected markers, embed the cells with UMAP and
    export the annotated matrix.
    :return: AnnData object with PARC cluster labels in .obs
    """
    self.log.info("Part2: PARC Clustering")
    self.log.info("Markers used for Parc clustering:")
    self.adata_subset = self.adata[:, self.markertoinclude].copy()
    self.log.info(self.adata_subset.var_names)
    self.log.info("Markers excluded from Parc clustering:")
    self.log.info(self.marker_array)
    # Scale the marker expression and keep the scaled matrix as a layer
    self.adata_subset.layers['scaled'] = sc.pp.scale(self.adata_subset, max_value=6,
                                                     zero_center=True, copy=True).X
    self.adata_subset.X = self.adata_subset.layers['scaled']
    # PARC clustering on the scaled matrix
    p = parc.PARC(self.adata_subset.X, random_seed=42, knn=int(self.k_coef),
                  jac_std_global='median', jac_weighted_edges=False, small_pop=1,
                  num_threads=int(self.thread))
    p.run_PARC()
    # Store 1-based cluster labels as a categorical annotation
    self.adata_subset.obs['pheno_leiden'] = [str(i) for i in p.labels]
    self.adata_subset.obs['pheno_leiden'] = self.adata_subset.obs['pheno_leiden'].astype(int) + 1
    self.adata_subset.obs['pheno_leiden'] = self.adata_subset.obs['pheno_leiden'].astype('category')
    self.adata.obs['cluster'] = self.adata_subset.obs['pheno_leiden']
    self.adata.obs['Parc_cluster'] = self.adata_subset.obs['pheno_leiden'].astype('category')
    if self.harmony is True:
        # Batch-correct with Harmony, then build the neighbor graph and UMAP on the corrected PCs
        self.adata_subset = self.run_harmony()
        sc.pp.neighbors(self.adata_subset, n_neighbors=10, use_rep='X_pca_harmony')
        sc.tl.umap(self.adata_subset, min_dist=0.5)
        self.embedding = self.adata_subset.obsm['X_umap']
        self.adata.obsm['X_umap'] = self.embedding
    else:
        self.embedding = self.runumap()
        self.adata.obsm['X_umap'] = self.embedding
        self.adata_subset.obsm['X_umap'] = self.embedding
    # Export expression values, UMAP coordinates and cluster labels
    self.tmp_df = pd.DataFrame(self.adata.X, columns=self.adata.var_names)
    self.tmp_df['UMAP_1'] = self.embedding[:, 0]
    self.tmp_df['UMAP_2'] = self.embedding[:, 1]
    self.tmp_df['Cluster_Parc'] = self.adata_subset.obs['pheno_leiden'].values
    self.plot_umap()
    self.matrixplot()
    self.tmp_df.to_csv("/".join([self.output_folder,
                                 "_ConcatenatedCells.".join(["_".join([self.analysis_name]), "csv"])]),
                       header=True, index=False)
    return self.adata
def runparc(self, markertoexclude, adata):
    """
    Run PARC clustering on all markers not listed in markertoexclude.
    :param markertoexclude: markers to drop before clustering
    :param adata: AnnData object with the concatenated cells
    :return: AnnData object annotated with PARC cluster labels
    """
    marker = adata.var_names.to_list()
    markertoinclude = [i for i in marker if i not in markertoexclude]
    data = adata[:, markertoinclude].to_df()
    self.log.info("Markers used for PARC clustering:")
    self.log.info(data.columns)
    if self.scale is True:
        # Rescale each marker to the range 1-100 before clustering
        min_max_scaler = preprocessing.MinMaxScaler((1, 100))
        x_scaled = min_max_scaler.fit_transform(data.values)
        data = pd.DataFrame(x_scaled, columns=data.columns)
    # Keep only the part of the channel name after the last "::"
    self.new_head = [column.split("::")[-1] for column in data.columns]
    data.columns = self.new_head
    # Save a histogram of the marker distributions
    data.diff().hist(color="k", alpha=0.5, bins=50, grid=False, xlabelsize=8, ylabelsize=8)
    plt.tight_layout()
    plt.savefig("/".join([self.output_folder,
                          ".".join(["_".join([self.analysis_name]), "pdf"])]))
    # PARC clustering
    p = parc.PARC(data.values, random_seed=42, jac_std_global='median',
                  small_pop=100, num_threads=int(self.thread))
    p.run_PARC()
    adata.obs['cluster'] = [str(i) for i in p.labels]
    adata.obs['Parc_cluster'] = [str(i) for i in p.labels]
    # UMAP embedding of the clustered data
    reducer = umap.UMAP(random_state=42, n_neighbors=10, min_dist=0.001)
    embedding = reducer.fit_transform(data.values)
    adata.obsm['X_umap'] = embedding
    # Export expression values, UMAP coordinates and cluster labels
    self.tmp_df = self.tmp_df.astype(int)
    self.tmp_df['UMAP_1'] = embedding[:, 0]
    self.tmp_df['UMAP_2'] = embedding[:, 1]
    if self.tool == "Both":
        self.tmp_df['Cluster_Phenograph'] = self.dfPheno
    self.tmp_df['Cluster_PARC'] = [str(i) for i in p.labels]
    self.tmp_df.to_csv("/".join([self.output_folder,
                                 ".".join(["_".join([self.analysis_name]), "csv"])]),
                       header=True, index=False)
    return adata
def parc_clustering(pheno, adata, random_state, resolution, parc_too_big_factor, parc_small_pop):
    # sub_cluster_column and use_raw are expected to be defined in the enclosing scope
    # (this helper is meant to be nested inside the calling clustering function).
    # subset the data to be clustered
    if pheno is not None:
        cell_subset = adata.obs[adata.obs[sub_cluster_column] == pheno].index
    else:
        cell_subset = adata.obs.index
    # use the scaled or the raw data
    if use_raw == True:
        data_subset = adata[cell_subset]
        data_subset.X = np.log1p(data_subset.raw.X)
    else:
        data_subset = adata[cell_subset]
    # Parc clustering
    if pheno is not None:
        print('Parc clustering ' + str(pheno))
    else:
        print('Parc clustering')
    sc.tl.pca(data_subset)
    parc1 = parc.PARC(data_subset.obsm['X_pca'], random_seed=random_state,
                      small_pop=parc_small_pop, resolution_parameter=resolution,
                      too_big_factor=parc_too_big_factor)
    parc1.run_PARC()  # Run Parc
    # Rename the labels; prefix with the parent phenotype when sub-clustering
    cluster_labels = list(map(str, parc1.labels))
    if pheno is not None:
        cluster_labels = list(map(lambda orig_string: pheno + '-' + orig_string, cluster_labels))
    # Make it into a dataframe indexed by cell
    cluster_labels = pd.DataFrame(cluster_labels, index=data_subset.obs.index)
    # return labels
    return cluster_labels
import numpy as np
import parc


def down_sampling(f, down_rate=0.1, clust_id=None, co_factor=5,
                  jac_std_global=0.15, seed=12345):
    """
    Cluster the events of an fcs file with PARC, then downsample each cluster.

    Args:
        f (file handler): fcs file handler returned by flowio.FlowData()
        down_rate (float): down sampling rate
        clust_id: indices of the channels used for clustering
        co_factor: cofactor for the arcsinh normalization of the data
        jac_std_global: jac_std_global parameter passed to PARC
        seed: random seed passed to PARC for reproducibility
    """
    # Reshape the flat event list into an events x channels matrix
    npy_events = np.reshape(f.events, (-1, f.channel_count))
    # arcsinh-normalize the channels used for clustering
    events = np.arcsinh(1. / co_factor * npy_events[:, clust_id])
    # clustering with parc
    parc_obj = parc.PARC(events, jac_std_global=jac_std_global, num_threads=6,
                         random_seed=seed, small_pop=50)
    parc_obj.run_PARC()
    clust_label = np.asarray(parc_obj.labels)
    n_cluster = len(np.unique(clust_label))
    # downsample cells from each cluster at the requested rate
    downsampled_events = []
    for i in range(n_cluster):
        idx = np.where(clust_label == i)[0]
        selected_idx = np.random.choice(idx, int(down_rate * len(idx)), replace=False)
        downsampled_events.append(npy_events[selected_idx])
    downsampled_events = np.vstack(downsampled_events)
    return downsampled_events
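# Usage sketch (not part of the original source): "sample.fcs" is a placeholder path and
# the channel indices are illustrative; requires the flowio package.
import flowio

fcs = flowio.FlowData("sample.fcs")   # placeholder FCS file
selected_channels = [0, 1, 2, 3]      # illustrative indices of the channels to cluster on
subset = down_sampling(fcs, down_rate=0.1, clust_id=selected_channels, co_factor=5)
print(subset.shape)                   # roughly 10% of the events, all channels retained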
import sys

import pandas as pd
import parc

# Command-line arguments: <reduction.csv> <resolution> <output.csv>
res = float(sys.argv[2])
reduction = pd.read_csv(sys.argv[1], index_col=0)
parc1 = parc.PARC(reduction.values, jac_std_global='median', random_seed=1,
                  small_pop=100, resolution_parameter=res)
parc1.run_PARC()
parc_labels = parc1.labels
# Write one cluster label per barcode
out = pd.DataFrame(pd.Categorical(parc_labels))
out['Barcode'] = reduction.index
out.to_csv(sys.argv[3])
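# Example invocation (the script filename is a placeholder):
#   python parc_cluster.py pca_reduction.csv 1.0 parc_labels.csv
# where pca_reduction.csv holds one row per barcode (index column) and one column per
# dimension of the reduction, 1.0 is the Leiden resolution passed to PARC, and
# parc_labels.csv receives the cluster label and barcode for every cell.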