Example #1
def clusterData(scaledDf,clusteringMethod,clusteringParameters):
    
    clusterFunc = clusteringFunctionDict[clusteringMethod]
    if clusteringMethod == 'phenograph':
        cluster_labels,graph,Q = clusterFunc(scaledDf,**clusteringParameters)
    elif clusteringMethod == 'halx':
        # HAL-x (UHAL) writes its output files to disk, so run it inside the clustered-data folder
        n_cells = scaledDf.shape[0]
        dir_name = os.getcwd().split('/')[-1] 
        os.chdir('outputData/analysisFiles/clusteredData')
        if 'halxoutput' not in os.listdir():
            subprocess.run(['mkdir','halxoutput'])
        cluster_labels = UHAL(scaledDf, n_cells, name=dir_name, arcsinh=False, scaling=False).run(
            list(scaledDf.columns), clusteringParameters['cv'], unused=[], output_dir='halxoutput')[0]
        os.chdir('../../..')
    elif clusteringMethod == 'parc':
        Parc1 = parc.PARC(scaledDf.values,**clusteringParameters)
        Parc1.run_PARC()
        cluster_labels = Parc1.labels
    else:
        # e.g. sklearn's AgglomerativeClustering requires n_clusters=None when a distance_threshold is set
        if 'n_clusters' in clusteringParameters and 'distance_threshold' in clusteringParameters:
            clusteringParameters['n_clusters'] = None
        cluster_labels = clusterFunc(**clusteringParameters).fit_predict(scaledDf)
    
    # An earlier version built the clustered MultiIndex manually from tuples;
    # assign + set_index below is equivalent and simpler.
    clusteredDf = scaledDf.assign(Cluster=cluster_labels).set_index('Cluster', append=True)

    return clusteredDf
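A minimal usage sketch (not from the original project), assuming clusterData and a clusteringFunctionDict mapping method names to clustering callables are defined in the same namespace; the scikit-learn entry, the random data, and the parameter values below are illustrative placeholders:

# Illustrative only: the real clusteringFunctionDict and input data live in the original module.
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

clusteringFunctionDict = {'kmeans': KMeans}  # assumed mapping of method name -> callable

scaledDf = pd.DataFrame(np.random.rand(100, 5),
                        columns=['marker%d' % i for i in range(5)])
clusteredDf = clusterData(scaledDf, 'kmeans', {'n_clusters': 3})
print(clusteredDf.index.names)  # original index levels plus the appended 'Cluster' level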
Example #2
    def runparc(self):
        """
        function for execution of
        :return:
        """
        self.log.info("Part2: PARC Clustering")
        self.log.info("Markers used for Parc clustering:")
        self.adata_subset = self.adata[:, self.markertoinclude].copy()
        self.log.info(self.adata_subset.var_names)
        self.log.info("Markers excluded for Phenograph clustering:")
        self.log.info(self.marker_array)
        self.adata_subset.layers['scaled'] = sc.pp.scale(self.adata_subset,
                                                         max_value=6,
                                                         zero_center=True,
                                                         copy=True).X
        self.adata_subset.X = self.adata_subset.layers['scaled']
        p = parc.PARC(self.adata_subset.X,
                      random_seed=42,
                      knn=int(self.k_coef),
                      jac_std_global='median',
                      jac_weighted_edges=False,
                      small_pop=1,
                      num_threads=int(self.thread))
        p.run_PARC()
        self.adata_subset.obs['pheno_leiden'] = [str(i) for i in p.labels]
        self.adata_subset.obs['pheno_leiden'] = self.adata_subset.obs[
            'pheno_leiden'].astype(int) + 1
        self.adata_subset.obs['pheno_leiden'] = self.adata_subset.obs[
            'pheno_leiden'].astype('category')
        self.adata.obs['cluster'] = self.adata_subset.obs['pheno_leiden']
        self.adata.obs['Parc_cluster'] = self.adata_subset.obs[
            'pheno_leiden'].astype('category')
        if self.harmony is True:
            self.adata_subset = self.run_harmony()
            sc.pp.neighbors(self.adata_subset,
                            n_neighbors=10,
                            use_rep='X_pca_harmony')
            # run UMAP on the harmony-corrected subset so self.embedding is defined
            # before it is reused for the output table below
            sc.tl.umap(self.adata_subset, min_dist=0.5)
            self.embedding = self.adata_subset.obsm['X_umap']
            self.adata.obsm['X_umap'] = self.embedding
        else:
            self.embedding = self.runumap()
            self.adata.obsm['X_umap'] = self.embedding
            self.adata_subset.obsm['X_umap'] = self.embedding

        self.tmp_df = pd.DataFrame(self.adata.X, columns=self.adata.var_names)
        self.tmp_df['UMAP_1'] = self.embedding[:, 0]
        self.tmp_df['UMAP_2'] = self.embedding[:, 1]
        self.tmp_df['Cluster_Parc'] = self.adata_subset.obs['pheno_leiden']
        self.plot_umap()
        self.matrixplot()
        self.tmp_df.to_csv("/".join([
            self.output_folder,
            "_ConcatenatedCells.".join(["_".join([self.analysis_name]), "csv"])
        ]),
                           header=True,
                           index=False)
        return self.adata
Example #3
 def runparc(self, markertoexclude, adata):
     """
     Run PARC clustering on all markers not listed in markertoexclude.
     :param markertoexclude: markers to drop before clustering
     :param adata: AnnData object holding the expression matrix
     :return: AnnData object with PARC cluster labels and UMAP coordinates
     """
     marker = adata.var_names.to_list()
     markertoinclude = [i for i in marker if i not in markertoexclude]
     data = adata[:, markertoinclude].to_df()
     self.log.info("Markers used for PARC clustering:")
     self.log.info(data.columns)
     if self.scale:
         min_max_scaler = preprocessing.MinMaxScaler((1, 100))
         x_scaled = min_max_scaler.fit_transform(data.values)
         data = pd.DataFrame(x_scaled, columns=data.columns)
     # strip any "panel::marker" prefixes from the column names
     self.new_head = [column.split("::")[-1] for column in data]
     data.columns = self.new_head
     data.diff().hist(color="k",
                      alpha=0.5,
                      bins=50,
                      grid=False,
                      xlabelsize=8,
                      ylabelsize=8)
     plt.tight_layout()
     plt.savefig("/".join([
         self.output_folder,
         ".".join(["_".join([self.analysis_name]), "pdf"])
     ]))
     p = parc.PARC(data.values,
                   random_seed=42,
                   jac_std_global='median',
                   small_pop=100,
                   num_threads=int(self.thread))
     p.run_PARC()
     adata.obs['cluster'] = [str(i) for i in p.labels]
     adata.obs['Parc_cluster'] = [str(i) for i in p.labels]
     reducer = umap.UMAP(random_state=42, n_neighbors=10, min_dist=0.001)
     embedding = reducer.fit_transform(data.values)
     adata.obsm['X_umap'] = embedding
     self.tmp_df = self.tmp_df.astype(int)
     self.tmp_df['UMAP_1'] = embedding[:, 0]
     self.tmp_df['UMAP_2'] = embedding[:, 1]
     if (self.tool == "Both"):
         self.tmp_df['Cluster_Phenograph'] = self.dfPheno
     self.tmp_df['Cluster_PARC'] = [str(i) for i in p.labels]
     self.tmp_df.to_csv("/".join([
         self.output_folder,
         ".".join(["_".join([self.analysis_name]), "csv"])
     ]),
                        header=True,
                        index=False)
     return adata
Example #4
    def parc_clustering(pheno, adata, random_state, resolution,
                        parc_too_big_factor, parc_small_pop):

        # subset the data to be clustered
        if pheno is not None:
            cell_subset = adata.obs[adata.obs[sub_cluster_column] ==
                                    pheno].index
        else:
            cell_subset = adata.obs.index

        # Usage of scaled or raw data
        if use_raw:
            data_subset = adata[cell_subset].copy()
            data_subset.X = np.log1p(data_subset.raw.X)
        else:
            data_subset = adata[cell_subset]

        # PARC clustering
        if pheno is not None:
            print('Parc clustering ' + str(pheno))
        else:
            print('Parc clustering')

        sc.tl.pca(data_subset)
        parc1 = parc.PARC(data_subset.obsm['X_pca'],
                          random_seed=random_state,
                          small_pop=parc_small_pop,
                          resolution_parameter=resolution,
                          too_big_factor=parc_too_big_factor)
        parc1.run_PARC()  # Run Parc

        # Rename the labels
        cluster_labels = list(map(str, parc1.labels))
        if pheno is not None:
            cluster_labels = list(
                map(lambda orig_string: pheno + '-' + orig_string,
                    cluster_labels))

        # Make it into a dataframe
        cluster_labels = pd.DataFrame(cluster_labels,
                                      index=data_subset.obs.index)

        # return labels
        return cluster_labels
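In its original context this helper is nested inside a larger clustering routine that supplies sub_cluster_column and use_raw from the enclosing scope, so a standalone call has to define both; everything below (the demo dataset and the variable values) is an assumption for illustration only:

# Assumed closure variables; in the original code these come from the enclosing function.
import scanpy as sc

use_raw = False                # cluster on adata.X rather than the raw layer
sub_cluster_column = 'leiden'  # obs column naming parent clusters (only used when pheno is given)

adata = sc.datasets.pbmc68k_reduced()
labels = parc_clustering(pheno=None, adata=adata, random_state=0,
                         resolution=1.0, parc_too_big_factor=0.4,
                         parc_small_pop=50)
adata.obs['parc_label'] = labels.loc[adata.obs.index, 0].values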
Example #5
def down_sampling(f,
                  down_rate=0.1,
                  clust_id=None,
                  co_factor=5,
                  jac_std_global=0.15,
                  seed=12345):
    """
    Args:
        - f (file handler): fcs file handler returned by flowio.FlowData()
        - down_rate (float): down sampling rate
        - co_factor: coefficient factor to normalize the data
        - jac_std_global: jac_std_global parameter to run clustering
        - seed: random seed for reproducibility
    """
    npy_events = np.reshape(f.events, (-1, f.channel_count))
    events = np.arcsinh(1. / co_factor * npy_events[:, clust_id])
    # clustering with parc
    parc_obj = parc.PARC(events,
                         jac_std_global=jac_std_global,
                         num_threads=6,
                         random_seed=seed,
                         small_pop=50)
    parc_obj.run_PARC()
    clust_label = np.asarray(parc_obj.labels)
    n_cluster = len(np.unique(clust_label))

    # downsample cells from each cluster; seed the sampler so results are reproducible
    rng = np.random.default_rng(seed)
    downsampled_events = []
    for i in range(n_cluster):
        idx = np.where(clust_label == i)[0]
        selected_idx = rng.choice(idx,
                                  int(down_rate * len(idx)),
                                  replace=False)
        downsampled_events.append(npy_events[selected_idx])
    downsampled_events = np.vstack(downsampled_events)

    return downsampled_events
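A usage sketch for the function above; the file name and the channel indices are placeholders and would need to match the FCS panel actually acquired:

# Placeholder file name and channel indices; adjust to the real panel.
import flowio

f = flowio.FlowData('sample.fcs')   # FCS file handler, as expected by down_sampling
clust_id = [0, 1, 2, 3]             # indices of the channels to cluster on
subsampled = down_sampling(f, down_rate=0.1, clust_id=clust_id)
print(subsampled.shape)             # roughly 10% of the events, all original channels kept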
Example #6
import sys
import parc
import pandas as pd

res = float(sys.argv[2])
reduction = pd.read_csv(sys.argv[1], index_col=0)
parc1 = parc.PARC(reduction,
                  jac_std_global='median',
                  random_seed=1,
                  small_pop=100,
                  resolution_parameter=res)
parc1.run_PARC()

parc_labels = parc1.labels
out = pd.DataFrame(pd.Categorical(parc_labels))
out['Barcode'] = reduction.index
out.to_csv(sys.argv[3])
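The script reads a dimensionality reduction (cells by components, barcodes in the first column) from a CSV, clusters it with PARC at the requested resolution, and writes the labels next to the barcodes. A sketch of driving it from Python, assuming it was saved as run_parc.py and that a pca.csv input exists (both names are placeholders):

# Placeholder file names; the positional arguments are <reduction csv> <resolution> <output csv>.
import subprocess

subprocess.run(
    ['python', 'run_parc.py',
     'pca.csv',            # sys.argv[1]: reduction matrix with barcodes as index
     '1.0',                # sys.argv[2]: Leiden resolution_parameter
     'parc_labels.csv'],   # sys.argv[3]: output CSV of cluster labels and barcodes
    check=True)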