Example #1
def phenograph_clustering(data: pd.DataFrame,
                          features: list,
                          verbose: bool,
                          global_clustering: bool = False,
                          print_performance_metrics: bool = True,
                          **kwargs):
    """
    Perform high-dimensional clustering of single cell data using the popular
    PhenoGraph algorithm (https://github.com/dpeerlab/PhenoGraph)

    Clustering is performed either on the entire dataframe (if global_clustering is True)
    or on each biological sample, in which case the dataframe must contain a column named 'sample_id'
    that this function groups on, clustering each group in turn. In both cases,
    the clustering labels are assigned to a new column named 'cluster_label'.

    Parameters
    ----------
    data: Pandas.DataFrame
    features: list
        Columns to perform clustering on
    verbose: bool
        If True, provides a progress bar when global_clustering is False
    global_clustering: bool (default=False)
        Whether to cluster the whole dataframe or group on 'sample_id' and cluster
        groups
    print_performance_metrics: bool (default=True)
        Print Calinski-Harabasz Index, Silhouette Coefficient, and Davies-Bouldin Index
        (see https://scikit-learn.org/stable/modules/clustering.html#clustering-performance-evaluation)
    kwargs:
        Additional keyword arguments passed when calling phenograph.cluster

    Returns
    -------
    Pandas.DataFrame, scipy.sparse.base.spmatrix, float
        Modified dataframe with clustering IDs assigned to the column 'cluster_label', the sparse graph
        matrix, and the modularity score Q (when global_clustering is False, the graph and Q are
        dicts keyed by sample_id)
    """
    _print = vprint(verbose=verbose)
    data["cluster_label"] = None
    if global_clustering:
        communities, graph, q = phenograph.cluster(data[features], **kwargs)
        data["cluster_label"] = communities
        if print_performance_metrics:
            clustering_performance(data[features],
                                   data["cluster_label"].values)
        return data, graph, q
    graphs = dict()
    q = dict()
    for _id, df in data.groupby("sample_id"):
        _print(f"----- Clustering {_id} -----")
        communities, graph, q_ = phenograph.cluster(df[features], **kwargs)
        graphs[_id], q[_id] = graph, q_
        df["cluster_label"] = communities
        data.loc[df.index, ["cluster_label"]] = df.cluster_label
        if print_performance_metrics:
            clustering_performance(df[features], df["cluster_label"].values)
        _print("-----------------------------")
        _print("\n")
    return data, graphs, q
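Below is a minimal usage sketch for the function above. The marker column names and the k value are illustrative, and it assumes the phenograph package is installed and that phenograph_clustering (together with its module-level helpers such as vprint) is importable:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
toy = pd.DataFrame(rng.normal(size=(200, 4)),
                   columns=["CD3", "CD4", "CD8", "CD19"])    # hypothetical marker columns
toy["sample_id"] = ["sample_a"] * 100 + ["sample_b"] * 100   # required when global_clustering=False

clustered, graphs, q = phenograph_clustering(data=toy,
                                             features=["CD3", "CD4", "CD8", "CD19"],
                                             verbose=True,
                                             print_performance_metrics=False,
                                             k=15)           # k is forwarded to phenograph.cluster
print(clustered["cluster_label"].value_counts())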
Example #2
    def run_phenograph(self,
                       k=30,
                       directed=False,
                       prune=False,
                       min_cluster_size=10,
                       jaccard=True,
                       dis_metric='euclidean',
                       n_jobs=-1,
                       q_tol=1e-3,
                       louvain_time_limit=2000,
                       nn_method='kdtree'):
        communities, graph, Q = phenograph.cluster(
            self.data,
            k=k,
            directed=directed,
            prune=prune,
            min_cluster_size=min_cluster_size,
            jaccard=jaccard,
            primary_metric=dis_metric,
            n_jobs=n_jobs,
            q_tol=q_tol,
            louvain_time_limit=louvain_time_limit,
            nn_method=nn_method)
        communities = np.add(communities, 1)
        cluster_df = pd.DataFrame({'cluster': communities},
                                  index=self.data.index)
        self.clusterinfo = ClusterInfo(cluster_df, graph, Q, 'phenograph')
        if self.parent is not None:
            self.parent.clusterinfo = ClusterInfo(cluster_df, graph, Q,
                                                  'phenograph')

        return communities, Q
Example #3
    def apply(self):
        communities, graph, Q = phenograph.cluster(
            data=self.matrix,
            k=self.n_neighbours,
            min_cluster_size=self.min_size,
            n_jobs=self.threads)
        # add 1 to the cluster labels to shift -1 values to zero.
        communities = communities + 1

        self.results = communities

        # convert the sparse kNN affinity graph into a dense, symmetric matrix
        arr = graph.toarray()
        arr_full = arr + arr.T
        np.fill_diagonal(arr_full, 1)
        # invert the affinities so that strongly connected cells get small distances
        dist = (arr_full - arr_full.max()) * (-1)
        np.fill_diagonal(dist, 0)

        self.distance_matrix = dist
        self.modularity = Q

        set_c = set(communities)
        logging.debug('set of communities found by phenograph: ' + str(set_c))

        # make sure at least 2 communities are found
        if len(set_c) < 2:
            logging.debug('Fewer than 2 communities were found')
            sys.exit(1)
Example #4
def run_phenograph(distance, k=20, outdir='', prefix='', **kwargs):
    """
    Runs Phenograph on an expression- or PCA-based distance matrix.

    Parameters
    ----------
    distance: ndarray
        cell x cell distance matrix
    k: int (default 20)
        number of nearest neighbors to use
    outdir: str (default '')
        if non-empty, directory to write the cluster assignments and a log file to
    prefix: str (default '')
        filename prefix for output written to outdir
    kwargs:
        additional keyword arguments passed to phenograph.cluster

    Returns
    -------
    communities
    graph
    Q : float

    """
    knn = get_knn(distance, k)
    communities, graph, Q = phenograph.cluster(knn, **kwargs)

    if outdir is not None and len(outdir) > 0:
        fileprefix = '{}/{}'.format(outdir, prefix)
        clusterfile = fileprefix + '.pg.txt'
        np.savetxt(clusterfile, communities, fmt='%i')

        logfile = fileprefix + '.pg.info.txt'
        with open(logfile, 'w') as f:
            f.write('k:{}\nQ:{}'.format(k, Q))

    return communities, graph, Q
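For context, phenograph.cluster also accepts a precomputed sparse n-by-n kNN graph, which is what run_phenograph passes it via the get_knn helper above. A hedged sketch of the same idea, substituting scikit-learn's kneighbors_graph for get_knn:

import numpy as np
import phenograph
from sklearn.neighbors import kneighbors_graph

X = np.random.rand(500, 20)                                 # toy cell x feature matrix
knn = kneighbors_graph(X, n_neighbors=20, mode='distance')  # sparse n x n kNN graph
communities, graph, Q = phenograph.cluster(knn)             # sparse input is treated as a kNN graph
print(len(np.unique(communities)), 'clusters, Q =', Q)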
Example #5
 def runclustering(self, markertoexclude, adata):
     """
     Function for execution of phenograph analysis
     :param markertoexclude:
     :param adata:
     :return:
     """
     marker = adata.var_names.to_list()
     markertoinclude = [i for i in marker if i not in markertoexclude]
     data = adata[:, markertoinclude].to_df()
     self.log.info("Markers used for Phenograph clustering:")
     self.log.info(data.columns)
     if self.scale:
         min_max_scaler = preprocessing.MinMaxScaler((1, 100))
         x_scaled = min_max_scaler.fit_transform(data.values)
         data = pd.DataFrame(x_scaled, columns=data.columns)
     # keep only the text after the last '::' in each column name
     self.new_head = [column.split("::")[-1] for column in data]
     data.columns = self.new_head
     data.diff().hist(color="k",
                      alpha=0.5,
                      bins=50,
                      grid=False,
                      xlabelsize=8,
                      ylabelsize=8)
     plt.tight_layout()
     plt.savefig("/".join([
         self.output_folder,
         ".".join(["_".join([self.analysis_name]), "pdf"])
     ]))
     communities, graph, Q = pg.cluster(data.values,
                                        k=int(self.k_coef),
                                        directed=False,
                                        prune=False,
                                        min_cluster_size=1,
                                        n_jobs=int(self.thread))
     # create dataframe with Phenograph output
     self.dfPheno = pd.DataFrame(communities)
     # shift of one unit the name of cluster
     self.dfPheno["Phenograph"] = self.dfPheno[0] + 1
     # remove first column
     self.dfPheno = self.dfPheno.drop(columns=[0], axis=1)
     self.dfPheno.set_index(adata.obs.index, inplace=True)
     adata.obs['cluster'] = self.dfPheno
     adata.obs['Phenograph_cluster'] = self.dfPheno
     reducer = umap.UMAP(random_state=42, n_neighbors=10, min_dist=0.001)
     embedding = reducer.fit_transform(data.values)
     adata.obsm['X_umap'] = embedding
     self.tmp_df = self.tmp_df.astype(int)
     self.tmp_df['UMAP_1'] = embedding[:, 0]
     self.tmp_df['UMAP_2'] = embedding[:, 1]
     self.tmp_df['Cluster_Phenograph'] = self.dfPheno
     self.tmp_df.to_csv("/".join([
         self.output_folder,
         ".".join(["_".join([self.analysis_name]), "csv"])
     ]),
                        header=True,
                        index=False)
     return adata
Example #6
def run_phenograph(data):
    print(">>> Running PhenoGraph")
    tic = time.time()
    communities, _, _ = phenograph.cluster(data)
    toc = time.time()
    print("    PhenoGraph found {} clusters".format(len(np.unique(communities))))
    print("    PhenoGraph took {:.2f} s".format(toc - tic))
    return communities
Example #7
def adjusted_rand_score_vector(normalized_matrices):
    PCA_model = PCA(n_components=1000, svd_solver='randomized')
    PC_column_names = ['PC' + str(i) for i in list(range(1, 1001))]
    components_normed_data_full = pd.DataFrame(data=PCA_model.fit_transform(
        normalized_matrices[1]),
                                               columns=PC_column_names)
    full_communities, full_graph, full_Q = phenograph.cluster(
        components_normed_data_full)
    adj_rand_scores = []
    for split in list(np.array(range(1, 10)) / 10):
        components_normed_data_downsample = pd.DataFrame(
            data=PCA_model.fit_transform(normalized_matrices[split]),
            columns=PC_column_names)
        downsample_communities, downsample_graph, downsample_Q = phenograph.cluster(
            components_normed_data_downsample)
        adj_rand_scores.append(
            adjusted_rand_score(full_communities, downsample_communities))
    return adj_rand_scores
Example #8
def determine_cell_clusters(data, k=50):
    """Run phenograph for clustering cells

    :param data: Principal components of the data.
    :param k: Number of neighbors for kNN graph construction
    :return: Clusters
    """
    # Cluster and cluster centroids
    communities, _, _ = phenograph.cluster(data, k=k)
    communities = pd.Series(communities, index=data.index)
    return communities
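A hedged usage sketch for determine_cell_clusters, assuming the principal components are held in a pandas DataFrame indexed by cell (the PCA step shown with scikit-learn is illustrative):

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

expr = pd.DataFrame(np.random.rand(300, 50),
                    index=[f"cell_{i}" for i in range(300)])   # toy expression matrix
pcs = pd.DataFrame(PCA(n_components=10).fit_transform(expr),
                   index=expr.index)

clusters = determine_cell_clusters(pcs, k=30)   # pandas Series of cluster labels indexed by cell
print(clusters.value_counts())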
Example #9
def phenograph_metaclustering(data: pd.DataFrame,
                              features: list,
                              verbose: bool = True,
                              summary_method: str = "median",
                              scale_method: str or None = None,
                              scale_kwargs: dict or None = None,
                              print_performance_metrics: bool = True,
                              **kwargs):
    """
    Meta-clustering with the PhenoGraph algorithm. This function
    summarises the clusters in 'data' (where cluster IDs should be contained in a column
    named 'cluster_label') and then 'clusters the clusters' using PhenoGraph.

    Parameters
    ----------
    data: Pandas.DataFrame
        Clustered data with columns for sample_id and cluster_label
    features: list
        Columns clustering is performed on
    summary_method: str (default="median")
        How to summarise the clusters for meta-clustering
    print_performance_metrics: bool (default=True)
        Print Calinski-Harabasz Index, Silhouette Coefficient, and Davies-Bouldin Index
        (see https://scikit-learn.org/stable/modules/clustering.html#clustering-performance-evaluation)
    verbose: bool (default=True)
        Whether to provide feedback to stdout
    scale_method: str, optional
        Perform scaling of centroids; see cytopy.transform.Scaler
    scale_kwargs: dict, optional
        Additional keyword arguments passed to Scaler
    kwargs:
        Keyword arguments passed to phenograph.cluster

    Returns
    -------
    Pandas.DataFrame
        Updated dataframe with a new column named 'meta_label' with the meta-clustering
        associations
    """
    vprint_ = vprint(verbose)
    vprint_("----- Phenograph meta-clustering ------")
    metadata = summarise_clusters(data, features, scale_method, scale_kwargs,
                                  summary_method)
    vprint_("...summarising clusters")
    vprint_("...clustering the clusters")
    communities, graph, q = phenograph.cluster(metadata[features].values,
                                               **kwargs)
    metadata["meta_label"] = communities
    if print_performance_metrics:
        clustering_performance(metadata[features],
                               metadata["meta_label"].values)
    vprint_("...assigning meta-labels")
    data = _assign_metalabels(data, metadata)
    vprint_("------ Complete ------")
    return data, graph, q
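A hedged sketch of how the meta-clustering above can be driven, assuming 'clustered' is a DataFrame that already carries 'sample_id' and 'cluster_label' columns (for example the output of phenograph_clustering in Example #1) and that the module's own helpers (summarise_clusters, _assign_metalabels, vprint) are importable:

meta, graph, q = phenograph_metaclustering(data=clustered,
                                           features=["CD3", "CD4", "CD8", "CD19"],   # illustrative markers
                                           summary_method="median",
                                           print_performance_metrics=False,
                                           k=15)   # forwarded to phenograph.cluster
print(meta["meta_label"].nunique(), "meta-clusters")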
Example #10
    def _one_fit(self):
        print("\nCreating downsampled doublets...")
        self._createDoublets()

        # Normalize combined augmented set
        print("Normalizing...")
        aug_counts = self.normalizer(
            np.append(self._raw_counts, self._raw_synthetics, axis=0))
        self._norm_counts = aug_counts[:self._num_cells]
        self._synthetics = aug_counts[self._num_cells:]

        print("Running PCA...")
        # Get phenograph results
        pca = PCA(n_components=self.n_components)
        print("Clustering augmented data set with Phenograph...\n")
        reduced_counts = pca.fit_transform(aug_counts)
        fullcommunities, _, _ = phenograph.cluster(
            reduced_counts, **self.phenograph_parameters)
        min_ID = min(fullcommunities)
        self.communities_ = fullcommunities[:self._num_cells]
        self.synth_communities_ = fullcommunities[self._num_cells:]
        community_sizes = [
            np.count_nonzero(fullcommunities == i)
            for i in np.unique(fullcommunities)
        ]
        print("Found communities [{0}, ... {2}], with sizes: {1}\n".format(
            min(fullcommunities), community_sizes, max(fullcommunities)))

        # Count number of fake doublets in each community and assign score
        # Number of synth/orig cells in each cluster.
        synth_cells_per_comm = collections.Counter(self.synth_communities_)
        orig_cells_per_comm = collections.Counter(self.communities_)
        community_IDs = orig_cells_per_comm.keys()
        community_scores = {
            i: float(synth_cells_per_comm[i]) /
            (synth_cells_per_comm[i] + orig_cells_per_comm[i])
            for i in community_IDs
        }
        scores = np.array([community_scores[i] for i in self.communities_])

        community_p_values = {
            i: hypergeom.cdf(synth_cells_per_comm[i], aug_counts.shape[0],
                             self._synthetics.shape[0],
                             synth_cells_per_comm[i] + orig_cells_per_comm[i])
            for i in community_IDs
        }
        p_values = np.array([community_p_values[i] for i in self.communities_])

        if min_ID < 0:
            scores[self.communities_ == -1] = np.nan
            p_values[self.communities_ == -1] = np.nan

        return scores, p_values
Example #11
def phenograph_cluster(data, f_save=None, max_samps=None):
    import phenograph
    if max_samps is not None and data.shape[0] > max_samps:
        print("Subsampling")
        #full_data = data.copy()
        data = data[np.random.choice(data.shape[0], max_samps, replace=False), :]

    communities, graph, Q = phenograph.cluster(data, k=100)
    print(communities)
    if f_save is not None:
        f_save = f_save.replace(".p", "") + ".p"
        pickle.dump(communities, open(f_save, "wb"))
    return communities
Example #12
    def condense_segmented_clusters(self, segmented_data, min_cluster_size=1):
        # Cluster the segmented counts
        n_cells = segmented_data.shape[0]
        n_regions = segmented_data.shape[1]
        n_neighbours = max(int(n_cells / 10), 2)  # avoid errors
        print(f"n_neighbours to be used: {str(n_neighbours)}")
        if 0 < min_cluster_size < 1:
            min_cluster_size = min_cluster_size * n_cells
        min_cluster_size = int(min_cluster_size)
        print(f"Setting min_cluster_size to {min_cluster_size}")

        # Cluster the normalised segmented data
        normalised_segmented_data = segmented_data / np.sum(
            segmented_data, axis=1)[:, np.newaxis]
        communities, graph, Q = phenograph.cluster(
            data=normalised_segmented_data,
            k=n_neighbours,
            n_jobs=1,
            jaccard=True,
            min_cluster_size=min_cluster_size)
        communities_df = pd.DataFrame(communities, columns=["cluster"])
        communities_df["cell_barcode"] = communities_df.index
        communities_df = communities_df[["cell_barcode", "cluster"]]
        community_dict = dict((Counter(communities)))
        community_ids = sorted(list(community_dict))

        # Compute (unnormalised) average counts of each cluster
        avg_segmented_counts = np.empty(segmented_data.shape)
        condensed_avg_segmented_counts = np.empty(
            (len(community_ids), n_regions))
        cluster_sizes = np.zeros((len(community_ids), ))

        # Offset -1 if there is one
        if np.min(community_ids) == -1:
            communities = np.array(communities) + 1
            community_ids = np.array(community_ids) + 1

        for id in community_ids:
            # Use robust mean?
            avg_segmented_counts[np.where(communities == id)[0]] = np.mean(
                segmented_data[np.where(communities == id)[0], :], axis=0)
            condensed_avg_segmented_counts[id] = avg_segmented_counts[np.where(
                communities == id)[0][0], :]
            cluster_sizes[id] = np.where(communities == id)[0].shape[0]

        print(f"Found {len(community_ids)} clusters.")
        print(f"Cluster sizes: {cluster_sizes}")

        self.cluster_assignments = communities

        return condensed_avg_segmented_counts, cluster_sizes, communities, Q
Example #13
def phenograph_metaclustering(data: pd.DataFrame,
                              features: list,
                              verbose: bool = True,
                              summary_method: callable = np.median,
                              norm_method: str or None = "norm",
                              norm_kwargs: dict or None = None,
                              **kwargs):
    """
    Meta-clustering with the PhenoGraph algorithm. This function
    summarises the clusters in 'data' (where cluster IDs should be contained in a column
    named 'cluster_id') and then 'clusters the clusters' using PhenoGraph.

    Parameters
    ----------
    data: Pandas.DataFrame
        Clustered data with columns for sample_id and cluster_id
    features: list
        Columns clustering is performed on
    summary_method: callable
        Function to apply to each sample_id/cluster_id group to summarise the
        clusters for meta-clustering
    norm_method: str or None
        If provided, method used to normalise data prior to summarising
    norm_kwargs: dict, optional
        Additional keyword arguments passed to CytoPy.flow.transform.scaler
    verbose: bool (default=True)
        Whether to provide feedback to stdout
    kwargs:
        Keyword arguments passed to phenograph.cluster

    Returns
    -------
    Pandas.DataFrame
        Updated dataframe with a new column named 'meta_label' with the meta-clustering
        associations
    """
    vprint_ = vprint(verbose)
    vprint_("----- Phenograph meta-clustering ------")
    norm_kwargs = norm_kwargs or {}
    metadata = _meta_preprocess(data, features, summary_method, norm_method,
                                **norm_kwargs)
    vprint_("...summarising clusters")
    vprint_("...clustering the clusters")
    communities, graph, q = phenograph.cluster(metadata[features].values,
                                               **kwargs)
    metadata["meta_label"] = communities
    vprint_("...assigning meta-labels")
    data = _asign_metalabels(data, metadata)
    vprint_("------ Complete ------")
    return data, graph, q
Example #14
def cluster_gene_trends(trends, k=150, n_jobs=-1):
    """Function to cluster gene trends
    :param trends: Matrix of gene expression trends
    :param k: K for nearest neighbor construction
    :param n_jobs: Number of jobs for parallel processing
    :return: Clustering of gene trends
    """

    # Standardize the trends
    trends = pd.DataFrame(StandardScaler().fit_transform(trends.T).T,
                          index=trends.index,
                          columns=trends.columns)

    # Cluster
    clusters, _, _ = phenograph.cluster(trends, k=k, n_jobs=n_jobs)
    clusters = pd.Series(clusters, index=trends.index)
    return clusters
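A hedged usage sketch for cluster_gene_trends with a toy genes x pseudotime-bins trend matrix; the matrix shape and the reduced k are illustrative, and phenograph plus StandardScaler are assumed to be imported in the function's module:

import numpy as np
import pandas as pd

trends = pd.DataFrame(np.random.rand(200, 50),
                      index=[f"gene_{i}" for i in range(200)],   # genes
                      columns=np.linspace(0, 1, 50))             # pseudotime bins

gene_clusters = cluster_gene_trends(trends, k=20)   # pandas Series of cluster labels indexed by gene
print(gene_clusters.value_counts())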
Example #15
def run_phenograph_approx_knn(X, k=20, outdir='', prefix='', **kwargs):
    """
    Runs Phenograph on an approximate kNN graph built from a cell x feature data matrix.

    Parameters
    ----------
    X: ndarray
        cell x feature data matrix
    k: int (default 20)
        number of nearest neighbors to use
    outdir: str (default '')
        if non-empty, directory to write the cluster assignments and a log file to
    prefix: str (default '')
        filename prefix for output written to outdir
    kwargs:
        additional keyword arguments passed to phenograph.cluster

    Returns
    -------
    communities
    graph
    Q : float

    """
    assert X.shape[0] > X.shape[1]
    fileprefix = '{}/{}'.format(outdir, prefix)
    knn_file = f'{fileprefix}.knn{k}_approx.mtx'
    if os.path.exists(knn_file):
        knn = mmread(knn_file)
    else:
        knn = get_approx_knn(X, k)  #.tolil()
        mmwrite(knn_file, knn)

    print('kNN graph shape:', knn.shape)
    communities, graph, Q = phenograph.cluster(knn, **kwargs)

    if outdir is not None and len(outdir) > 0:
        fileprefix = '{}/{}'.format(outdir, prefix)
        clusterfile = fileprefix + '.pg.txt'
        np.savetxt(clusterfile, communities, fmt='%i')

        logfile = fileprefix + '.pg.info.txt'
        with open(logfile, 'w') as f:
            f.write('k:{}\nQ:{}'.format(k, Q))

    return communities, graph, Q
Example #16
    def plot_phenograph(dataset='big clusters',
                        primary_metric='euclidean',
                        lowrank=False,
                        k=30, min_cluster_size=10):
        key = dataset + ' lowrank' if lowrank else dataset
        corr = correls[key]

        if dataset == 'big clusters':
            metadata = big_clusters_cells
            palette = 'Set2'
            cluster_col = 'cluster_id'
        elif dataset == 'amacrine':
            metadata = amacrine_cells
            palette = 'husl'
            cluster_col = 'cluster_id'
        community_col = 'community'

        communities, graph, Q = phenograph.cluster(
            corr, k=k, primary_metric=primary_metric,
            min_cluster_size=min_cluster_size)
        network = networkx.from_scipy_sparse_matrix(graph)
        positions = networkx.spring_layout(network)

        nodes_source = ColumnDataSource(get_nodes_specs(
            positions, metadata, corr.index, communities,
            other_cluster_col=cluster_col,
            community_col=community_col, palette=palette))
        edges_source = ColumnDataSource(get_edges_specs(network, positions))

        # --- First tab: KNN clustering --- #
        tab1 = plot_graph(nodes_source, edges_source, legend_col=community_col,
                          color_col=f'{community_col}_color', tab=True,
                          title='KNN Clustering')

        # --- Second tab: Clusters from paper --- #
        tab2 = plot_graph(nodes_source, edges_source,
                          legend_col='cluster_n_celltype', tab=True,
                          color_col='other_cluster_color',
                          title="Clusters from paper")

        tabs = Tabs(tabs=[tab1, tab2])
        show(tabs)
Example #17
def pg_cluster(file_name, k=30, min_cluster_size=10):
    '''
    Run PhenoGraph clustering

    :param file_name: file base name
    :param k: kNN's k (number of nearest neighbours)
    :param min_cluster_size: minimal number of grouped points to form a cluster
    '''
    file_path_single_normalized = build_file_path(file_name,
                                                  suffix='normalized')
    file_path_single_labeled = build_file_path(file_name, suffix='labeled')
    file_path_single_cluster = build_file_path(file_name,
                                               suffix=COL_NAME_CLUSTER)

    markers = read_markers()
    data = load_csv(file_path_single_normalized).filter(
        items=markers).to_numpy()
    assert data.shape[1] == len(markers)
    print('\tCluster {} points in {}'.format(data.shape[0], file_name),
          flush=True)
    communities, graph, Q = cluster(data,
                                    k=k,
                                    nn_method='kdtree',
                                    min_cluster_size=min_cluster_size)
    data, graph = None, None
    print('Found {} clusters'.format(len(unique(communities))), flush=True)

    frame = load_csv(file_path_single_normalized)
    frame[COL_NAME_CLUSTER] = communities
    save_csv(frame, file_path_single_labeled)

    # medians & counts
    cluster_frame = frame.groupby(COL_NAME_CLUSTER, as_index=False).median()
    cluster_frame = cluster_frame[
        cluster_frame[COL_NAME_CLUSTER] !=
        -1]  # skip -1 which means under min cluster size
    cluster_frame[COL_NAME_COUNT_CELL] = frame.groupby(
        COL_NAME_CLUSTER)[COL_NAME_CLUSTER].count()
    save_csv(cluster_frame, file_path_single_cluster)

    print('Clustering successful', flush=True)
Example #18
def tsne(raw_counts, labels, n_components=30, n_jobs=-1, show=False, save=None):
    """Produce a tsne plot of the data with doublets in black

    Args:
        raw_counts (ndarray): cells by genes count matrix
        labels (ndarray): predicted doublets from predict method
        n_components (int, optional): number of PCs to use prior to TSNE
        n_jobs (int, optional): number of cores to use for TSNE, -1 for all
        show (bool, optional): If True, runs plt.show()
        save (str, optional): filename for saved figure,
            figure not saved by default
    Returns:
        matplotlib figure
        ndarray: tsne reduction
    """
    norm_counts = normalize_counts(raw_counts)
    reduced_counts = PCA(n_components=n_components,
                         svd_solver='randomized').fit_transform(norm_counts)
    communities, _, _ = phenograph.cluster(reduced_counts)
    tsne_counts = TSNE(n_jobs=n_jobs).fit_transform(reduced_counts)

    fig, axes = plt.subplots(1, 1, figsize=(3, 3), dpi=200)
    axes.scatter(tsne_counts[:, 0], tsne_counts[:, 1],
                 c=communities, cmap=plt.cm.tab20, s=1)
    axes.scatter(tsne_counts[:, 0][labels], tsne_counts[:, 1]
                 [labels], s=3, edgecolor='k', facecolor='k')
    axes.set_title('Cells with Detected\n Doublets in Black')
    plt.xticks([])
    plt.yticks([])
    axes.set_xlabel('{} doublets out of {} cells.\n {}%  across-type doublet rate.'.format(
        np.sum(labels), raw_counts.shape[0], np.round(100 * np.sum(labels) / raw_counts.shape[0], 2)))

    if show is True:
        plt.show()
    if isinstance(save, str):
        fig.savefig(save, format='pdf', bbox_inches='tight')

    return fig, tsne_counts
Example #19
def main():
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='author:	{0}	mail:	{1}'.format(__author__, __mail__))
    parser.add_argument('-s',
                        '--seq',
                        help='fasta file',
                        dest='seq',
                        required=True)
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.DEBUG,
        format=
        "%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s - %(message)s"
    )

    file = open(
        "/asnas/wangqf_group/suyx/Project_scRNA_seq/Analysis/PublicData/cell/GSE116256/suppl/GSM3587923_AML1012-D0.dem.txt"
    )
    data = file.readlines()
    file.close()
    temp = data[:]
    cellID = data[0].strip().split("\t")[1:]
    gene = [i.strip().split("\t")[0] for i in data[1:]]
    counts = [i.strip().split("\t")[1:] for i in data[1:]]
    counts = [[int(j) for j in i] for i in counts]
    counts_np = numpy.array(counts)
    counts_np = counts_np.T
    communities, graph, Q = phenograph.cluster(counts_np)
Example #20
PATH_M = 'output/CD/Euclidean/'
FILENAMES = sorted(listdir(PATH_M))
k_ = 30
ari = np.zeros((14))
v = np.zeros((14))
n_clusters = np.zeros((14))
index = []

for i in np.arange(1, 15):
    df = pd.read_table(PATH_M + str(i) + '.' + str(i),
                       sep=',',
                       index_col=None,
                       header=None)
    features = list(df.columns)[:-1]
    target = df.iloc[:, -1]
    communities, graph, Q = phenograph.cluster(df.loc[:, features],
                                               k=k_,
                                               primary_metric='euclidean')
    ari[i - 1] = adjusted_rand_score(target, communities)
    v[i - 1] = v_measure_score(target, communities)
    n_clusters[i - 1] = len(np.unique(communities))

    print(i, v[i - 1], n_clusters[i - 1])

ari = pd.DataFrame(ari, index=np.arange(1, 15))
v = pd.DataFrame(v, index=np.arange(1, 15))
n_clusters = pd.DataFrame(n_clusters, index=np.arange(1, 15))

ari.to_csv('output/CD/ARI_k=' + str(k_) + '_E_rerun.csv')
v.to_csv('output/CD/V_k=' + str(k_) + '_E_rerun.csv')
n_clusters.to_csv('output/CD/N_clusters_k=' + str(k_) + '_E_rerun.csv')
Example #21
import argparse
import pandas as pd
import phenograph

# This script runs Phenograph 

# Get input arguments
parser = argparse.ArgumentParser()
parser.add_argument('--data', help='Infile (rows = cells, columns = features)')
parser.add_argument('--out', help='Outfile (clusters)')
parser.add_argument('-k', help='Number of neighbors for kNN graph', type=int, default=50)
parser.add_argument('--metric', help='Distance metric to use', choices=['manhattan', 'euclidean', 'cosine', 'correlation'], default='cosine')
parser.add_argument('--ncores', help='Number of cores to use', type=int, default=-1)
args = parser.parse_args()

# Run phenograph
data = pd.read_table(args.data)
communities, graph, Q = phenograph.cluster(data, k=args.k, primary_metric=args.metric, n_jobs=args.ncores)

# Write output
with open(args.out, 'w') as out:
    for xi in communities:
        out.write('%s\n' % xi)
Example #22
    normalise = args.normalise
    n_neighbours = args.n_neighbours
    input_data = np.loadtxt(input_data_file, delimiter=',')

    # Cluster the segmented counts
    N, P = input_data.shape
    K = max(int(N / 10), 2)  # avoid errors
    if n_neighbours:
        K = int(n_neighbours)

    print(f"n_neighbours to be used: {str(K)}")

    # Cluster the normalised segmented data
    if normalise:
        print("Will cluster normalised data.")
        input_data = input_data / np.sum(input_data, axis=1)[:, np.newaxis] * P
    communities, graph, Q = phenograph.cluster(data=input_data,
                                               k=K,
                                               n_jobs=1,
                                               jaccard=True,
                                               min_cluster_size=1,
                                               seed=42)

    print(f"Found {len(np.unique(communities))} clusters.")

    input_data_file = os.path.splitext(input_data_file)[0]
    out_file = input_data_file + '_cluster_assignments.txt'
    np.savetxt(out_file, communities, delimiter=",")

    print(f"Saved the cluster assignments into {out_file}.")
Example #23
def scalable_cluster(latent_code,
                     kmeans_num=500,
                     cluster_num=400,
                     display_step=50,
                     phenograh_neighbor=30
                     ):
    '''
    Scalable cluster:
    To perform graph clustering on large-scale data, we designed a scalable clustering strategy combining k-means and PhenoGraph.
    Briefly, we divide cells into M (kmeans_num) groups of equal size and perform K-means (cluster_num) clustering on each group independently.
    The whole dataset is thus split into M×K clusters and only the cluster centroids are passed to PhenoGraph for graph clustering.
    Finally, each cell is assigned to a graph cluster according to the cluster label of its nearest centroid.

    Parameters:

        latent_code:    n*m matrix; n = number of cells, m = dimension of feature representation.
        kmeans_num:     number of independent K-means clusterings used. This is also the subset number.
        cluster_num:    cluster number for each K-means clustering. This is also the "n_clusters" in KMeans function in sklearn package.
        display_step:   displaying the process of K-means clustering.
        phenograh_neighbor: "k" parameter in PhenoGraph package.

    Output:

        Cluster labels for input cells.


    Altschuler & Wu Lab 2018. 
    Software provided as is under Apache License 2.0.
    '''

    print('Scalable clustering:')
    print('Use %d subsets of cells for initially clustering...' % kmeans_num)

    stamp = np.floor(np.linspace(0, latent_code.shape[0], kmeans_num + 1))
    stamp = stamp.astype(int)

    cluster_ceter = np.zeros([kmeans_num * cluster_num, latent_code.shape[1]])
    mapping_sample_kmeans = np.zeros(latent_code.shape[0])

    for i in range(kmeans_num):

        low_bound = stamp[i]
        upp_bound = stamp[i + 1]
        sample_range = np.arange(low_bound, upp_bound)
        select_sample = latent_code[sample_range, :]

        kmeans = KMeans(n_clusters=cluster_num,
                        random_state=0).fit(select_sample)
        label = kmeans.labels_

        for j in range(cluster_num):
            cluster_sample_idx = np.nonzero(label == j)[0]
            cluster_sample = select_sample[cluster_sample_idx, :]
            cluster_ceter[i * cluster_num + j,
                          :] = np.mean(cluster_sample, axis=0)
            mapping_sample_kmeans[sample_range[cluster_sample_idx]
                                  ] = i * cluster_num + j

        if i % display_step == 0:
            print('\tK-means clustering for %d subset.' % i)

    print('Finished initial clustering by K-means.')
    print('Start PhenoGraph clustering...\n')

    label_pheno, graph, Q = phenograph.cluster(
        cluster_ceter, k=phenograh_neighbor, n_jobs=1)

    label = np.zeros(latent_code.shape[0])
    for i in range(label_pheno.max() + 1):
        center_index = np.nonzero(label_pheno == i)[0]
        for j in center_index:
            sample_index = np.nonzero(mapping_sample_kmeans == j)[
                0]  # samples belong to this center
            label[sample_index] = i
    print('Finished scalable PhenoGraph clustering.')

    return label
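A hedged usage sketch for scalable_cluster on a toy latent matrix; kmeans_num and cluster_num are scaled down from the defaults so the example stays small, and KMeans (scikit-learn) and phenograph are assumed to be imported in the function's module:

import numpy as np

latent_code = np.random.rand(5000, 50)   # n cells x m latent features (toy data)

labels = scalable_cluster(latent_code,
                          kmeans_num=10,          # 10 subsets of 500 cells each
                          cluster_num=20,         # 20 k-means clusters per subset
                          display_step=5,
                          phenograh_neighbor=15)  # parameter name as defined above
print(np.unique(labels).size, 'graph clusters')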
Example #24
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import v_measure_score, accuracy_score
from sklearn.model_selection import LeaveOneOut
from statsmodels.stats.multitest import multipletests

patient_id = [1, 9, 14, 19, 20, 22][0]
df_E = pd.read_csv('Output_TDMLMJ/Euclidean/Euclidean_' + str(patient_id),
                   index_col=None,
                   header=None)
labels = pd.read_csv('Labels_test_CD_CTRL_HMIS2.csv', index_col=0, header=0)
for i in np.arange(0, 28):
    labels.loc[int(i * 10000):int((i + 1) * 10000), 'Sample ID'] = i + 1

features_E = list(df_E.columns[:-1])
communities_E, graph_E, Q_E = phenograph.cluster(df_E.loc[:, features_E], k=30)
labels['PhenoGraph_E'] = communities_E

ct_E = pd.crosstab(labels.loc[:, 'Sample ID'],
                   labels.loc[:, 'PhenoGraph_E'],
                   normalize=True)
wrs_E = np.zeros(len(ct_E.columns))
p_wrs_E = np.zeros(len(ct_E.columns))

for cluster in ct_E.columns:
    wrs_E[cluster], p_wrs_E[cluster] = stats.mannwhitneyu(
        ct_E.iloc[0:14, cluster], ct_E.iloc[14:, cluster])

q_wrs_E = multipletests(p_wrs_E, method='fdr_bh')[1]

results_E = pd.DataFrame({'WRS': wrs_E, 'P': p_wrs_E, 'Q': q_wrs_E})
Example #25
        print(term_df.head())
        gc.collect()

# Vectorize
vectorizer = TfidfVectorizer(stop_words="english",
                             strip_accents="ascii",
                             max_features=2**12)
X = vectorizer.fit_transform(tqdm(term_df["body_text"].values))
gc.collect()
print(X.shape)

# In[5]:

#%%
# Louvain clustering of text
communities, graph, Q = phenograph.cluster(X, k=100)
num_lclusters = len(set(communities))
print("Louvain clustering classified the papers into {} clusters".format(
    num_lclusters))

#K-means clustering of text
k = num_lclusters  # equate the number of clusters for both for comparison
kmeans = MiniBatchKMeans(n_clusters=k)
y_pred = kmeans.fit_predict(X)
num_kclusters = len(set(y_pred))
print("K-means clustering classified the papers into {} clusters".format(
    num_kclusters))

# In[6]:

# Dimensionality reduction with UMAP
Example #26
scdata = magic.mg.SCData.from_csv(
    "/Users/vincentliu/Desktop/Pe'er Lab/Summer 2017/Data/pbmc_4k_short.csv")

# log transform the data with default pseudocount 0.1
print("log-transforming the data...")
scdata.log_transform_scseq_data()
"""
# run tsne on the processed data and store the tsne values into a pd dataframe
print("running tSNE on the data...")
scdata.run_tsne()
tsne = scdata.tsne"""

# run phenograph on the processed data
print("starting PhenoGraph...")
processed = scdata.data
communities, graph, Q = phenograph.cluster(processed, k=15)
print(len(communities))
communities = ['0' if x == 3 else '1' for x in communities]
print(len(communities))
"""
toPlot = tsne.assign(com=pd.Series(communities).values)
clusterRec = {}
for index, row in toPlot.iterrows():
    if row['com'] in clusterRec:
        count = clusterRec[row['com']][2]
        new1 = (clusterRec[row['com']][0] * count + row['tSNE1']) / (count+1)
        new2 = (clusterRec[row['com']][1] * count + row['tSNE2']) / (count+1)
        clusterRec[row['com']] = [new1, new2, count+1]
    else:
        clusterRec[row['com']] = [row['tSNE1'], row['tSNE2'], 1]
Example #27
#!/usr/bin/python
import phenograph
import numpy

## Read data.
data = numpy.loadtxt(
    'intersect_all_first_1m.tsv.gz')  #, dtype=<class 'integer'>)

## Run phenograph
communities, graph, Q = phenograph.cluster(data)

## Save.
numpy.savetxt('communities.txt.gz', communities, fmt='%.d')
#numpy.savetxt('graph.txt.gz', graph)
Example #28
def muse_fit_predict(data_x,
                     data_y,
                     label_x,
                     label_y,
                     latent_dim=100,
                     n_epochs=500,
                     lambda_regul=5,
                     lambda_super=5):
    """
        MUSE model fitting and predicting:
          This function is used to train the MUSE model on multi-modality data

        Parameters:
          data_x:       input for transcript modality; matrix of  n * p, where n = number of cells, p = number of genes.
          data_y:       input for morphological modality; matrix of n * q, where n = number of cells, q is the feature dimension.
          label_x:      initial reference cluster label for transcriptional modality.
          label_y:      initial reference cluster label for morphological modality.
          latent_dim:   feature dimension of joint latent representation.
          n_epochs:     maximal epoch used in training.
          lambda_regul: weight for regularization term in the loss function.
          lambda_super: weight for supervised learning loss in the loss function.

        Output:
          latent:       joint latent representation learned by MUSE.
          reconstruct_x:reconstructed feature matrix corresponding to input data_x.
          reconstruct_y:reconstructed feature matrix corresponding to input data_y.
          latent_x:     modality-specific latent representation corresponding to data_x.
          latent_y:     modality-specific latent representation corresponding to data_y.

        Feng Bao @ Altschuler & Wu Lab @ UCSF 2022.
        Software provided as is under MIT License.
    """

    """ initial parameter setting """
    # parameter setting for neural network
    n_hidden = 128  # number of hidden node in neural network
    learn_rate = 1e-4  # learning rate in the optimization
    batch_size = 64  # number of cells in the training batch
    n_epochs_init = 200  # number of training epoch in model initialization
    print_epochs = 50  # epoch interval to display the current training loss
    cluster_update_epoch = 200  # epoch interval to update modality-specific clusters

    # read data-specific parameters from inputs
    feature_dim_x = data_x.shape[1]
    feature_dim_y = data_y.shape[1]
    n_sample = data_x.shape[0]

    # GPU configuration
    # config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True

    """ construct computation graph using TensorFlow """
    tf.reset_default_graph()

    # raw data from two modalities
    x = tf.placeholder(tf.float32, shape=[None, feature_dim_x], name='input_x')
    y = tf.placeholder(tf.float32, shape=[None, feature_dim_y], name='input_y')

    # labels inputted for references
    ref_label_x = tf.placeholder(tf.float32, shape=[None], name='ref_label_x')
    ref_label_y = tf.placeholder(tf.float32, shape=[None], name='ref_label_y')

    # hyperparameter in triplet loss
    triplet_lambda = tf.placeholder(tf.float32, name='triplet_lambda')
    triplet_margin = tf.placeholder(tf.float32, name='triplet_margin')

    # network architecture
    z, x_hat, y_hat, encode_x, encode_y, loss, \
    reconstruction_error, weight_penalty, \
    trip_loss_x, trip_loss_y = structured_embedding(x,
                                                    y,
                                                    ref_label_x,
                                                    ref_label_y,
                                                    latent_dim,
                                                    triplet_margin,
                                                    n_hidden,
                                                    lambda_regul,
                                                    triplet_lambda)
    # optimization operator
    train_op = tf.train.AdamOptimizer(learn_rate).minimize(loss)
    print('++++++++++ MUSE for multi-modality single-cell analysis ++++++++++')
    """ MUSE optimization """
    total_batch = int(n_sample / batch_size)

    with tf.Session() as sess:

        """ initialization of autoencoder architecture for MUSE """
        print('MUSE initialization')
        # global parameter initialization
        sess.run(tf.global_variables_initializer(), feed_dict={triplet_lambda: 0,
                                                               triplet_margin: 0})

        for epoch in range(n_epochs_init):
            # randomly permute samples
            random_idx = np.random.permutation(n_sample)
            data_train_x = data_x[random_idx, :]
            data_train_y = data_y[random_idx, :]

            for i in range(total_batch):
                # input data batches
                offset = (i * batch_size) % (n_sample)
                batch_x_input = data_train_x[offset:(offset + batch_size), :]
                batch_y_input = data_train_y[offset:(offset + batch_size), :]

                # initialize parameters without self-supervised loss (triplet_lambda=0)
                sess.run(train_op,
                         feed_dict={x: batch_x_input,
                                    y: batch_y_input,
                                    ref_label_x: np.zeros(batch_x_input.shape[0]),
                                    ref_label_y: np.zeros(batch_y_input.shape[0]),
                                    triplet_lambda: 0,
                                    triplet_margin: 0})

            # calculate and print loss terms for current epoch
            if epoch % print_epochs == 0:
                L_total, L_reconstruction, L_weight = \
                    sess.run((loss, reconstruction_error, weight_penalty),
                             feed_dict={x: data_train_x,
                                        y: data_train_y,
                                        ref_label_x: np.zeros(data_train_x.shape[0]),  # no use as triplet_lambda=0
                                        ref_label_y: np.zeros(data_train_y.shape[0]),  # no use as triplet_lambda=0
                                        triplet_lambda: 0,
                                        triplet_margin: 0})

                print(
                    "epoch: %d, \t total loss: %03.5f,\t reconstruction loss: %03.5f,\t sparse penalty: %03.5f"
                    % (epoch, L_total, L_reconstruction, L_weight))

        # estimate the margin for the triplet loss
        latent, reconstruct_x, reconstruct_y = \
            sess.run((z, x_hat, y_hat),
                     feed_dict={x: data_x,
                                y: data_y,
                                ref_label_x: np.zeros(data_x.shape[0]),
                                ref_label_y: np.zeros(data_y.shape[0]),
                                triplet_lambda: 0,
                                triplet_margin: 0})
        latent_pd_matrix = pdist(latent, 'euclidean')
        latent_pd_sort = np.sort(latent_pd_matrix)
        select_top_n = int(latent_pd_sort.size * 0.2)
        margin_estimate = np.median(latent_pd_sort[-select_top_n:]) - np.median(latent_pd_sort[:select_top_n])

        # refine MUSE parameters with reference labels and triplet losses
        for epoch in range(n_epochs_init):
            # randomly permute samples
            random_idx = np.random.permutation(n_sample)
            data_train_x = data_x[random_idx, :]
            data_train_y = data_y[random_idx, :]
            label_train_x = label_x[random_idx]
            label_train_y = label_y[random_idx]

            for i in range(total_batch):
                # data batches
                offset = (i * batch_size) % (n_sample)
                batch_x_input = data_train_x[offset:(offset + batch_size), :]
                batch_y_input = data_train_y[offset:(offset + batch_size), :]
                label_x_input = label_train_x[offset:(offset + batch_size)]
                label_y_input = label_train_y[offset:(offset + batch_size)]

                # refine parameters
                sess.run(train_op,
                         feed_dict={x: batch_x_input,
                                    y: batch_y_input,
                                    ref_label_x: label_x_input,
                                    ref_label_y: label_y_input,
                                    triplet_lambda: lambda_super,
                                    triplet_margin: margin_estimate})

            # calculate loss on all input data for current epoch
            if epoch % print_epochs == 0:
                L_total, L_reconstruction, L_weight, L_trip_x, L_trip_y = \
                    sess.run((loss, reconstruction_error, weight_penalty, trip_loss_x, trip_loss_y),
                             feed_dict={x: data_train_x,
                                        y: data_train_y,
                                        ref_label_x: label_train_x,
                                        ref_label_y: label_train_y,
                                        triplet_lambda: lambda_super,
                                        triplet_margin: margin_estimate})

                print(
                    "epoch: %d, \t total loss: %03.5f,\t reconstruction loss: %03.5f,\t sparse penalty: %03.5f,\t x triplet: %03.5f,\t y triplet: %03.5f"
                    % (epoch, L_total, L_reconstruction, L_weight, L_trip_x, L_trip_y))

        # update cluster labels based on modality-specific latents
        latent_x, latent_y = \
            sess.run((encode_x, encode_y),
                     feed_dict={x: data_x,
                                y: data_y,
                                ref_label_x: label_x,
                                ref_label_y: label_y,
                                triplet_lambda: lambda_super,
                                triplet_margin: margin_estimate})

        # update cluster labels using PhenoGraph
        label_x_update, _, _ = phenograph.cluster(latent_x)
        label_y_update, _, _ = phenograph.cluster(latent_y)
        print('Finish initialization of MUSE')

        ''' Training of MUSE '''
        for epoch in range(n_epochs):
            # randomly permute samples
            random_idx = np.random.permutation(n_sample)
            data_train_x = data_x[random_idx, :]
            data_train_y = data_y[random_idx, :]
            label_train_x = label_x_update[random_idx]
            label_train_y = label_y_update[random_idx]

            # loop over all batches
            for i in range(total_batch):
                # batch data
                offset = (i * batch_size) % (n_sample)
                batch_x_input = data_train_x[offset:(offset + batch_size), :]
                batch_y_input = data_train_y[offset:(offset + batch_size), :]
                batch_label_x_input = label_train_x[offset:(offset + batch_size)]
                batch_label_y_input = label_train_y[offset:(offset + batch_size)]

                sess.run(train_op,
                         feed_dict={x: batch_x_input,
                                    y: batch_y_input,
                                    ref_label_x: batch_label_x_input,
                                    ref_label_y: batch_label_y_input,
                                    triplet_lambda: lambda_super,
                                    triplet_margin: margin_estimate})

            # calculate and print losses on whole training dataset
            if epoch % print_epochs == 0:
                L_total, L_reconstruction, L_weight, L_trip_x, L_trip_y = \
                    sess.run((loss, reconstruction_error, weight_penalty, trip_loss_x, trip_loss_y),
                             feed_dict={x: data_train_x,
                                        y: data_train_y,
                                        ref_label_x: label_train_x,
                                        ref_label_y: label_train_y,
                                        triplet_lambda: lambda_super,
                                        triplet_margin: margin_estimate})
                # print cost every epoch
                print(
                    "epoch: %d, \t total loss: %03.5f,\t reconstruction loss: %03.5f,\t sparse penalty: %03.5f,\t x triplet loss: %03.5f,\t y triplet loss: %03.5f"
                    % (epoch, L_total, L_reconstruction, L_weight, L_trip_x, L_trip_y))

            # update cluster labels based on new modality-specific latent representations
            if epoch % cluster_update_epoch == 0:
                latent_x, latent_y = \
                    sess.run((encode_x, encode_y),
                             feed_dict={x: data_x,
                                        y: data_y,
                                        ref_label_x: label_x,
                                        ref_label_y: label_y,
                                        triplet_lambda: lambda_super,
                                        triplet_margin: margin_estimate})

                # use PhenoGraph to obtain cluster label
                label_x_update, _, _ = phenograph.cluster(latent_x)
                label_y_update, _, _ = phenograph.cluster(latent_y)

        """ MUSE output """
        latent, reconstruct_x, reconstruct_y, latent_x, latent_y = \
            sess.run((z, x_hat, y_hat, encode_x, encode_y),
                     feed_dict={x: data_x,
                                y: data_y,
                                ref_label_x: label_x,  # no effects to representations
                                ref_label_y: label_y,  # no effects to representations
                                triplet_lambda: lambda_super,
                                triplet_margin: margin_estimate})

        print('++++++++++ MUSE completed ++++++++++')

    return latent, reconstruct_x, reconstruct_y, latent_x, latent_y
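A hedged sketch of how muse_fit_predict can be called on toy data: the initial per-modality reference labels are obtained with PhenoGraph, in line with the label updates the function itself performs, and the TensorFlow 1.x dependencies plus the structured_embedding network are assumed to be available in the function's module:

import numpy as np
import phenograph

data_x = np.random.rand(1000, 500)   # transcript modality: cells x genes (toy)
data_y = np.random.rand(1000, 100)   # morphology modality: cells x image features (toy)

label_x, _, _ = phenograph.cluster(data_x)   # initial reference labels per modality
label_y, _, _ = phenograph.cluster(data_y)

latent, reconstruct_x, reconstruct_y, latent_x, latent_y = muse_fit_predict(
    data_x, data_y, label_x, label_y,
    latent_dim=100, n_epochs=100)
print(latent.shape)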
Example #29
q = np.zeros((1, n_randstart))
t = np.zeros((1, n_randstart))
#        nmi = np.zeros((1,n_randstart))
#        ari = np.zeros((1,n_randstart))
#        fm = np.zeros((1,n_randstart))

for j in range(n_randstart):
    print(
        "======================================================================="
    )
    start = clock()
    communities, graph, Q = pg.cluster(centroids,
                                       k=15,
                                       directed=False,
                                       prune=True,
                                       min_cluster_size=2,
                                       jaccard=True,
                                       primary_metric='euclidean',
                                       n_jobs=-1,
                                       q_tol=1e-4)
    #            kmeans = KMeans(n_clusters=2, random_state=0).fit(suface_marker_data_normalized)
    stop = clock()

    #            communities = kmeans.labels_
    #            c[:,j] = communities   #labels
    c[:, j] = communities
    q[:, j] = Q  #modularity
    t[:, j] = stop - start  #running time

#            calculate other validation parameters
#            nmi[:,j] = normalized_mutual_info_score(ground_truth,communities)
Example #30
import phenograph
import numpy
import networkx
from scipy.sparse import coo_matrix
from scipy.io import mmread, mmwrite
from numpy import genfromtxt
data = genfromtxt('xxCsvPathxx', delimiter=',')
communities, graph, Q = phenograph.cluster(data)
numpy.savetxt('xxSaveCsvPathxx', communities, delimiter=",")
mmwrite('xxSaveGraphPathxx', graph)
text_file = open('xxSaveQPathxx', "w")
text_file.write(str(Q))
text_file.close()
G = networkx.Graph(graph)
networkx.write_pajek(G, path = 'xxPajekPathxx', encoding='UTF-8')
networkx.write_gml(G, path = 'xxGmlPathxx')
networkx.write_edgelist(G, path = 'xxTabPathxx', delimiter = '\t')
Example #31
def phenograph(
    data: Union[np.ndarray, spmatrix],
    *,
    k: int = 30,
    directed: bool = False,
    prune: bool = False,
    min_cluster_size: int = 10,
    jaccard: bool = True,
    primary_metric: str = 'euclidean',
    n_jobs: int = -1,
    q_tol: float = 1e-3,
    louvain_time_limit: int = 2000,
    nn_method: str = 'kdtree',
) -> Tuple[np.ndarray, spmatrix, float]:
    """\
    PhenoGraph clustering [Levine15]_.

    Parameters
    ----------
    data
        Array of data to cluster or sparse matrix of k-nearest neighbor graph.
        If ndarray, n-by-d array of n cells in d dimensions,
        if sparse matrix, n-by-n adjacency matrix.
    k
        Number of nearest neighbors to use in first step of graph construction.
    directed
        Whether to use a symmetric (default) or asymmetric (“directed”) graph.
        The graph construction process produces a directed graph,
        which is symmetrized by one of two methods (see below).
    prune
        Whether to symmetrize by taking the average (`prune=False`) or product
        (`prune=True`) between the graph and its transpose.
    min_cluster_size
        Cells that end up in a cluster smaller than min_cluster_size are
        considered outliers and are assigned to -1 in the cluster labels.
    jaccard
        If `True`, use Jaccard metric between k-neighborhoods to build graph.
        If `False`, use a Gaussian kernel.
    primary_metric : {`'euclidean'`, `'manhattan'`, `'correlation'`, `'cosine'`}
        Distance metric to define nearest neighbors.
        Note that performance will be slower for correlation and cosine.
    n_jobs
        Nearest Neighbors and Jaccard coefficients will be computed in parallel
        using `n_jobs`. If `n_jobs=-1`, it is determined automatically.
    q_tol
        Tolerance (i.e., precision) for monitoring modularity optimization.
    louvain_time_limit
        Maximum number of seconds to run modularity optimization.
        If exceeded the best result so far is returned.
    nn_method : {`'kdtree'`, `'brute'`}
        Whether to use brute force or kdtree for nearest neighbor search.
        For very large high-dimensional data sets, brute force
        (with parallel computation) performs faster than kdtree.

    Returns
    -------
    communities : numpy.ndarray
        Integer array of community assignments for each row in data.
    graph : scipy.sparse.spmatrix
        The graph that was used for clustering.
    Q : float
        The modularity score for communities on graph.


    Example
    -------
    >>> from anndata import AnnData
    >>> import scanpy as sc
    >>> import scanpy.external as sce
    >>> import numpy as np
    >>> import pandas as pd

    Assume adata is your annotated data which has the normalized data.

    Then do PCA:

    >>> sc.tl.pca(adata, n_comps = 100)

    Compute phenograph clusters:

    >>> result = sce.tl.phenograph(adata.obsm['X_pca'], k = 30)

    Embed the phenograph result into adata as a *categorical* variable (this helps in plotting):

    >>> adata.obs['pheno'] = pd.Categorical(result[0])

    Check by typing "adata" and you should see under obs a key called 'pheno'.

    Now to show phenograph on tSNE (for example):

    Compute tSNE:

    >>> sc.tl.tsne(adata, random_state = 7)

    Plot phenograph clusters on tSNE:

    >>> sc.pl.tsne(adata, color = ['pheno'], s = 100, palette = sc.pl.palettes.vega_20_scanpy, legend_fontsize = 10)

    Cluster and cluster centroids for input Numpy ndarray

    >>> df = np.random.rand(1000,40)
    >>> df.shape
    (1000, 40)
    >>> result = sce.tl.phenograph(df, k=50)
    Finding 50 nearest neighbors using minkowski metric and 'auto' algorithm
    Neighbors computed in 0.16141605377197266 seconds
    Jaccard graph constructed in 0.7866239547729492 seconds
    Wrote graph to binary file in 0.42542195320129395 seconds
    Running Louvain modularity optimization
    After 1 runs, maximum modularity is Q = 0.223536
    After 2 runs, maximum modularity is Q = 0.235874
    Louvain completed 22 runs in 1.5609488487243652 seconds
    PhenoGraph complete in 2.9466471672058105 seconds

    New results can be pushed into adata object:

    >>> dframe = pd.DataFrame(data=df, columns=range(df.shape[1]),index=range(df.shape[0]) )
    >>> adata = AnnData( X=dframe, obs=dframe, var=dframe)
    >>> adata.obs['pheno'] = pd.Categorical(result[0])
    """
    start = logg.info('PhenoGraph clustering')

    try:
        import phenograph
    except ImportError:
        raise ImportError(
            'please install phenograph: '
            'pip3 install git+https://github.com/jacoblevine/phenograph.git')

    communities, graph, Q = phenograph.cluster(
        data=data,
        k=k,
        directed=directed,
        prune=prune,
        min_cluster_size=min_cluster_size,
        jaccard=jaccard,
        primary_metric=primary_metric,
        n_jobs=n_jobs,
        q_tol=q_tol,
        louvain_time_limit=louvain_time_limit,
        nn_method=nn_method,
    )

    logg.info('    finished', time=start)

    return communities, graph, Q
Example #32
def RUN_MAIN():

    # 1. Load gene expression matrix of simulated data
    # gene expression with simulated dropouts
    counts_drop = pd.read_csv('counts_1.csv', header=0, index_col=0)
    # ground truth subpopulation assignment
    cellinfo = pd.read_csv('cellinfo_1.csv', header=0, index_col=0)

    group = cellinfo.Group
    label_ground_truth = []
    for g in group:
        g = int(g.split('Group')[1])
        label_ground_truth.append(g)

    # 2. Normalize gene expression based on scanpy (normalize each cell to have same library size)
    # matrix of cells x genes
    gene_expression = sc.AnnData(counts_drop.values)
    # normalize each cell to have same count number
    sc.pp.normalize_per_cell(gene_expression)
    # update datastructure to use normalized data
    gene_expression = gene_expression.X

    latent_dim = 50

    # 3. scScope learning
    if gene_expression.shape[0] >= 100000:
        DI_model = DeepImpute.train(gene_expression,
                                    latent_dim,
                                    T=2,
                                    batch_size=512,
                                    max_epoch=10,
                                    num_gpus=4)
    else:
        DI_model = DeepImpute.train(gene_expression,
                                    latent_dim,
                                    T=2,
                                    batch_size=64,
                                    max_epoch=300,
                                    num_gpus=4)

    # 4. latent representations and imputed expressions
    latent_code, imputed_val, _ = DeepImpute.predict(gene_expression, DI_model)

    # 5. graph clustering
    if latent_code.shape[0] <= 10000:
        label, _, _ = phenograph.cluster(latent_code)
    else:
        label = DeepImpute.scalable_cluster(latent_code)

    # evaluate
    ARI = adjusted_rand_score(label, label_ground_truth)
    print(ARI)

    X_embedded = TSNE(n_components=2).fit_transform(latent_code)

    # visualization of the subpopulation using tSNE
    plt.figure()
    for i in range(5):
        idx = np.nonzero(label == i)[0]
        plt.scatter(X_embedded[idx, 0], X_embedded[idx, 1])
    plt.show()