Beispiel #1
0
def diffex_heatmap(expression,
                   genes,
                   clusters,
                   up,
                   ntop,
                   outdir,
                   label,
                   fdr_cutoff=0.01,
                   normed=False):
    """
    Generates gene expression heatmap of the top differentially specific
    genes for each cluster.

    A two-page pdf is written to `outdir`. The first page is the heatmap of
    the selected genes (rows) against cells (columns) ordered by cluster; the
    second page is a color strip of the cluster assignments in the same cell
    order.

    Parameters
    ----------
    expression : DataFrame
        DataFrame of expression counts. Its rows are re-indexed with
        `genes.ens` and its columns are selected with the integer cell
        ordering, so columns are presumably labeled 0..n_cells-1
        (TODO confirm against callers).
    genes : DataFrame
        Two column DataFrame of gene names. Its `ens` column becomes the row
        index of `expression`.
    clusters : array of int
        Cluster label of each cell. Indexed with boolean masks and integer
        arrays, so presumably a numpy array. Cells labeled -1 are assumed to
        be unclustered and are ordered after all other clusters.
    up : DataFrame
        The up DataFrame from binomial_test_cluster_vs_rest. The columns
        `cluster`, `fdr`, `log2_effect`, `gene` and `ens` are used; the
        integer after the last '.' in `cluster` is matched against the
        labels in `clusters`.
    ntop : int
        The maximum number of top genes per cluster to show in the heatmap.
        Genes are ranked by ascending fdr, then descending log2_effect. If a
        gene was also a top gene for a previous cluster, the next best gene
        with fdr <= `fdr_cutoff` is used instead. Genes whose name contains
        '-' are never selected.
    outdir : str
        Output directory for pdf
    label : str
        Prefix to prepend to the filename (followed by a '.'); may be empty.
    fdr_cutoff : float, optional (Default: 0.01)
        Do not include genes in heatmap with fdr > this value, even if they
        are in the top `ntop`.
    normed : bool, optional (Default: False)
        If False, each column of `expression` is scaled to sum to 1e4 and
        log2(x + 1) transformed before plotting. If True, `expression` is
        plotted as given.

    """
    nclusters = len(np.unique(clusters))
    # Relabel unclustered cells (assumed labeled with -1) with the largest
    # label so that they sort after every real cluster. Work on a copy so the
    # caller's array is not modified.
    replace_neg1_with = -1
    if -1 in np.unique(clusters):
        replace_neg1_with = np.max(clusters) + 1
        clusters = clusters.copy()
        clusters[clusters == -1] = replace_neg1_with

    expression = expression.set_index(genes.ens)

    # The numeric cluster id of every row of `up` does not depend on the
    # cluster being processed, so parse it once instead of once per cluster.
    up_cluster_ids = up.cluster.str.split('.').str[-1].astype(int)

    # get top (significantly) differentially specific genes per cluster
    top_genes, top_gene_names = [], []
    for c in np.unique(clusters):  # np.unique returns sorted labels
        if c == replace_neg1_with:
            # `up` still refers to the unclustered cells as -1
            c = -1
        my_diffex = up[up_cluster_ids == c].sort_values(
            by=['fdr', 'log2_effect'], ascending=[True, False])
        # filter gene names
        gene_name_mask = ~my_diffex.gene.str.contains('-')
        # only look at significant genes
        fdr_mask = my_diffex.fdr <= fdr_cutoff
        # don't add genes already on list
        unused_mask = ~my_diffex.ens.isin(top_genes)
        selected = my_diffex[gene_name_mask & fdr_mask & unused_mask].head(
            ntop)
        top_genes.extend(selected.ens.tolist())
        top_gene_names.extend(selected.gene.tolist())

    # order cells by cluster with mergesort (a stable sort), so cells keep
    # their original relative order within each cluster
    cell_order = np.argsort(clusters, kind='mergesort')
    if not normed:  # if not already normalized, normalize expression
        expression = np.log2(expression / expression.sum(axis=0) * 1e4 + 1)

    diffex_matrix = expression.loc[top_genes][cell_order]
    diffex_matrix.index = top_gene_names

    # plot heatmap of gene expression for top_N genes ordered by cluster
    # assignment
    outfile = '{}/{}pg.diffex.pdf'.format(outdir,
                                          label + '.' if label else '')
    mpl, plt, _ = _import_plotlibs()
    colors = get_cluster_cmap(nclusters, mpl)
    from matplotlib.backends.backend_pdf import PdfPages
    with PdfPages(outfile) as pdf:
        # page 1: expression heatmap, figure height grows with gene count
        fig, ax = plt.subplots()
        height_in = float(diffex_matrix.shape[0]) / 100. * 15.
        fig.set_size_inches(20, height_in)
        ax.pcolor(diffex_matrix.values, cmap='BuGn')
        # put one tick at the center of each gene row, first gene on top
        ax.set_yticks(np.arange(diffex_matrix.shape[0]) + 0.5, minor=False)
        ax.invert_yaxis()
        ax.tick_params(axis='both', which='major', labelsize=9)
        ax.set_yticklabels(diffex_matrix.index.tolist(), minor=False)
        pdf.savefig()
        plt.close()

        # page 2: cluster assignment of each cell, in heatmap column order
        fig, ax = plt.subplots()
        fig.set_size_inches(20, 1)
        cluster_cmap = mpl.colors.ListedColormap(colors)
        clusterids = clusters[cell_order]
        # the ids are duplicated so pcolor receives a 2-row grid
        ax.pcolor([clusterids, clusterids], cmap=cluster_cmap)
        pdf.savefig()
        plt.close()
Beispiel #2
0
def select_markers(counts,
                   window=25,
                   nstd=6,
                   t=0.15,
                   outdir='',
                   prefix='',
                   gene_names=None):
    """ Select marker genes from rolling-window dropout scores

    Procedure used in Levitin et al. 2019 and Szabo, Levitin et al.  2019.
    For selection method used in Yuan et al. 2018 and Mizrak et al. 2019, see
    `select_markers_static_bins`

    A gene is selected when its dropout score exceeds the smaller of the
    adaptive threshold (mean + `nstd` standard deviations of all scores) and
    the absolute threshold `t`.

    Parameters
    ----------
    counts : ndarray
        gene x cell count matrix
    window : int, (default 25)
        size of the rolling window, forwarded to the dropout score
        calculation
    nstd : float, (default 6)
        number of standard deviations above the mean score used for the
        adaptive threshold.  To force use of the hard threshold `t`, set to
        something really high.
    t : float (default 0.15)
        absolute threshold; upper bound on the threshold actually applied
    outdir: str, optional (Default: '')
        If given (not None and non-empty), directory to which the threshold
        info, marker indices, marker names and dropout curve are written
    prefix: str, optional (Default: '')
        If given, prefix for save filenames; trailing '.' are stripped and a
        single '.' is appended
    gene_names : pandas dataframe, optional
        ordered gene names and any other info, selected positionally by the
        marker indices.  If given (and `outdir` is given), used to write a
        file with marker gene names in addition to indices.

    Returns
    -------
    ix_passing : ndarray
        indices of selected genes


    TODO: refactor w/ select_markers_static_bins_unscaled to avoid repeated code
    """
    n_genes, n_cells = counts.shape[0], counts.shape[1]
    print("Found {} genes (with a nonzero count) in {} cells...".format(
        n_genes, n_cells))
    print("Calculating dropout scores...")
    dropout, means, scores = _dropout_scores(counts, window)

    # the applied threshold is whichever of the two candidates is lower
    adaptive_threshold = np.mean(scores) + nstd * np.std(scores)
    threshold = min(adaptive_threshold, t)
    if threshold == adaptive_threshold:
        template = ('Using adaptive threshold {adaptive_threshold}'
                    ' over absolute threshold {t}')
    else:
        template = ('Using absolute threshold {t}'
                    ' over adaptive threshold {adaptive_threshold}')
    print(template.format(adaptive_threshold=adaptive_threshold, t=t))

    ix_passing = np.nonzero(scores > threshold)[0]
    print('Found {} markers from dropout analysis'.format(len(ix_passing)))

    # nothing to write without an output directory
    if outdir is None or len(outdir) == 0:
        return ix_passing

    # normalize the prefix so it ends in exactly one '.'
    if len(prefix) > 0:
        my_prefix = prefix.rstrip('.') + '.'
    else:
        my_prefix = ''

    # record parameters and adaptive threshold
    print('Writing threshold info...')
    thresholdfile = '{}/{}dropout_threshold.txt'.format(outdir, my_prefix)
    summary = 'nstdev: {}\nadaptive:{}\nt: {}\n'.format(
        nstd, adaptive_threshold, t)
    with open(thresholdfile, 'w') as handle:
        handle.write(summary)

    # save marker indexes
    print('Saving marker gene indexes...')
    ixfile = '{}/{}marker_ix.txt'.format(outdir, my_prefix)
    np.savetxt(ixfile, ix_passing, fmt='%i')

    # save marker gene names if gene_names given
    if gene_names is not None:
        print('Saving marker gene names...')
        markerfile = '{}/{}markers.txt'.format(outdir, my_prefix)
        gene_names.iloc[ix_passing].to_csv(
            markerfile, sep='\t', header=None, index=None)

    # plot the dropout curve: all genes in black, selected markers in green
    print('Plotting dropout curve')
    # plotting libraries are imported lazily to avoid exceptions due to
    # matplotlib's backend in different contexts
    mpl, plt, sns = _import_plotlibs()
    from matplotlib.backends.backend_pdf import PdfPages
    pdffile = '{}/{}dropout_curve.pdf'.format(outdir, my_prefix)
    with PdfPages(pdffile) as pdf:
        plt.plot(means, dropout, 'ko',
                 means[ix_passing], dropout[ix_passing], 'go',
                 markersize=4)
        plt.ylim([-0.05, 1.05])
        plt.xlabel('log10(Mean Normalized Counts)')
        plt.ylabel('Fraction of Cells')
        pdf.savefig()
        plt.close()

    return ix_passing