Exemple #1
0
def _cluster_and_plot(adata, group_keys):
    """Cluster ``adata`` in place and return the plots in PDF page order."""
    # do the clustering
    sc.tl.pca(adata)
    sc.pp.neighbors(adata)
    sc.tl.louvain(adata)

    # plot louvain on pca
    figures = [sc.pl.pca(adata, color="louvain", return_fig=True)]

    # pca_variance_ratio is called without return_fig in the original code and
    # its result was None; keep the figure it drew by grabbing the current one
    # right after the call instead of whatever happens to be active later.
    sc.pl.pca_variance_ratio(adata, show=False)
    figures.append(plt.gcf())

    # compute the embedding once; every UMAP plot below reuses it
    sc.tl.umap(adata, min_dist=0.1, spread=5)

    # plot louvain on umap
    figures.append(
        sc.pl.umap(adata, color="louvain", return_fig=True, palette='tab20'))

    # plot sample on umap
    figures.append(
        sc.pl.umap(adata, color='sample', return_fig=True, palette='tab20'))

    # plot also on groups if indicated (None and empty are both "no groups")
    for key in group_keys or ():
        figures.append(sc.pl.pca(adata, color=key, return_fig=True))
        figures.append(
            sc.pl.umap(adata, color=key, return_fig=True, palette='tab20'))

    return figures


def clustercells(adata, outdir, group_keys):
    """Cluster the cells in ``adata`` and write plots and tables to ``outdir``.

    Runs PCA, neighbor graph construction, Louvain clustering and UMAP on
    ``adata`` (modified in place), then writes:

    * ``clustering.pdf`` -- one page per plot: PCA coloured by louvain, PCA
      variance ratio, UMAP coloured by louvain, UMAP coloured by ``sample``,
      and a PCA + UMAP pair for every key in ``group_keys``
    * ``barcode_cluster.tsv`` -- ``adata.obs``
    * ``var_cluster.tsv`` -- ``adata.var``
    * ``cluster`` (Matrix Market) -- ``adata.X``
    * ``anndata.loom`` -- the full object

    Parameters
    ----------
    adata
        Annotated data object; ``adata.obs`` must contain a ``sample`` column.
    outdir
        Existing directory that receives all output files.
    group_keys
        Iterable of ``adata.obs`` column names to colour extra plots by, or
        ``None`` / empty for no extra plots.

    Returns
    -------
    The same ``adata`` object, with clustering results attached.
    """
    # The PdfPages handle gets its own name: rebinding ``pdf`` inside the
    # function would make it a local and break the module lookup. The
    # context manager closes the file even when a later step raises.
    with pdf.PdfPages(outdir + "/clustering.pdf") as pages:
        figures = _cluster_and_plot(adata, group_keys)

        # store barcodes with their cluster assignment
        adata.obs.to_csv(outdir + "/barcode_cluster.tsv", sep="\t")

        # store genes
        adata.var.to_csv(outdir + "/var_cluster.tsv", sep="\t")

        # store the expression matrix
        scipy.io.mmwrite(outdir + "/cluster", adata.X)

        # write the filtered anndata
        adata.write_loom(outdir + "/anndata.loom", write_obsm_varm=True)

        # save every figure, in the order they were created
        for figure in figures:
            pages.savefig(figure)

    return adata
Exemple #2
0
def _plot_cell_qc(adata, samples, min_counts_cell, min_genes_cell):
    """Draw the three per-cell QC panels (one colour per sample); return the figure."""
    fig, axs = plt.subplots(1, 3, figsize=(20, 5))
    colors = ['b', 'g', 'm', 'c', 'y']
    max_counts = adata.obs['total_counts'].max()
    # lower limits sit just below 1 because 0 cannot be shown on a log axis
    xmin, xmax = 8e-1, max_counts
    y0min, y0max = 8e-1, adata.obs['n_genes'].max()
    y1min, y1max = 8e-1, max_counts

    for i, sample in enumerate(samples):
        in_sample = adata.obs['sample'] == sample
        counts = adata.obs.loc[in_sample, 'total_counts']
        genes = adata.obs.loc[in_sample, 'n_genes']
        color = colors[i % len(colors)]

        # plot total counts against number of genes per barcode
        axs[0].scatter(counts.tolist(), genes.tolist(), c=color, s=2)

        # plot total counts per barcode, highest first
        axs[1].plot(range(len(counts)),
                    counts.sort_values(ascending=False).tolist(),
                    c=color)

        # plot total counts as histogram
        axs[2].hist(counts.tolist(), bins=200, color=color)

    # red dashed lines mark the filtering thresholds
    axs[0].plot([min_counts_cell, min_counts_cell], [y0min, y0max],
                linewidth=1,
                linestyle='--',
                color='r')
    axs[0].plot([xmin, xmax], [min_genes_cell, min_genes_cell],
                linewidth=1,
                linestyle='--',
                color='r')
    axs[0].set_xlabel('total counts')
    axs[0].set_ylabel('number of genes')
    axs[0].set_ylim([y0min, y0max])
    axs[0].set_xlim([xmin, xmax])
    axs[0].set_yscale('log')
    axs[0].set_xscale('log')
    axs[0].set_title('Total counts and number of genes correlation')

    # NOTE(review): the x axis here is the barcode rank, yet its upper limit
    # is the maximum total count -- confirm this is intended.
    axs[1].plot([xmin, xmax], [min_counts_cell, min_counts_cell],
                linewidth=1,
                linestyle='--',
                color='r')
    axs[1].set_xlabel('barcodes')
    axs[1].set_ylabel('total counts')
    axs[1].set_ylim([y1min, y1max])
    axs[1].set_xlim([xmin, xmax])
    axs[1].set_yscale('log')
    axs[1].set_xscale('log')
    axs[1].set_title('Total counts sorted')

    y2min, y2max = axs[2].get_ylim()
    axs[2].plot([min_counts_cell, min_counts_cell], [y2min, y2max],
                linewidth=1,
                linestyle='--',
                color='r')
    axs[2].set_xlabel('Total counts')
    axs[2].set_ylabel('Number of barcodes')
    axs[2].set_yscale('log')
    axs[2].set_xscale('linear')
    # this panel is the histogram, not the sorted curve of the middle panel
    axs[2].set_title('Total counts histogram')

    return fig


def _plot_gene_qc(adata, min_cells_gene):
    """Draw the two per-gene QC panels; adds ``mean_count_per_expressing_cell`` to ``adata.var``."""
    fig, axs = plt.subplots(1, 2, figsize=(20, 8))

    # plot total counts against number of barcodes per gene
    axs[0].scatter(adata.var['total_counts'].tolist(),
                   adata.var['n_cells_by_counts'].tolist(),
                   c='g',
                   s=2)
    _, xmax = axs[0].get_xlim()
    _, ymax = axs[0].get_ylim()
    # plot from below 1, 0 not possible due to log axis
    xmin = 8e-1
    ymin = 8e-1
    axs[0].set_ylim([ymin, ymax])
    axs[0].set_xlim([xmin, xmax])
    axs[0].plot([xmin, xmax], [min_cells_gene, min_cells_gene],
                linewidth=1,
                linestyle='--',
                color='r')
    axs[0].set_xlabel('total counts')
    axs[0].set_ylabel('number of cells')
    axs[0].set_yscale('log')
    axs[0].set_xscale('log')
    axs[0].set_title('Total counts and number of cells correlation')

    # plot counts per gene normalized to presence in number of cells
    adata.var['mean_count_per_expressing_cell'] = adata.var[
        'total_counts'] / adata.var['n_cells_by_counts']
    axs[1].hist(adata.var['mean_count_per_expressing_cell'].sort_values(
        ascending=False),
                bins=50,
                color='g')
    axs[1].set_xlabel('Mean counts per expressing cell')
    axs[1].set_ylabel('Number of genes')
    axs[1].set_yscale('log')
    axs[1].set_xscale('linear')
    axs[1].set_title('Mean counts per gene per cell')

    return fig


def preprocess(adata,
               outdir,
               highlyvar,
               min_counts_cell=1000,
               min_genes_cell=200,
               min_cells_gene=3):
    """Filter, normalize and log-transform ``adata``; write QC plots to a PDF.

    Steps, all applied to ``adata`` in place:

    1. compute QC metrics and plot per-cell statistics per sample
    2. filter cells on number of genes and on total counts
    3. plot per-gene statistics, then filter genes on number of cells
    4. normalize each cell to the 90th percentile of total counts (rounded up
       to the nearest 100) and apply ``log1p``
    5. optionally keep only the ``highlyvar`` most variable genes, regress
       out ``n_counts`` and scale

    Both QC figures are written to ``<outdir>/preprocess_stats.pdf``.

    Parameters
    ----------
    adata
        Annotated data object; ``adata.obs`` must contain a ``sample`` column.
    outdir
        Existing directory that receives ``preprocess_stats.pdf``.
    highlyvar
        Number of highly variable genes to keep. Selection is skipped unless
        ``0 < highlyvar < number of genes`` after filtering.
    min_counts_cell
        Minimum total counts for a cell to be kept. If the 90th percentile of
        total counts does not exceed it, the median is used as cut-off instead.
    min_genes_cell
        Minimum number of genes for a cell to be kept. If the data has no more
        than this many genes, half the number of genes is used instead.
    min_cells_gene
        Minimum number of cells that must express a gene for it to be kept.

    Returns
    -------
    The same ``adata`` object after filtering and transformation.
    """
    print('\nMinumum counts per cell: {}'.format(min_counts_cell))
    print('Minimum genes per cell: {}'.format(min_genes_cell))
    print('Minimum cells expressing a gene: {}'.format(min_cells_gene))

    # order of first appearance keeps the colour per sample stable between
    # runs (a set of strings iterates in a hash-dependent order)
    samples = adata.obs['sample'].unique().tolist()

    # The PdfPages handle gets its own name: rebinding ``pdf`` inside the
    # function would make it a local and break the module lookup. The
    # context manager closes the file even when a later step raises.
    with pdf.PdfPages(outdir + "/preprocess_stats.pdf") as pages:
        # make figure for basic filtering
        sc.pp.calculate_qc_metrics(adata, inplace=True)
        # NOTE(review): .A1 exists on numpy matrices only, so this presumably
        # expects adata.X to be a scipy sparse matrix -- confirm.
        adata.obs['n_genes'] = (adata.X > 0).sum(axis=1).A1

        pages.savefig(
            _plot_cell_qc(adata, samples, min_counts_cell, min_genes_cell))

        # filter minimum number of genes per cell
        print('\nAdata dimensions before filtering cells:\n{}'.format(
            adata.shape))
        if adata.shape[1] <= min_genes_cell:
            print('WARNING: few genes only {}'.format(adata.shape[1]))
            sc.pp.filter_cells(adata,
                               min_genes=math.floor(adata.shape[1] / 2))
        else:
            sc.pp.filter_cells(adata, min_genes=min_genes_cell)

        # filter minimum number of counts per cell
        total_counts = adata.obs['total_counts'].values
        if math.ceil(np.quantile(total_counts, .90)) <= min_counts_cell:
            # if 10% or less of cells are kept at threshold set threshold to
            # keep 50%
            median_cutoff = math.ceil(np.quantile(total_counts, .50))
            print(
                'WARNING: few counts per cell keep top 50% with cut off at: {} counts'
                .format(median_cutoff))
            sc.pp.filter_cells(adata, min_counts=median_cutoff)
        else:
            sc.pp.filter_cells(adata, min_counts=min_counts_cell)
        print('\nAdata dimensions after filtering cells:\n{}'.format(
            adata.shape))

        for sample in samples:
            in_sample = adata.obs['sample'] == sample
            print('\nMedian counts sample {} per cell: {}'.format(
                sample, adata.obs.loc[in_sample, 'total_counts'].median()))

        # plot counts per gene
        pages.savefig(_plot_gene_qc(adata, min_cells_gene))

        # filter min of cells per gene
        sc.pp.filter_genes(adata, min_cells=min_cells_gene)
        print('\nAdata dimensions after filtering genes:\n{}'.format(
            adata.shape))

        # normalize and log transform (should we use normalize per cell?)
        # round 90th quantile up to the nearest 100
        ntotal = math.ceil(
            np.quantile(adata.obs['total_counts'].values, .90) / 100) * 100
        sc.pp.normalize_total(adata,
                              target_sum=ntotal,
                              exclude_highly_expressed=True,
                              inplace=True)
        print('\nNormalize to value: {}'.format(ntotal))
        sc.pp.log1p(adata)

        if 0 < highlyvar < adata.shape[1]:
            print('\nSelection of highly variable genes')
            # set raw data before highly variable genes selection
            adata.raw = adata
            sc.pp.highly_variable_genes(adata,
                                        n_top_genes=highlyvar,
                                        inplace=True,
                                        subset=True)
            sc.pp.regress_out(adata, ['n_counts'])
            sc.pp.scale(adata, max_value=10)
            print(
                "Adata dimensions after selection highly variable genes/TCCs: {}".
                format(adata.shape))
        else:
            print('No selection for highly variable genes')

    return adata
Exemple #3
0
import matplotlib.pyplot as plt
import matplotlib.backends.backend_pdf as pdf
import numpy


def plot_logistic_map(r: float, x: float, iterations: int):
    """Plot successive iterates of the logistic map on the current figure.

    Starting from ``x``, the update ``x <- r * (x - x**2)`` is applied
    ``iterations`` times and each new value is plotted against its
    zero-based iteration index. The y axis is labelled with ``r``.

    Returns the ``matplotlib.pyplot`` module so the caller can fetch the
    current figure from it.
    """
    trajectory = []
    for _ in range(iterations):
        x = r * (x - x ** 2)
        trajectory.append(x)
    steps = list(range(iterations))

    plt.xlabel("Iterations")
    plt.ylabel(f"R = {r}")
    plt.plot(steps, trajectory)

    return plt


if __name__ == "__main__":
    # One page per growth rate r in 0.1, 0.2, ... (arange stops before 5.0).
    # The context manager closes the PDF even if a plot fails, and the
    # separate name keeps the ``pdf`` module alias intact.
    with pdf.PdfPages("logistic_map_output.pdf") as pages:
        for r in numpy.arange(0.1, 5.0, 0.1):
            pages.savefig(plot_logistic_map(r, .02, 30).gcf())
            # clear the shared figure so the next page starts empty
            plt.clf()
Exemple #4
0
# Violin plot of the memory used by each individual chunk, split into
# passed and failed chunks, drawn on a newly added subplot of ``fig``.
axis6 = fig.add_subplot(515)
job_mem = indiv_pass_mem + indiv_fail_mem
job_type2 = (['Passed chunks'] * len(indiv_pass_mem) +
             ['Failed chunks'] * len(indiv_fail_mem))
df2 = pd.DataFrame(list(zip(job_mem, job_type2)),
                   columns=['Individual_process_mem(GB)', 'Process_status'])
my_pal2 = {"Passed chunks": "g", "Failed chunks": "r"}

# Fix the category order explicitly (order of first appearance) so that the
# violins, the medians and the counts below are all indexed the same way.
status_order = df2['Process_status'].unique().tolist()
ax2 = sns.violinplot(x="Process_status",
                     y="Individual_process_mem(GB)",
                     data=df2,
                     order=status_order or None,
                     palette=my_pal2)
ax2.set_title('Individual chunk processed mem', fontsize=14)

# Calculate number of obs per group & median to position labels.
# groupby sorts alphabetically and value_counts sorts by frequency, so both
# are reindexed to the plotted order before use.
mem_by_status = df2.groupby('Process_status')['Individual_process_mem(GB)']
medians2 = mem_by_status.median().reindex(status_order).values
nobs2 = [
    "n=" + str(count) for count in mem_by_status.size().reindex(status_order)
]

# Add it to the plot: one "n=..." label just above each violin's median.
# Uses this plot's own medians and positions rather than the ``medians`` /
# ``pos`` names that belong to a different plot.
for tick, (median, count_label) in enumerate(zip(medians2, nobs2)):
    ax2.text(tick,
             median + 0.03,
             count_label,
             horizontalalignment='center',
             size='medium',
             color='black',
             weight='semibold')

pdf.savefig(fig)
pdf.close()
import matplotlib.pyplot as plt
from math import sin, pi
import matplotlib.backends.backend_pdf as pdf
import numpy


def plot_sin_map(r: float, x: float, iterations: int):
    """Plot successive iterates of the sine map on the current figure.

    Starting from ``x``, the update ``x <- r / 4 * sin(pi * x)`` is applied
    ``iterations`` times and each new value is plotted against its
    zero-based iteration index. The y axis is labelled with ``r``.

    Returns the ``matplotlib.pyplot`` module so the caller can fetch the
    current figure from it.
    """
    trajectory = []
    for _ in range(iterations):
        x = r / 4 * sin(pi * x)
        trajectory.append(x)
    steps = list(range(iterations))

    plt.xlabel("Iterations")
    plt.ylabel(f"R = {r}")
    plt.plot(steps, trajectory)
    return plt


if __name__ == "__main__":
    # One page per parameter r in 0.1, 0.2, ... (arange stops before 5.0).
    # The context manager closes the PDF even if a plot fails, and the
    # separate name keeps the ``pdf`` module alias intact.
    with pdf.PdfPages("sin_map_output.pdf") as pages:
        for r in numpy.arange(0.1, 5.0, 0.1):
            pages.savefig(plot_sin_map(r, .02, 30).gcf())
            # clear the shared figure so the next page starts empty
            plt.clf()