Ejemplo n.º 1
0
def make_rseqc_summary_plots(rd_file, gc_file, do_qc=True, min_exonmap=60.0, max_three_prime_map=10.0):
    """Make rseqc summary plots.

    Draws two scatter plots side by side: the percentage of tags that
    map to exons, and the percentage of gene-body coverage that falls in
    the last ten coverage columns ("91" to "100").

    Args:
        rd_file: CSV file read with its first column as index; it is
            pivoted on the "sample", "Group" and "Tag_count" columns.
        gc_file: CSV file read with its first column as index; it must
            contain the columns "1" through "100".
        do_qc: when true, draw a QC threshold line in each plot and
            color the points by whether they pass the threshold.
        min_exonmap: QC threshold (percent) for the exon mapping plot.
        max_three_prime_map: QC threshold (percent) for the 3' end plot.

    Returns:
        dict with keys 'fig' (grid of both plots), 'uri' (data URIs of
        both input files) and 'file' (both input file names).
    """
    df_rd = pd.read_csv(rd_file, index_col=0)
    df_gc = pd.read_csv(gc_file, index_col=0)
    samples = list(df_gc.index)
    # Use tags for formula
    df = df_rd.pivot_table(columns=["Group"], values=["Tag_count"], index="sample")
    exon_tags = (df['Tag_count', "CDS_Exons"]
                 + df['Tag_count', "3'UTR_Exons"]
                 + df['Tag_count', "5'UTR_Exons"])
    df['Tag_count', "ExonMap"] = 100.0 * exon_tags / df['Tag_count', "Total_Assigned_Tags"]

    df.columns = df.columns.droplevel()
    df['i'] = list(range(0, len(df.index)))
    # NOTE(review): sample labels are taken from gc_file in file order;
    # this assumes they line up with the pivoted rd_file rows - confirm.
    df['samples'] = samples
    df_gc["three_prime_map"] = 100.0 * df_gc.loc[:, "91":"100"].sum(axis=1) / df_gc.loc[:, "1":"100"].sum(axis=1)
    df = pd.concat([df, df_gc], axis=1)

    colors = brewer["PiYG"][3]
    # Keys are the string form of the boolean "fails QC" flag.
    colormap = {'False': colors[0], 'True': colors[2]}
    source = ColumnDataSource(df)

    # Default tools, plot_config and tooltips
    TOOLS = "pan,box_zoom,box_select,lasso_select,reset,save,hover"
    plot_config = dict(plot_width=300, plot_height=300,
                       tools=TOOLS, title_text_font_size='12pt',
                       x_range=[0, len(samples)], y_range=[0, 105],
                       x_axis_type=None, y_axis_type="linear",
                       xaxis={'axis_label': "sample", 'major_label_orientation': np.pi/3, 'axis_label_text_font_size': '10pt'},
                       yaxis={'axis_label': "percent (%)", 'major_label_orientation': 1, 'axis_label_text_font_size': '10pt'})

    # Exonmap plot
    qc = QCArgs(x=[0, len(samples)],
                y=[min_exonmap, min_exonmap],
                line_dash=[2, 4]) if do_qc else None
    if do_qc:
        c1 = [colormap[str(failed)] for failed in df['ExonMap'] < min_exonmap]
    else:
        c1 = colors[0]
    p1 = scatterplot(x='i', y='ExonMap',
                     source=source, color=c1, qc=qc,
                     tooltips=[{'type': HoverTool, 'tips': [
                         ('Sample', '@samples'), ('ExonMap', '@ExonMap'), ]}],
                     title="Tags mapping to exons", **plot_config)

    # Fraction reads mapping to the 10% right-most end
    qc = QCArgs(x=[0, len(samples)],
                y=[max_three_prime_map, max_three_prime_map],
                line_dash=[2, 4]) if do_qc else None
    if do_qc:
        c2 = [colormap[str(failed)] for failed in df['three_prime_map'] > max_three_prime_map]
    else:
        c2 = colors[0]
    # The hover tip reports the value that is actually plotted here
    # (three_prime_map), not the exon mapping percentage.
    p2 = scatterplot(x='i', y='three_prime_map',
                     color=c2, source=source,
                     qc=qc,
                     tooltips=[{'type': HoverTool, 'tips': [
                         ('Sample', '@samples'), ('three_prime_map', '@three_prime_map'), ]}],
                     title="Reads mapping to 3' end", **plot_config)

    return {'fig': gridplot([[p1, p2]]),
            'uri': [data_uri(rd_file), data_uri(gc_file)],
            'file': [rd_file, gc_file]}
Ejemplo n.º 2
0
def make_qualimap_plots(qmglobals=None, coverage_per_contig=None):
    """Build the qualimap summary figures.

    Args:
        qmglobals: CSV file with the global mapping statistics, or None
            to skip the mapping summary figure.
        coverage_per_contig: CSV file with per-contig coverage, or None
            to skip the coverage figure.

    Returns:
        dict with the keys "fig", "file" and "uri"; each maps to a dict
        keyed by "coverage_per_contig" and "globals". A figure entry
        stays None when the matching input file was not given.
    """
    figures = {"coverage_per_contig": None, "globals": None}
    files = {"coverage_per_contig": coverage_per_contig, "globals": qmglobals}
    uris = {"coverage_per_contig": data_uri(coverage_per_contig), "globals": data_uri(qmglobals)}
    result = {"fig": figures, "file": files, "uri": uris}

    # Mapping summary: one dot series per read-count column.
    if qmglobals is not None:
        df_globals = pd.read_csv(qmglobals)
        df_globals["Sample"] = df_globals["Sample"].astype("str")
        read_max = max(df_globals["number of reads"])
        sample_names = sorted(list(set(df_globals["Sample"])))
        summary_fig = figure(
            y_range=[0, read_max],
            title="Mapping summary",
            title_text_font_size="12pt",
            plot_width=400,
            plot_height=400,
            x_range=sample_names,
        )
        read_columns = [
            "number of reads",
            "number of mapped reads",
            "number of duplicated reads",
            "number of unique reads",
        ]
        mdotplot(summary_fig, x="Sample", size=10, df=df_globals, alpha=0.5, y=read_columns)
        xaxis(summary_fig, axis_label="sample", major_label_orientation=np.pi / 3, axis_label_text_font_size="10pt")
        yaxis(summary_fig, axis_label="count", major_label_orientation=1, axis_label_text_font_size="10pt")
        figures["globals"] = summary_fig

    # Coverage per contig: one facet per sample, each with a slope-1 line.
    if coverage_per_contig is not None:
        df_contigs = pd.read_csv(coverage_per_contig, index_col=0)
        df_contigs["Sample"] = df_contigs["Sample"].astype("str")
        base_fig = figure(width=300, height=300)
        points(
            base_fig,
            x="chrlen_percent",
            y="mapped_bases_percent",
            df=df_contigs,
            glyph="text",
            text="chr",
            text_font_size="8pt",
        )
        main(base_fig, title_text_font_size="8pt")
        xaxis(base_fig, axis_label="Chromosome length of total (%)", axis_label_text_font_size="8pt")
        yaxis(base_fig, axis_label="Mapped bases of total (%)", axis_label_text_font_size="8pt")

        grid = facet_grid(
            base_fig,
            x="chrlen_percent",
            y="mapped_bases_percent",
            df=df_contigs,
            groups=["Sample"],
            width=300,
            height=300,
            share_x_range=True,
            share_y_range=True,
            title_text_font_size="12pt",
        )
        # Flatten the grid rows first, then decorate every panel.
        panels = []
        for row in grid.children:
            panels.extend(row)
        for panel in panels:
            abline(panel, x="chrlen_percent", y="mapped_bases_percent", df=df_contigs, slope=1)
        figures["coverage_per_contig"] = grid

    return result
Ejemplo n.º 3
0
def atacseq_summary(config, input, output, Cutadapt, Qualimap,
                    MarkDuplicates, InsertMetrics, AlignmentMetrics):
    """Collect the ATAC-seq QC figures and write the HTML summary.

    Each application object has its aggregate data read and is then
    asked for its plots; the figures are gathered in a nested dict that
    is rendered through the 'workflow_atacseq_qc.html' template and
    written to ``output.html``. ``config`` and ``AlignmentMetrics`` are
    accepted but not used here.
    """
    figures = {}

    Cutadapt.read_aggregate_data()
    figures['cutadapt'] = {'fig': Cutadapt.plot('cutadapt')[0]}

    Qualimap.read_aggregate_data()
    globals_fig = Qualimap.plot("Globals")[0]
    contig_fig = Qualimap.plot("Coverage_per_contig")[0]
    figures['qualimap'] = {'fig': {'globals': globals_fig,
                                   'coverage_per_contig': contig_fig}}

    # Insert metrics: the metrics plot followed by every histogram
    # panel, laid out three per row.
    InsertMetrics.read_aggregate_data()
    insert_panels = [InsertMetrics.plot('metrics')[0]]
    for row in InsertMetrics.plot("hist")[0].children:
        insert_panels.extend(row)
    insert_rows = []
    for start in range(0, len(insert_panels), 3):
        insert_rows.append(insert_panels[start:start + 3])
    figures['picard'] = {'InsertMetrics': {'atacseq': {'fig': gridplot(insert_rows)}}}

    MarkDuplicates.read_aggregate_data()
    dup_metrics = MarkDuplicates.plot('metrics')[0]
    dup_hist = MarkDuplicates.plot('hist')[0]
    dup_grid = gridplot([[dup_metrics, dup_hist]])
    figures['picard']['DuplicationMetrics'] = {'atacseq': {'fig': dup_grid}}

    figures['rulegraph'] = {'fig': input.rulegraph,
                            'uri': data_uri(input.rulegraph),
                            'target': 'atacseq_all'}

    # Write the resulting html
    template = Env.get_template('workflow_atacseq_qc.html')
    with open(output.html, "w") as handle:
        handle.write(static_html(template, template_variables=figures, css_raw=css_files))
Ejemplo n.º 4
0
def scrnaseq_qc(config, input, output, results=None, rsem=None, rpkmforgenes=None, **kwargs):
    """Do QC of scrnaseq and write the HTML report.

    Args:
        config: workflow configuration; reads ``config["samples"]`` and
            the 'report'/'annotation_url' and 'metadata' entries of
            ``config['scrnaseq.workflow']``.
        input: object providing ``globalconf`` and ``rulegraph``.
        output: object providing the ``html`` path to write.
        results: alignment results object; its aggregate data is read,
            filtered on 'SM' by the configured samples and plotted.
        rsem: optional quantification object; PCA plots are added when
            it is given and its ``run`` attribute is true.
        rpkmforgenes: optional quantification object, handled like
            ``rsem``.
        **kwargs: accepted for interface compatibility; unused here.
    """
    # Collect results - subset by samples if applicable
    results.read_aggregate_data()
    results.filter_aggregate_data('alignrseqc', 'SM', config["samples"])

    workflow_conf = config['scrnaseq.workflow']
    annotation_url = workflow_conf['report']['annotation_url']
    taptool_url = None
    if annotation_url is not None:
        taptool_url = annotation_url + "@gene_id"

    # Alignment stats: plot once so the figure and the table come from
    # the same plot invocation.
    align = results.plot('alignrseqc')[0]
    d = {'align': {'fig': align['fig'], 'table': align['table']}}

    # TODO: the Brennecke plot for rsem is currently disabled.

    # rsem plots
    if rsem is not None and rsem.run:
        rsem_pca = rsem.targets['pca'][0]
        d.update({'rsem': plot_pca(rsem_pca,
                                   workflow_conf['metadata'],
                                   rsem_pca.replace(".pca.csv", ".pcaobj.pickle"),
                                   taptool_url=taptool_url)})

    # rpkmforgenes plots
    if rpkmforgenes is not None and rpkmforgenes.run:
        rpkm_pca = rpkmforgenes.targets['pca'][0]
        d.update({'rpkmforgenes': plot_pca(rpkm_pca,
                                           workflow_conf['metadata'],
                                           rpkm_pca.replace(".pca.csv", ".pcaobj.pickle"),
                                           taptool_url=taptool_url)})

    d.update({'version': all_versions(), 'config': {'uri': data_uri(input.globalconf), 'file': input.globalconf}})
    d.update({'rulegraph': {'fig': input.rulegraph, 'uri': data_uri(input.rulegraph),
                            'target': 'scrnaseq_all'}})
    tp = Env.get_template('workflow_scrnaseq_qc.html')
    with open(output.html, "w") as fh:
        fh.write(static_html(tp, template_variables=d, css_raw=bootstrap_css_files, js_raw=bootstrap_js_files))
Ejemplo n.º 5
0
def make_picard_summary_plots(inputfiles, ncol=4):
    """Make picard summary plots for pairs of metrics/histogram files.

    Args:
        inputfiles: flat sequence alternating metrics file and histogram
            file; files are consumed in (metrics, hist) pairs and an
            unpaired trailing file is ignored.
        ncol: number of plots per row in each resulting grid.

    Returns:
        dict keyed by metrics label, then by the metrics file name
        without extension. Each entry holds 'uri' and 'file' lists
        (metrics first, histogram second when present) and the 'fig'
        grid plot.
    """
    d = {}
    TOOLS = "pan,box_zoom,wheel_zoom,box_select,lasso_select,resize,reset,save,hover"
    for metrics_file, hist_file in zip(inputfiles[0::2], inputfiles[1::2]):
        df_met = _read_metrics(metrics_file)
        df_hist = _read_metrics(hist_file)
        p1 = df_met.plot_metrics(tools=TOOLS)
        key = os.path.splitext(metrics_file)[0]
        # Create the per-file entry for every new key, not only when the
        # label is first seen; otherwise a second metrics file sharing a
        # label fails with KeyError below.
        by_key = d.setdefault(df_met.label, {})
        if key not in by_key:
            by_key[key] = {'uri': [data_uri(metrics_file)],
                           'file': [metrics_file]}
        entry = by_key[key]
        if df_hist is not None:
            p2 = df_hist.plot_hist(tools=TOOLS)
            entry['uri'].append(data_uri(hist_file))
            entry['file'].append(hist_file)
        else:
            p2 = []
        plist = p1 + p2
        entry['fig'] = gridplot([plist[i:i + ncol] for i in range(0, len(plist), ncol)])
    return d
Ejemplo n.º 6
0
def make_cutadapt_summary_plot(inputfile):
    """Plot the cutadapt read percentages per sample.

    Reads ``inputfile`` as CSV, treats the "sample" column as strings
    and draws the "read1_pct" and "read2_pct" columns as dots over the
    sorted sample names.

    Returns:
        dict with the figure ('fig'), the data URI of the input file
        ('uri') and the input file name ('file').
    """
    summary = pd.read_csv(inputfile)
    summary["sample"] = summary["sample"].astype("str")
    sample_names = sorted(list(set(summary["sample"])))

    tool_names = "pan,wheel_zoom,box_zoom,box_select,reset,save"
    fig = figure(
        tools=tool_names,
        width=400,
        height=400,
        x_range=sample_names,
        y_range=[0, 105],
        title="Cutadapt metrics",
        title_text_font_size='12pt',
    )
    mdotplot(fig, x="sample", y=["read1_pct", "read2_pct"], df=summary, size=10, alpha=0.5)

    x_options = {
        'axis_label': "sample",
        'major_label_orientation': np.pi/3,
        'axis_label_text_font_size': '10pt',
    }
    y_options = {
        'axis_label': "percent reads",
        'major_label_orientation': 1,
        'axis_label_text_font_size': '10pt',
    }
    xaxis(fig, **x_options)
    yaxis(fig, **y_options)

    result = {'fig': fig}
    result['uri'] = data_uri(inputfile)
    result['file'] = inputfile
    return result
Ejemplo n.º 7
0
def make_cutadapt_summary_plot(inputfile):
    """Plot the percentage of reads with adapter, grouped by sample and run.

    Reads ``inputfile`` as CSV and derives "read1_pct" and "read2_pct"
    as the adapter-containing read counts relative to the total number
    of processed read pairs, then draws both as a dot plot.

    Returns:
        dict with the figure ('fig'), the data URI of the input file
        ('uri') and the input file name ('file').
    """
    summary = pd.read_csv(inputfile)
    summary["read1_pct"] = 100.0 * summary["Read 1 with adapter"] / summary["Total read pairs processed"]
    summary["read2_pct"] = 100.0 * summary["Read 2 with adapter"] / summary["Total read pairs processed"]

    tool_names = "pan,box_zoom,box_select,lasso_select,reset,save,hover"
    hover_tips = [{'type': HoverTool, 'tips': [('Sample', '@sample'), ]}]
    x_axis_options = {
        'axis_label': "sample",
        'major_label_orientation': np.pi/3,
        'axis_label_text_font_size': '10pt',
    }
    y_axis_options = {
        'axis_label': "percent reads",
        'major_label_orientation': 1,
        'axis_label_text_font_size': '10pt',
    }
    fig = dotplot(
        y=["read1_pct", "read2_pct"],
        df=summary,
        groups=["sample", "run"],
        tooltips=hover_tips,
        plot_width=600,
        plot_height=600,
        tools=tool_names,
        title="Cutadapt metrics",
        title_text_font_size='12pt',
        xaxis=x_axis_options,
        yaxis=y_axis_options,
        x_axis_type=None,
        y_axis_type="linear",
        y_range=[0, 105],
        size=3,
    )
    return {'fig': fig, 'uri': data_uri(inputfile), 'file': inputfile}
def get_sphinx_report(config):
    """Assemble the VIPER analysis report as reStructuredText.

    Image files found under ``analysis/<token>/`` are embedded through
    ``data_uri`` as ``.. image::`` directives; sections whose images are
    missing keep their descriptive text but show no image.

    Args:
        config: mapping providing "comparisons" (iterable of comparison
            names) and "token" (sub-analysis name used in the paths and
            in the report title).

    Returns:
        The complete report text as a single string.
    """
    comps = config["comparisons"]
    git_commit_string = "XXXXXX"
    git_link = 'https://bitbucket.org/cfce/viper/commits/'
    #Check for .git directory
    if os.path.exists("viper/.git"):
        git_commit_string = subprocess.check_output(
            ["git", "--git-dir=viper/.git", "rev-parse", "--short", "HEAD"]
        ).decode('utf-8').strip()
        git_link = 'https://bitbucket.org/cfce/viper/commits/' + git_commit_string
    file_dict = {
        'align_report':
        "analysis/" + config["token"] + "/STAR/STAR_Align_Report.png",
        'rRNA_report':
        "analysis/" + config["token"] +
        "/STAR_rRNA/STAR_rRNA_Align_Report.png",
        'read_distrib':
        "analysis/" + config["token"] + "/RSeQC/read_distrib/read_distrib.png",
        'gb_cov_heatmap':
        "analysis/" + config["token"] +
        "/RSeQC/gene_body_cvg/geneBodyCoverage.heatMap.png",
        'gb_cov_curves':
        "analysis/" + config["token"] +
        "/RSeQC/gene_body_cvg/geneBodyCoverage.curves.png",
        'heatmapSF_plot':
        "analysis/" + config["token"] + "/plots/images/heatmapSF_plot.png",
        'heatmapSS_plot':
        "analysis/" + config["token"] + "/plots/images/heatmapSS_plot.png",
        'heatmapSS_cluster':
        "analysis/" + config["token"] + "/plots/images/heatmapSS_cluster.png",
        'DEsummary_plot':
        "analysis/" + config["token"] + "/diffexp/de_summary.png",
        'SNP_chr6':
        "analysis/" + config["token"] + "/plots/sampleSNPcorr_plot.chr6.png",
        'SNP_HLA':
        "analysis/" + config["token"] + "/plots/sampleSNPcorr_plot.hla.png",
        'SNP_genome':
        "analysis/" + config["token"] + "/plots/sampleSNPcorr_plot.genome.png",
        'FUSION_OUT':
        "analysis/" + config["token"] + "/STAR_Fusion/STAR_Fusion_Report.png"
    }
    # Keep only the images that exist, replacing each path by its data URI.
    file_dict = {
        file_token: data_uri(path)
        for file_token, path in file_dict.items()
        if os.path.isfile(path)
    }
    pca_png_list = []
    volcano_list = []
    SF_png_list = []
    gsea_list = []
    virusseq_out = "analysis/" + config[
        "token"] + "/virusseq/virusseq_summary.csv"
    cdr_cpk_plot = "analysis/cdr3/CPK.png"

    # The scree plot is appended last so it can be rendered separately
    # from the PC scatter plots below.
    for pca_plot in sorted(
            glob.glob("./analysis/" + config["token"] +
                      "/plots/images/pca_plot*.png")):
        if "pca_plot_scree.png" not in pca_plot:
            pca_png_list.append(data_uri(pca_plot))

    if (os.path.isfile("./analysis/" + config["token"] +
                       "/plots/images/pca_plot_scree.png")):
        pca_png_list.append(
            data_uri("./analysis/" + config["token"] +
                     "/plots/images/pca_plot_scree.png"))

    for volcano_plot in glob.glob("./analysis/" + config["token"] +
                                  "/plots/images/*_volcano.png"):
        volcano_list.append(data_uri(volcano_plot))

    for SF_plot in sorted(
            glob.glob("./analysis/" + config["token"] +
                      "/plots/images/heatmapSF_*_plot.png")):
        SF_png_list.append(data_uri(SF_plot))

    for comp in comps:
        tmp_f = "./analysis/%s/gsea/%s/%s.gene_set.enrichment.dotplot.png" % (
            config["token"], comp, comp)
        if (os.path.isfile(tmp_f)):
            gsea_list.append(data_uri(tmp_f))

    if pca_png_list:
        file_dict['pca_png_list'] = pca_png_list
    if volcano_list:
        file_dict['volcano_png_list'] = volcano_list
    if SF_png_list:
        file_dict['sf_png_list'] = SF_png_list
    if gsea_list:
        file_dict['gsea_png_list'] = gsea_list
    report = """
==========================================================================================
VIPER: Visualization Pipeline for RNAseq - {sub_analysis_token}
==========================================================================================


Alignment Summary
=================
    Raw reads were mapped or aligned to the reference organism using `STAR software`_.

    .. _STAR software: https://github.com/alexdobin/STAR


    The **uniquely mapped read counts** and the **total read counts** for all the samples are summarized in the following image. In most cases, more than 70% of the reads should uniquely map to the reference genome.
    Contamination, low quality sequencing data, or poor library contruction may result in <60% uniquely aligned reads.

""".format(sub_analysis_token=config["token"])
    if 'align_report' in file_dict:
        report += "\n\t.. image:: " + file_dict['align_report'] + "\n"

    report += "\n"

    report += """
Library Prep Quality Metrics
=============================
Read Distribution QC
^^^^^^^^^^^^^^^^^^^^^^^^^^
    This graph displays the disibution of reads mapped to **features** across the genome for each sample. Distribution profiles should be similar across samples.
    A sample with a siginficantly different distribution profile may indicate that it was initially a poor sample or that there was a problem during library preparation.
    **mRNAseq libraries will typically contain less than 20% intronic mapped reads** whereas **totalRNAseq libraries may have greater than 40% intronic mapped reads**.
"""

    if 'read_distrib' in file_dict:
        report += "\n\t.. image:: " + file_dict['read_distrib'] + "\n"

    report += "\n"
    report += """
rRNA removal QC
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    This graph displays the percentage of reads mapping to ribosomal RNA reference sequences. Most RNAseq library prep methods are designed to avoid sampling ribosomal RNAs which typically represent greater than 80% of total RNA. If rRNA removal was effective, less than 5% of the reads should map to rRNA sequences and for mRNA libraries fewer than 1%.
"""

    if 'rRNA_report' in file_dict:
        report += "\n\t.. image:: " + file_dict['rRNA_report'] + "\n"

    report += "\n"
    report += """
Genebody Coverage
^^^^^^^^^^^^^^^^^
    For accurate gene expression quantification, mapped reads should be evenly distributed across genebodies.
    Significantly skewed profiles (5' or 3') may introduce quantification bias and/or represent poor quality library preparation.\n
    For example, mRNAseq library preps typically use oligo-dT beads to capture mature transcripts and can be prone to 3' bias in genebody coverage if degraded RNA \\(RIN < 7\\) is used as input. This may result in inaccurate gene quantification and the following graphs will help diagnose.
    There are other prep methods that may result in 5' bias too. Ideally, coverage should be uniform across the genebody. The line plots should look like this: "∩"

    Figures generated using `RSeQC software`_.

    .. _RSeQC software: http://rseqc.sourceforge.net

    **Line Plot**
"""

    if 'gb_cov_curves' in file_dict:
        report += "\n\t.. image:: " + file_dict['gb_cov_curves'] + "\n"

    report += "\n"
    report += """
    **Heatmap**\n
    This graphic may facilitate identification of biased samples.\n
    Scale: Blue = 0 Pink =1

"""

    if 'gb_cov_heatmap' in file_dict:
        report += "\n\t.. image:: " + file_dict['gb_cov_heatmap'] + "\n"

    report += "\n"
    report += """
Experimental Quality Control
===============================

Principle Component Analysis
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    High dimensional expression data are mathmatically reduced to principle
    components that can be used to describe variation across samples in fewer dimensions to allow human interpretation.
    Principle component 1 \\(PC1\\) accounts for the most amount of variation across samples, PC2 the second most, and so on. These PC1 vs PC2 plots
    are colored by sample annotation to demontrate how samples cluster together \\(or not\\) in reduced dimensional space.
    For more detailed description of Princilple Component Analysis, start with `wikipedia`_.

    .. _wikipedia: https://en.wikipedia.org/wiki/Principal_component_analysis

"""

    if 'pca_png_list' in file_dict:
        pca_images = file_dict['pca_png_list']
        if len(pca_images) > 1:
            report += "\n\t.. image:: " + "\n\t.. image:: ".join(
                pca_images[:-1]) + "\n"
            report += "\n\t" + 'This plot indicates how much of the overall variance is described by the principle components in descending order.' + "\n\n\t.. image:: " + pca_images[-1] + "\n"
        else:
            # A single image is emitted directly; joining over the URI
            # string itself would split it into single characters.
            report += "\n\t.. image:: " + pca_images[0] + "\n"

    report += "\n"
    report += """
Sample-to-Sample Correlation Heatmap
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    This heatmap displays hierarchical clustering of spearman rank correlations across samples.
    Highly correlated samples will cluster together and be represented in red. Samples that do not correlate will be represented in blue.

"""

    if 'heatmapSS_plot' in file_dict:
        report += "\n\n\t.. image:: " + file_dict[
            'heatmapSS_plot'] + "\n\n\t" + 'Sample-to-Sample data matrix is /analysis/plots/heatmapSS.txt' + "\n"

    report += "\n"
    report += """
Sample-Feature Correlation Heatmap
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    This heatmap illustrates hierarchical clustering of samples based on the top 5 percent or roughly 1000 most variable genes or "features."
    The top colomn color annotations are presented to help identify how sample types, groups, or treatments are clustering together \\(or not\\).

"""

    if 'heatmapSF_plot' in file_dict:
        report += "\n\n\t.. image:: " + file_dict['heatmapSF_plot'] + "\n"

    if 'sf_png_list' in file_dict:
        report += "\n\t.. image:: " + "\n\t.. image:: ".join(
            file_dict['sf_png_list'][:]) + "\n"

    report += "\n\n\t" + 'What are *these* genes?' + "\n\n\t" + 'Data used to generate this sample-feature graphic are in /analysis/plots/heatmapSF.txt' + "\n"

    report += "\n"

    if 'FUSION_OUT' in file_dict:
        report += """
Fusion Summary
==============
"""
        report += "\n\t.. image:: " + file_dict['FUSION_OUT'] + "\n"

    report += "\n"

    report += """                
Differential Gene expression
============================
    Differential gene expression analysis was performed using both `limma`_ and `DESeq2`_.\n
    Full analysis output tables are are available in /analysis/diffexp/comparison_of_interest

    .. _limma: https://www.bioconductor.org/packages/3.3/bioc/vignettes/limma/inst/doc/usersguide.pdf

    .. _DESeq2: http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4302049/

    This summary image shows the number of genes that are up regulated and down regulated across each comparison at different adjusted P-value cut-offs.

"""

    if 'DEsummary_plot' in file_dict:
        report += "\n\n\t.. image:: " + file_dict['DEsummary_plot'] + "\n"

    report += "\n"
    report += """
Volcano Plots
^^^^^^^^^^^^^^
    Volcano plots are commonly used graphical representations of differentially expressed genes and statistical significance.
    These scatter plots show log2 fold change versus P-value for all genes detected. Each data point represents a gene. Genes are colored red if the log2 fold change is greater than one \\(log2fc > 1\\). Genes are colored blue if the log2 fold change is less than negative one \\(log2fc < -1\\).
    The plot title indicates the direction of the comparison.
    For example, "treatment_vs_control" indicates that genes colored red are up-regulated in the treatment condition compared to the control condition with a statistically significant P-value.

"""

    if 'volcano_png_list' in file_dict:
        report += "\n\n\t.. image:: " + "\n\n\t.. image:: ".join(
            file_dict['volcano_png_list'][:]) + "\n"

    report += "\n"
    report += """
SNP Plots
==========
    
"""
    if 'SNP_chr6' in file_dict:
        report += "\n"
        report += """
SNP - Chr6
^^^^^^^^^^^
"""
        report += "\n\n\t.. image:: " + file_dict['SNP_chr6'] + "\n"

    if 'SNP_HLA' in file_dict:
        report += "\n"
        report += """
SNP - HLA
^^^^^^^^^^^
"""
        report += "\n\n\t.. image:: " + file_dict['SNP_HLA'] + "\n"

    if 'SNP_genome' in file_dict:
        report += "\n"
        report += """
SNP - Genome-wide
^^^^^^^^^^^^^^^^^^
"""
        report += "\n\n\t.. image:: " + file_dict['SNP_genome'] + "\n"

    report += """
Pathway-Analysis
================

Gene-Ontology Annotation
========================
"""
    for comp in comps:
        report += "\n" + comp + "\n"
        report += "^" * len(comp) + "\n"
        go_png = "analysis/" + config[
            "token"] + "/plots/images/" + comp + "_goterm.up.png"
        if os.path.isfile(go_png):
            report += "\n\n\t.. image:: " + data_uri(go_png) + "\n"
        else:
            report += "\nInsufficient data\n"

    report += """
KEGG-Pathway Analysis
=====================
"""
    for comp in comps:
        report += "\n" + comp + "\n"
        report += "^" * len(comp) + "\n"
        cur_path = "analysis/" + config[
            "token"] + "/diffexp/" + comp + "/kegg_pathways/"
        path_list = glob.glob(cur_path + "*.png")
        if not path_list:
            report += "\nInsufficient data\n"
        else:
            # Only the first pathway plot is embedded; the remaining
            # ones are listed by name.
            report += "\n\n\t.. image:: " + data_uri(path_list[0]) + "\n"
            token = ",".join([
                os.path.basename(file_path) for file_path in path_list[1:]
            ]).replace(".png", "")
            if token:
                report += "\n" + "More pathway plots such as, " + token + " - can be found at " + cur_path + ".\n"

    #------------------------------------------------------------------------------
    # GSEA section
    #------------------------------------------------------------------------------
    report += """
GSEA
====
    Gene Set Enrichment Analysis was performed on the significant differentially expressed genes using the clusterProfiler R package
"""
    if 'gsea_png_list' in file_dict:
        report += "\n\n\t.. image:: " + "\n\n\t.. image:: ".join(
            file_dict['gsea_png_list'][:]) + "\n"
    report += "\n"

    #------------------------------------------------------------------------------
    # Virusseq section
    #------------------------------------------------------------------------------
    if os.path.isfile(virusseq_out):
        report += """
Virus-Seq Module Output
=======================
"""
        report += "\n" + get_sphinx_table(virusseq_out) + "\n"

    if os.path.isfile(cdr_cpk_plot):
        report += """
CDR3 analysis (using trust v2.4.1)
==================================
"""
        report += "\n\n\t.. image:: " + data_uri(cdr_cpk_plot) + "\n"

    report += "\n\n**This report is generated using VIPER version** [ `" + git_commit_string + "`_ ].\n"
    report += "\t.. _" + git_commit_string + ': ' + git_link + "\n\n"
    report += "\n\n**To cite VIPER:\nCornwell M, Vangala M, Taing L, Herbert Z, Köster J, Li B, Sun H, Li T, Zhang J, Qiu X, Pun M, Jeselsohn R, Brown M, Liu XS, Long HW. VIPER: Visualization Pipeline for RNA-seq, a Snakemake workflow for efficient and complete RNA-seq analysis. BMC Bioinformatics. 2018 Apr 12; 19(1):135.**\n\n"
    return report + "\n"
Ejemplo n.º 9
0
def get_sphinx_report(comps):
    """Assemble the VIPER summary report as a reStructuredText string.

    Every plot is looked up relative to the current working directory
    (``analysis/...``). Plots that exist are embedded through ``data_uri``;
    plots that are missing are silently left out of their section.

    Args:
        comps: iterable of comparison names (strings). Each one gets a
            sub-section in the Gene-Ontology and KEGG parts of the report,
            with plots read from ``analysis/diffexp/<comp>/``.

    Returns:
        The complete report text (str).
    """
    candidate_plots = {
        'align_report': "analysis/STAR/STAR_Align_Report.png",
        'rRNA_report': "analysis/STAR_rRNA/STAR_rRNA_Align_Report.png",
        'read_distrib': "analysis/RSeQC/read_distrib/read_distrib.png",
        'gb_cov_heatmap': "analysis/RSeQC/gene_body_cvg/geneBodyCoverage.heatMap.png",
        'gb_cov_curves': "analysis/RSeQC/gene_body_cvg/geneBodyCoverage.curves.png",
        'heatmapSF_plot': "analysis/plots/images/heatmapSF_plot.png",
        'heatmapSS_plot': "analysis/plots/images/heatmapSS_plot.png",
        'heatmapSS_cluster': "analysis/plots/images/heatmapSS_cluster.png",
        'DEsummary_plot': "analysis/diffexp/de_summary.png",
        'SNP_chr6': "analysis/plots/sampleSNPcorr_plot.chr6.png",
        'SNP_genome': "analysis/plots/sampleSNPcorr_plot.genome.png",
    }
    # Keep only the plots that exist on disk, already converted to data URIs.
    file_dict = {
        key: data_uri(path)
        for key, path in candidate_plots.items()
        if os.path.isfile(path)
    }

    # The scree plot is tracked separately from the other PCA plots so that
    # its caption is attached to it and only to it.
    scree_path = "./analysis/plots/images/pca_plot_scree.png"
    pca_png_list = [
        data_uri(pca_plot)
        for pca_plot in sorted(glob.glob("./analysis/plots/images/pca_plot*.png"))
        if "pca_plot_scree.png" not in pca_plot
    ]
    scree_uri = data_uri(scree_path) if os.path.isfile(scree_path) else None

    # glob returns files in arbitrary order; sort so the report is reproducible.
    volcano_list = [
        data_uri(volcano_plot)
        for volcano_plot in sorted(glob.glob("./analysis/plots/images/*_volcano.png"))
    ]
    SF_png_list = [
        data_uri(SF_plot)
        for SF_plot in sorted(glob.glob("./analysis/plots/images/heatmapSF_*_plot.png"))
    ]

    report = """
==============================================
VIPER: Visualization Pipeline for RNAseq
==============================================


Alignment Summary
=================
    Raw reads were mapped or aligned to the reference organism using `STAR software`_.

    .. _STAR software: https://github.com/alexdobin/STAR


    The **uniquely mapped read counts** and the **total read counts** for all the samples are summarized in the following image. In most cases, more than 70% of the reads should uniquely map to the reference genome.
    Contamination, low quality sequencing data, or poor library contruction may result in <60% uniquely aligned reads.

"""
    if 'align_report' in file_dict:
        report += "\n\t.. image:: " + file_dict['align_report'] + "\n"

    report += "\n"
    report += """
Library Prep Quality Metrics
=============================
Read Distribution QC
^^^^^^^^^^^^^^^^^^^^^^^^^^
    This graph displays the disibution of reads mapped to **features** across the genome for each sample. Distribution profiles should be similar across samples.
    A sample with a siginficantly different distribution profile may indicate that it was initially a poor sample or that there was a problem during library preparation.
    **mRNAseq libraries will typically contain less than 20% intronic mapped reads** whereas **totalRNAseq libraries may have greater than 40% intronic mapped reads**.
"""

    if 'read_distrib' in file_dict:
        report += "\n\t.. image:: " + file_dict['read_distrib'] + "\n"

    report += "\n"
    report += """
rRNA removal QC
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    This graph displays the percentage of reads mapping to ribosomal RNA reference sequences. Most RNAseq library prep methods are designed to avoid sampling ribosomal RNAs which typically represent greater than 80% of total RNA. If rRNA removal was effective, less than 5% of the reads should map to rRNA sequences and for mRNA libraries fewer than 1%.
"""

    if 'rRNA_report' in file_dict:
        report += "\n\t.. image:: " + file_dict['rRNA_report'] + "\n"

    report += "\n"
    # Backslashes before parentheses are doubled in the literals below so the
    # emitted text still contains "\(" / "\)" without relying on invalid
    # string escape sequences.
    report += """
Genebody Coverage
^^^^^^^^^^^^^^^^^
    For accurate gene expression quantification, mapped reads should be evenly distributed across genebodies.
    Significantly skewed profiles (5' or 3') may introduce quantification bias and/or represent poor quality library preparation.\n
    For example, mRNAseq library preps typically use oligo-dT beads to capture mature transcripts and can be prone to 3' bias in genebody coverage if degraded RNA \\(RIN < 7\\) is used as input. This may result in inaccurate gene quantification and the following graphs will help diagnose.
    There are other prep methods that may result in 5' bias too. Ideally, coverage should be uniform across the genebody. The line plots should look like this: "∩"

    Figures generated using `RSeQC software`_.

    .. _RSeQC software: http://rseqc.sourceforge.net

    **Line Plot**
"""

    if 'gb_cov_curves' in file_dict:
        report += "\n\t.. image:: " + file_dict['gb_cov_curves'] + "\n"

    report += "\n"
    report += """
    **Heatmap**\n
    This graphic may facilitate identification of biased samples.\n
    Scale: Blue = 0 Pink =1

"""

    if 'gb_cov_heatmap' in file_dict:
        report += "\n\t.. image:: " + file_dict['gb_cov_heatmap'] + "\n"

    report += "\n"
    report += """
Experimental Quality Control
===============================

Principle Component Analysis
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    High dimensional expression data are mathmatically reduced to principle
    components that can be used to describe variation across samples in fewer dimensions to allow human interpretation.
    Principle component 1 \\(PC1\\) accounts for the most amount of variation across samples, PC2 the second most, and so on. These PC1 vs PC2 plots
    are colored by sample annotation to demontrate how samples cluster together \\(or not\\) in reduced dimensional space.
    For more detailed description of Princilple Component Analysis, start with `wikipedia`_.

    .. _wikipedia: https://en.wikipedia.org/wiki/Principal_component_analysis

"""

    # One directive per PCA image. Joining over the list (never over a single
    # URI string) keeps a lone image from being split into characters.
    if pca_png_list:
        report += "\n\t.. image:: " + "\n\t.. image:: ".join(pca_png_list) + "\n"
    if scree_uri is not None:
        report += "\n\t" + 'This plot indicates how much of the overall variance is described by the principle components in descending order.' + "\n\n\t.. image:: " + scree_uri + "\n"

    report += "\n"
    report += """
Sample-to-Sample Correlation Heatmap
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    This heatmap displays hierarchical clustering of spearman rank correlations across samples.
    Highly correlated samples will cluster together and be represented in red. Samples that do not correlate will be represented in blue.

"""

    if 'heatmapSS_plot' in file_dict:
        report += "\n\n\t.. image:: " + file_dict['heatmapSS_plot'] + "\n\n\t" + 'Sample-to-Sample data matrix is /analysis/plots/heatmapSS.txt' + "\n"

    report += "\n"
    report += """
Sample-Feature Correlation Heatmap
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    This heatmap illustrates hierarchical clustering of samples based on the top 5 percent or roughly 1000 most variable genes or "features."
    The top colomn color annotations are presented to help identify how sample types, groups, or treatments are clustering together \\(or not\\).

"""

    if 'heatmapSF_plot' in file_dict:
        report += "\n\n\t.. image:: " + file_dict['heatmapSF_plot'] + "\n"

    if SF_png_list:
        report += "\n\t.. image:: " + "\n\t.. image:: ".join(SF_png_list) + "\n"

    report += "\n\n\t" + 'What are *these* genes?' + "\n\n\t" + 'Data used to generate this sample-feature graphic are in /analysis/plots/heatmapSF.txt' + "\n"

    report += "\n"

    report += """                
Differential Gene expression
============================
    Differential gene expression analysis was performed using both `limma`_ and `DESeq2`_.\n
    Full analysis output tables are are available in /analysis/diffexp/comparison_of_interest

    .. _limma: https://www.bioconductor.org/packages/3.3/bioc/vignettes/limma/inst/doc/usersguide.pdf

    .. _DESeq2: http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4302049/

    This summary image shows the number of genes that are up regulated and down regulated across each comparison at different adjusted P-value cut-offs.

"""

    if 'DEsummary_plot' in file_dict:
        report += "\n\n\t.. image:: " + file_dict['DEsummary_plot'] + "\n"

    report += "\n"
    report += """
Volcano Plots
^^^^^^^^^^^^^^
    Volcano plots are commonly used graphical representations of differentially expressed genes and statistical significance.
    These scatter plots show log2 fold change versus P-value for all genes detected. Each data point represents a gene. Genes are colored red if the log2 fold change is greater than one \\(log2fc > 1\\). Genes are colored blue if the log2 fold change is less than negative one \\(log2fc < -1\\).
    The plot title indicates the direction of the comparison.
    For example, "treatment_vs_control" indicates that genes colored red are up-regulated in the treatment condition compared to the control condition with a statistically significant P-value.

"""

    if volcano_list:
        report += "\n\n\t.. image:: " + "\n\n\t.. image:: ".join(volcano_list) + "\n"

    report += "\n"
    report += """
SNP Plots
==========
    
"""
    if 'SNP_chr6' in file_dict:
        report += "\n"
        report += """
SNP - Chr6
^^^^^^^^^^^
"""
        report += "\n\n\t.. image:: " + file_dict['SNP_chr6'] + "\n"

    if 'SNP_genome' in file_dict:
        report += "\n"
        report += """
SNP - Genome-wide
^^^^^^^^^^^^^^^^^^
"""
        report += "\n\n\t.. image:: " + file_dict['SNP_genome'] + "\n"

    report += """
Pathway-Analysis
================

Gene-Ontology Annotation
========================
"""
    for comp in comps:
        report += "\n" + comp + "\n"
        report += "^" * len(comp) + "\n"
        go_png = "analysis/diffexp/" + comp + "/" + comp + ".goterm.png"
        if os.path.isfile(go_png):
            report += "\n\n\t.. image:: " + data_uri(go_png) + "\n"
        else:
            report += "\nInsufficient data\n"

    report += """
KEGG-Pathway Analysis
=====================
"""
    for comp in comps:
        report += "\n" + comp + "\n"
        report += "^" * len(comp) + "\n"
        cur_path = "analysis/diffexp/" + comp + "/kegg_pathways/"
        # Sorted so that the embedded pathway (the first one) and the listing
        # of the remaining ones do not depend on filesystem order.
        path_list = sorted(glob.glob(cur_path + "*.png"))
        if not path_list:
            report += "\nInsufficient data\n"
        else:
            report += "\n\n\t.. image:: " + data_uri(path_list[0]) + "\n"
            token = ",".join([os.path.basename(file_path) for file_path in path_list[1:]]).replace(".png", "")
            if token:
                report += "\n" + "More pathway plots such as, " + token + " - can be found at " + cur_path + ".\n"

    return report + "\n"
Ejemplo n.º 10
0
# Append one report section per combination of scenario values; inside each
# section, embed one figure per value of the swept parameter.
# NOTE: `links` stays empty, so no attachments are passed to report().
links = {}

# reST snippet for a single embedded figure with its "<param> = <value>" caption.
figure_block = dedent('''
            .. figure:: {data}
               :scale: 50 %

               {param} = {value}

        ''')

for combo in product(*scenarios.values()):
    scenario = dict(zip(scenarios.keys(), combo))
    headline = ", ".join("{}={}".format(*item) for item in scenario.items())
    # Section title followed by an underline of matching length.
    text += "{}\n{}\n\n".format(headline, '-' * len(headline))
    for value in param_values:
        scenario[param] = value
        stem = tmpl.format(**scenario)
        png_path = os.path.join(plot_dir, stem + '.png')
        text += figure_block.format(param=param, value=value, data=data_uri(png_path))

report(text=text, path=snakemake.output.html, stylesheet='report.css', **links)
Ejemplo n.º 11
0
def make_star_alignment_plots(inputfile, do_qc=False, min_reads=200000, min_map=40, max_unmap=20):
    """Build the STAR alignment scatter plots, data table and QC table.

    Args:
        inputfile: CSV file read with its first column as the index; the
            index values are used as sample names.
        do_qc: when true, points are colored by pass/fail, a dashed
            threshold line is requested for each QC'ed plot, and a QC
            summary table is built.
        min_reads: samples with fewer input reads are flagged.
        min_map: samples with a lower uniquely-mapped percentage are flagged.
        max_unmap: samples with a higher unmapped percentage are colored as
            failing in the unmapped-reads plot (not part of the QC table).

    Returns:
        dict with keys 'fig' (plot layout), 'table' (DataTable),
        'qctable' (DataFrame, or None when do_qc is false), 'uri'
        (data URI of the input file) and 'file' (the input path).
    """
    df = pd.read_csv(inputfile, index_col=0)
    samples = list(df.index)
    n_samples = len(samples)

    # Currently hover tool and categorical variables don't play
    # nicely together in bokeh: see
    # https://github.com/bokeh/bokeh/issues/624
    # Workaround: plot against a numeric position and keep the sample
    # names in a separate column for the tooltips.
    df['i'] = list(range(n_samples))
    df['samples'] = samples
    df['mismatch_sum'] = df['Mismatch_rate_per_base__PCT'] + df['Deletion_rate_per_base'] + df['Insertion_rate_per_base']
    df['PCT_of_reads_unmapped'] = df['PCT_of_reads_unmapped:_other'] + df['PCT_of_reads_unmapped:_too_many_mismatches'] + df['PCT_of_reads_unmapped:_too_short']

    colors = brewer["PiYG"][3]
    # NOTE(review): the rseqc summary plots at the top of this file map
    # 'True' to colors[2]; colors[1] here may be unintended -- confirm.
    colormap = {'False' : colors[0], 'True' : colors[1]}

    def qc_colors(failing):
        """Return per-sample colors for a lazily evaluated boolean mask."""
        if not do_qc:
            return "blue"
        return [colormap[str(flag)] for flag in failing()]

    def qc_line(level):
        """Return the arguments for a dashed horizontal line at ``level``."""
        if not do_qc:
            return None
        return QCArgs(x=[0, n_samples], y=[level, level], line_dash=[2,4])

    def sample_tips(*extra):
        """Return a hover-tool spec: the sample name plus ``extra`` tips."""
        return [{'type': HoverTool, 'tips': [('Sample', '@samples')] + list(extra)}]

    source = ColumnDataSource(df)
    # Generate the table
    table = DataTable(
        source=source,
        columns=[
            TableColumn(field="samples", title="Sample"),
            TableColumn(field="Number_of_input_reads", title="Number of input reads"),
            TableColumn(field="Uniquely_mapped_reads_PCT", title="Uniquely mapped reads (%)"),
            TableColumn(field="Mismatch_rate_per_base__PCT", title="Mismatch rate per base (%)"),
            TableColumn(field="Insertion_rate_per_base", title="Insertion rate per base (%)"),
            TableColumn(field="Deletion_rate_per_base", title="Deletion rate per base (%)"),
            TableColumn(field="PCT_of_reads_unmapped", title="Unmapped reads (%)"),
        ],
        editable=False,
        width=1000,
    )

    # Default tools and plot configuration shared by all plots; the dict is
    # mutated between plots, so the statement order below matters.
    all_tools = "pan,wheel_zoom,box_zoom,box_select,lasso_select,reset,save,hover"
    plot_config = {
        'plot_width': 400,
        'plot_height': 400,
        'tools': all_tools,
        'title_text_font_size': '12pt',
        'x_axis_type': 'linear',
        'x_range': [0, n_samples],
        'xaxis': {'axis_label' : 'sample', 'axis_label_text_font_size' : '10pt', 'major_label_orientation' : np.pi/3},
        'yaxis': {'axis_label' : 'reads', 'axis_label_text_font_size' : '10pt', 'major_label_orientation' : np.pi/3},
    }

    # Number of input reads
    reads_colors = qc_colors(lambda: df['Number_of_input_reads'] < min_reads)
    reads_plot = scatterplot(
        x='i', y='Number_of_input_reads', source=source,
        color=reads_colors, qc=qc_line(min_reads),
        title="Number of input reads",
        tooltips=sample_tips(('Reads', '@Number_of_input_reads')),
        y_range=[0, max(df['Number_of_input_reads'])],
        y_axis_type="log",
        **plot_config)

    # Uniquely mapped reads
    # NOTE(review): this sets a top-level 'axis_label' key, whereas later
    # relabelling goes through plot_config['yaxis'] -- confirm intent.
    plot_config.update({'y_axis_type' : 'linear', 'axis_label' : 'percent (%)'})
    mapped_colors = qc_colors(lambda: df['Uniquely_mapped_reads_PCT'] < min_map)
    mapped_plot = scatterplot(
        x='i', y='Uniquely_mapped_reads_PCT', source=source,
        color=mapped_colors, qc=qc_line(min_map),
        title="Uniquely mapping reads",
        y_range=[0, 100],
        tooltips=sample_tips(('Pct_mapped', '@Uniquely_mapped_reads_PCT')),
        **plot_config)

    # Mapping reads in general
    unmapped_colors = qc_colors(lambda: df['PCT_of_reads_unmapped'] > max_unmap)
    unmapped_plot = scatterplot(
        x='i', y='PCT_of_reads_unmapped', source=source,
        color=unmapped_colors, qc=qc_line(max_unmap),
        title="Unmapped reads",
        y_range=[0, 100],
        tooltips=sample_tips(('Pct_unmapped', '@PCT_of_reads_unmapped')),
        **plot_config)

    # Mismatch/indel rate
    plot_config['tools'] = all_tools.replace("lasso_select,", "")
    plot_config['yaxis'].update({'axis_label' : 'Rate per base'})
    rates_plot = scatterplot(
        x='i',
        y=['Mismatch_rate_per_base__PCT', 'Insertion_rate_per_base', 'Deletion_rate_per_base'],
        color=["blue", "red", "green"], source=source,
        title="Mismatch and indel rates",
        tooltips=sample_tips(
            ('Mismatch rate per base', '@Mismatch_rate_per_base__PCT'),
            ('Insertion rate per base', '@Insertion_rate_per_base'),
            ('Deletion rate per base', '@Deletion_rate_per_base'),
        ),
        **plot_config)
    rates_select = rates_plot.select({'type': BoxSelectTool})
    rates_select.dimensions = ['width']

    # Plot sum
    plot_config['yaxis'].update({'axis_label' : 'Mismatch/indel sum'})
    sum_colors = qc_colors(lambda: df['mismatch_sum'] > 1.0)
    sum_plot = scatterplot(
        x='i', y='mismatch_sum', source=source,
        color=sum_colors, qc=qc_line(1.0),
        title="Mismatch / indel sum",
        tooltips=sample_tips(('Mismatch/indel rate per base', '@mismatch_sum')),
        **plot_config)
    sum_select = sum_plot.select({'type': BoxSelectTool})
    sum_select.dimensions = ['width']

    # Plot histogram of ratio
    # plot_config['tools'] = "pan,box_zoom,reset,save"
    # p6 = figure(title="Histogram of mismatch and indel rates", **plot_config)
    # hist, edges = np.histogram(df['mismatch_sum'], density=False, bins=50)
    # p6.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
    #    fill_color="#036564", line_color="#033649")
    # p6.xaxis.axis_label = "Mismatch/indel sum"
    # p6.yaxis.axis_label = "Count"

    if do_qc:
        # QC summary table: one boolean column per filter plus their union.
        qc_flags = {
            'samples': samples,
            'read_filter': df['Number_of_input_reads'] < min_reads,
            'map_filter': df['Uniquely_mapped_reads_PCT'] < min_map,
            'mismatch_filter': df['mismatch_sum'] > 1.0,
        }
        qc_flags['filter'] = qc_flags['read_filter'] | qc_flags['map_filter'] | qc_flags['mismatch_filter']
        df_qc = pd.DataFrame(data=qc_flags, index=df.samples)
    else:
        df_qc = None

    top_row = gridplot([[reads_plot, mapped_plot, unmapped_plot]])
    bottom_row = HBox(children=[gridplot([[rates_plot, sum_plot]])])
    return {
        'fig': VBox(children=[top_row, bottom_row]),
        'table': table,
        'qctable': df_qc,
        'uri': data_uri(inputfile),
        'file': inputfile,
    }