Example #1
# Imports assumed from the surrounding workflow (aplanat module paths
# may differ slightly); `read_files` is a workflow-local helper,
# sketched at the end of this example:
import argparse
import math

from aplanat import bars
from aplanat.report import WFReport
from aplanat.util import Colors


def main():
    """Run the entry point."""
    parser = argparse.ArgumentParser()
    parser.add_argument("report", help="Report output file")
    parser.add_argument("summary", help="Guppy demultiplexing summary file.")
    args = parser.parse_args()

    report = WFReport("Read Demultiplexing Report", "wf-demultiplex")

    section = report.add_section()
    section.markdown('''
### Summary
The chart below shows the number of reads found for each barcode.
''')
    df = read_files([args.summary])
    counts = df.value_counts(subset=['barcode_arrangement']) \
        .reset_index().sort_values(by=['barcode_arrangement']) \
        .rename(columns={0: 'count'})
    plot = bars.simple_bar(counts['barcode_arrangement'].astype(str),
                           counts['count'],
                           colors=[Colors.cerulean] * len(counts),
                           title='Number of reads per barcode.')
    plot.xaxis.major_label_orientation = math.pi / 2
    section.plot(plot)

    # write report
    report.write(args.report)
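
# `read_files` above is a workflow-local helper rather than part of
# aplanat. A minimal sketch, assuming it simply concatenates one or
# more tab-separated summary files (the real implementation may
# differ):
import pandas as pd


def read_files(summaries, sep='\t'):
    """Read and concatenate summary files into a single DataFrame."""
    return pd.concat(
        (pd.read_csv(fname, sep=sep) for fname in summaries),
        ignore_index=True)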
Example #2
# Imports assumed from the surrounding workflow (module paths may
# differ slightly):
import math

from aplanat import bars
from aplanat.util import Colors
import pandas as pd


def sample_read_counts(seq_summary, min_len=None, max_len=None):
    """Create a bar plot counting reads per sample.

    :param seq_summary: summary data from fastcat.
    :param min_len: minimum length.
    :param max_len: maximum length.
    """
    if min_len is not None:
        seq_summary = seq_summary.loc[(seq_summary['read_length'] > min_len)]
    if max_len is not None:
        seq_summary = seq_summary.loc[(seq_summary['read_length'] < max_len)]
    sample_counts = (pd.DataFrame(seq_summary['sample_name'].value_counts()).
                     sort_index().reset_index().rename(columns={
                         'index': 'sample',
                         'sample_name': 'count'
                     }))

    title = 'Number of reads per barcode'
    if min_len is not None or max_len is not None:
        t0, t1 = "", ""
        if min_len is not None:
            t0 = "{} < ".format(min_len)
        if max_len is not None:
            t1 = " > {}".format(max_len)
        title += " (filtered by {}length{}).".format(t0, t1)

    plot = bars.simple_bar(sample_counts['sample'].astype(str),
                           sample_counts['count'],
                           colors=[Colors.cerulean] * len(sample_counts),
                           title=title,
                           plot_width=None)
    plot.xaxis.major_label_orientation = math.pi / 2
    return plot
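
# Usage sketch (hypothetical data; assumes the pandas version this
# workflow was written against):
if __name__ == '__main__':
    demo = pd.DataFrame({
        'sample_name': ['barcode01', 'barcode01', 'barcode02'],
        'read_length': [350, 820, 510]})
    # keeps reads with 300 < length < 700, then counts reads per sample
    plot = sample_read_counts(demo, min_len=300, max_len=700)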
Example #3
# Assumes `from aplanat import bars`, as in the examples above.
def decorator_example(groups, counts):
    """Exemplify a decorator."""
    p = bars.simple_bar(groups, counts)
    return p
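
# The decorator itself is not shown above. A minimal sketch of one
# plausible shape (hypothetical, not the workflow's actual decorator):
# wrap a plotting function and restyle the figure it returns.
import functools
import math


def rotate_x_labels(func):
    """Rotate x-axis tick labels on the plot returned by `func`."""
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        plot = func(*args, **kwargs)
        plot.xaxis.major_label_orientation = math.pi / 2
        return plot
    return wrapper


# Applied to the example above, it would read:
#
# @rotate_x_labels
# def decorator_example(groups, counts):
#     ...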
Example #4
# Imports assumed from the surrounding workflow (module paths may
# differ slightly); `read_files` and `load_params` are workflow-local
# helpers (`load_params` is sketched at the end of this example):
import argparse
import math
import os

from aplanat import annot, bars, hist, lines, report
from aplanat.components import bcfstats, nextclade
from aplanat.util import Colors
from bokeh.layouts import gridplot, layout
from bokeh.models import Panel, Range1d, Tabs
import numpy as np
import pandas as pd


def main():
    """Run entry point."""
    parser = argparse.ArgumentParser()
    parser.add_argument("status", help="artic status file")
    parser.add_argument("output", help="Report output filename")
    parser.add_argument("--nextclade", help="nextclade json output file")
    parser.add_argument("--pangolin", help="pangolin CSV output file")
    parser.add_argument("--depths",
                        nargs='+',
                        required=True,
                        help="Depth summary files")
    parser.add_argument("--summaries",
                        nargs='+',
                        required=True,
                        help="Sequencing summary files")
    parser.add_argument("--bcftools_stats",
                        nargs='+',
                        required=True,
                        help="Outputs from bcftools stats")
    parser.add_argument("--genotypes",
                        nargs='+',
                        required=False,
                        help="Genotyping summary files")
    parser.add_argument("--min_cover",
                        default=20,
                        type=int,
                        help="Minimum locus coverage for variant call.")
    parser.add_argument("--min_len",
                        default=300,
                        type=int,
                        help="Minimum read length")
    parser.add_argument("--max_len",
                        default=700,
                        type=int,
                        help="Maximum read length")
    parser.add_argument(
        "--report_depth",
        default=100,
        type=int,
        help=("Depth at which to provide a coverage statistics, "
              "e.g. 76% of genome covered at `report_depth`"))
    parser.add_argument("--hide_coverage",
                        action="store_true",
                        help="Do not display coverage plots in report.")
    parser.add_argument("--hide_variants",
                        action="store_true",
                        help="Do not display variant summary in report.")
    parser.add_argument("--revision",
                        default='unknown',
                        help="git branch/tag of the executed workflow")
    parser.add_argument("--commit",
                        default='unknown',
                        help="git commit of the executed workflow")
    parser.add_argument("--params",
                        default=None,
                        help="A csv containing the parameter key/values")
    parser.add_argument(
        "--versions",
        help="Directory containing CSVs of name,version pairs.")
    args = parser.parse_args()
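    # Example invocation (hypothetical file names):
    #   python report.py artic_status.tsv report.html \
    #       --depths depths/*.txt --summaries summaries/*.txt \
    #       --bcftools_stats stats/*.txt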

    report_doc = report.WFReport("SARS-CoV-2 ARTIC Sequencing report",
                                 "wf-artic",
                                 revision=args.revision,
                                 commit=args.commit)

    section = report_doc.add_section()
    section.markdown('''
### Read quality control

This section displays basic QC metrics indicating read data quality.
''')
    # read length summary
    seq_summary = read_files(args.summaries)
    total_bases = seq_summary['read_length'].sum()
    mean_length = total_bases / len(seq_summary)
    median_length = np.median(seq_summary['read_length'])
    datas = [seq_summary['read_length']]
    length_hist = hist.histogram(datas,
                                 colors=[Colors.cerulean],
                                 binwidth=50,
                                 title="Read length distribution.",
                                 x_axis_label='Read Length / bases',
                                 y_axis_label='Number of reads',
                                 xlim=(0, 2000))
    length_hist = annot.marker_vline(length_hist,
                                     args.min_len,
                                     label="Min: {}".format(args.min_len),
                                     text_baseline='bottom',
                                     color='grey')
    length_hist = annot.marker_vline(length_hist,
                                     args.max_len,
                                     label="Max: {}".format(args.max_len),
                                     text_baseline='top')
    length_hist = annot.subtitle(
        length_hist,
        "Mean: {:.0f}. Median: {:.0f}".format(mean_length, median_length))

    datas = [seq_summary['mean_quality']]
    mean_q, median_q = np.mean(datas[0]), np.median(datas[0])
    q_hist = hist.histogram(datas,
                            colors=[Colors.cerulean],
                            bins=100,
                            title="Read quality score",
                            x_axis_label="Quality score",
                            y_axis_label="Number of reads",
                            xlim=(4, 25))
    q_hist = annot.subtitle(
        q_hist, "Mean: {:.0f}. Median: {:.0f}".format(mean_q, median_q))

    # barcode count plot
    good_reads = seq_summary.loc[(seq_summary['read_length'] > args.min_len)
                                 & (seq_summary['read_length'] < args.max_len)]
    barcode_counts = (pd.DataFrame(good_reads['sample_name'].value_counts()).
                      sort_index().reset_index().rename(columns={
                          'index': 'sample',
                          'sample_name': 'count'
                      }))

    bc_counts = bars.simple_bar(barcode_counts['sample'].astype(str),
                                barcode_counts['count'],
                                colors=[Colors.cerulean] * len(barcode_counts),
                                title=('Number of reads per barcode '
                                       '(filtered by {} < length < {})'.format(
                                           args.min_len, args.max_len)),
                                plot_width=None)
    bc_counts.xaxis.major_label_orientation = math.pi / 2
    section.plot(
        layout([[length_hist, q_hist], [bc_counts]],
               sizing_mode="stretch_width"))

    section = report_doc.add_section()
    section.markdown("""
### ARTIC analysis status

The panel below lists samples which failed to produce
results from the primary ARTIC analysis. Samples not listed here were analysed
successfully, but may still contain inconclusive or invalid results. See the
following sections for further indications of failed or inconclusive results.
""")
    status = pd.read_csv(args.status, sep='\t')
    failed = status.loc[status['pass'] == 0]
    if len(failed) == 0:
        fail_list = "All samples analysed successfully"
    else:
        fail_list = failed['sample'].str.cat(sep=', ')
    section.markdown("""
```{}```
""".format(fail_list))
    fail_percentage = int(100 * len(failed) / len(status))
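    # e.g. 2 failures among 16 samples: int(100 * 2 / 16) == 12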
    classes = ['Success', 'Analysis Failed']
    values = [100 - fail_percentage, fail_percentage]
    colors = ['#54B8B1', '#EF4135']
    plot = bars.single_hbar(values,
                            classes,
                            colors,
                            title="Completed analyses",
                            x_axis_label="%age Samples")
    plot.x_range = Range1d(0, 140)
    section.plot(plot)

    if not args.hide_coverage:
        section = report_doc.add_section()
        section.markdown('''
### Genome coverage

The plots below show depth of coverage from the data used within the
ARTIC analysis, coloured by amplicon pool. Variant filtering during
the ARTIC analysis mandates a minimum coverage of at least {}X at
variant/genotyping loci for a call to be made.

***NB: to better display all possible data, the depth axes of the plots below
are not tied between plots for different samples. Care should be taken when
comparing depth across samples.***
'''.format(args.min_cover))

        # depth summary by amplicon pool
        df = read_files(args.depths)
        plots_pool = list()
        plots_orient = list()
        plots_combined = list()
        depth_lim = args.report_depth
        for sample in sorted(df['sample_name'].unique()):
            bc = df['sample_name'] == sample
            depth = df[bc].groupby('pos').sum().reset_index()
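            # percentage of reference positions covered at or above
            # the reporting threshold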
            depth_thresh = \
                100*(depth['depth'] >= depth_lim).sum() / len(depth['depth'])
            depth_mean = depth['depth'].mean()

            # total depth plot
            # plot a line first to get aplanat niceties
            p = lines.line(
                [depth['pos']],
                [depth['depth']],
                colors=[Colors.cerulean],
                title="{}: {:.0f}X, {:.1f}% > {}X".format(
                    sample, depth_mean, depth_thresh, depth_lim),
                height=250,
                width=400,
                x_axis_label='position',
                y_axis_label='depth',
            )
            p.varea(x=depth['pos'],
                    y1=0.1,
                    y2=depth['depth'],
                    fill_color=Colors.cerulean)
            plots_combined.append(p)

            # fwd/rev
            xs = [depth['pos'], depth['pos']]
            ys = [depth['depth_fwd'], depth['depth_rev']]
            names = ['fwd', 'rev']
            colors = [Colors.dark_gray, Colors.verdigris]

            p = lines.line(xs,
                           ys,
                           colors=colors,
                           names=names,
                           title="{}: {:.0f}X, {:.1f}% > {}X".format(
                               sample, depth_mean, depth_thresh, depth_lim),
                           height=250,
                           width=400,
                           x_axis_label='position',
                           y_axis_label='depth')
            for x, y, name, color in zip(xs, ys, names, colors):
                p.varea(x=x,
                        y1=0,
                        y2=y,
                        legend_label=name,
                        fill_color=color,
                        alpha=0.7,
                        muted_color=color,
                        muted_alpha=0.2)
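            # 'mute' fades a series when its legend entry is clicked,
            # rather than hiding it entirely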
            p.legend.click_policy = 'mute'
            plots_orient.append(p)

            # primer set plot
            pset = df['primer_set']
            xs = [df.loc[(pset == i) & bc]['pos'] for i in (1, 2)]
            ys = [df.loc[(pset == i) & bc]['depth'] for i in (1, 2)]
            names = ['pool-1', 'pool-2']
            colors = [Colors.light_cornflower_blue, Colors.feldgrau]

            p = lines.line(xs,
                           ys,
                           colors=colors,
                           names=names,
                           title="{}: {:.0f}X, {:.1f}% > {}X".format(
                               sample, depth_mean, depth_thresh, depth_lim),
                           height=250,
                           width=400,
                           x_axis_label='position',
                           y_axis_label='depth')
            for x, y, name, color in zip(xs, ys, names, colors):
                p.varea(x=x,
                        y1=0,
                        y2=y,
                        legend_label=name,
                        fill_color=color,
                        alpha=0.7,
                        muted_color=color,
                        muted_alpha=0.2)
            p.legend.click_policy = 'mute'
            plots_pool.append(p)

        tab1 = Panel(child=gridplot(plots_combined, ncols=3),
                     title="Coverage Plot")
        tab2 = Panel(child=gridplot(plots_pool, ncols=3),
                     title="By amplicon pool")
        tab3 = Panel(child=gridplot(plots_orient, ncols=3),
                     title="By read orientation")
        cover_panel = Tabs(tabs=[tab1, tab2, tab3])
        section.plot(cover_panel)

    # canned VCF stats report component
    if not args.hide_variants:
        section = report_doc.add_section()
        bcfstats.full_report(args.bcftools_stats, report=section)

    # NextClade analysis
    if args.nextclade is not None:
        section = report_doc.add_section(
            section=nextclade.NextClade(args.nextclade))
        section.markdown(
            "*Note: For targeted sequencing, such as SpikeSeq, Nextclade "
            "may report 'Missing data' QC fails. This is expected and not "
            "a concern provided the regions of interest are not reported "
            "as missing.*")

    # Pangolin analysis
    if args.pangolin is not None:
        section = report_doc.add_section()
        section.markdown('''
### Lineage

The table below reports the lineage of each sample as calculated by
[pangolin](https://github.com/cov-lineages/pangolin).

''')
        section.table(pd.read_csv(args.pangolin), index=False)

    # Genotyping
    if args.genotypes is not None:
        section = report_doc.add_section()
        section.markdown('''
### Genotyping

The table below lists whether candidate variants were determined to exist
within each sample.

The ARTIC workflow pre-filters (removes) candidate variants according to the
criteria `variant_score < 20` and `coverage < 20`. The table draws attention to
reference calls of low coverage (<20 reads) which may therefore be inaccurate.
''')
        df = read_files(args.genotypes, sep=',')
        df = df[[
            'Sample', 'Result', 'Date Tested', 'Lab ID', 'testKit',
            'CH1-Target', 'CH1-Result', 'CH1-Conf'
        ]]
        df = df.sort_values(by=['Sample', 'CH1-Target'], ascending=True)
        section.table(df, index=False)

    section = report_doc.add_section()
    section.markdown('''
### Software versions

The table below highlights versions of key software used within the analysis.
''')
    versions = list()
    if args.versions is not None:
        for fname in os.listdir(args.versions):
            print("Reading versions from file:", fname)
            try:
                with open(os.path.join(args.versions, fname), 'r') as fh:
                    for line in fh.readlines():
                        name, version = line.strip().split(',')
                        versions.append((name, version))
            except Exception as e:
                print(e)
    versions = pd.DataFrame(versions, columns=('Name', 'Version'))
    section.table(versions, index=False)

    # Params reporting
    section = report_doc.add_section()
    section.markdown('''
### Workflow parameters

The table below highlights values of the main parameters used in this analysis.
''')
    df_params = load_params(args.params)
    section.table(df_params, index=False)

    # write report
    report_doc.write(args.output)
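
# `load_params` is a workflow-local helper used above. A minimal sketch
# under the assumption that the params file is a two-column key,value
# CSV (the real implementation may differ):
def load_params(path):
    """Load workflow parameters from a key,value CSV into a DataFrame."""
    if path is None:
        return pd.DataFrame(columns=('Key', 'Value'))
    return pd.read_csv(path, names=('Key', 'Value'))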
Example #5
# Imports assumed from the surrounding workflow (module paths may
# differ slightly; `Template` is assumed to be jinja2's); `read_files`
# is a workflow-local helper:
import argparse
import json
import math

from aplanat import bars
from aplanat.components import fastcat, simple as scomponents
from aplanat.report import WFReport
from aplanat.util import Colors
from bokeh.layouts import layout
from jinja2 import Template
import natsort
import pandas as pd


def main():
    """Run the entry point."""
    parser = argparse.ArgumentParser()
    parser.add_argument("report", help="Report output file")
    parser.add_argument("--summaries",
                        nargs='+',
                        required=True,
                        help="Read summary file.")
    parser.add_argument("--lineages",
                        nargs='+',
                        required=True,
                        help="Read lineage file.")
    parser.add_argument("--vistempl", required=True)
    parser.add_argument(
        "--versions",
        required=True,
        help="directory containing CSVs containing name,version.")
    parser.add_argument(
        "--params",
        required=True,
        help="A JSON file containing the workflow parameter key/values")
    parser.add_argument("--revision",
                        default='unknown',
                        help="git branch/tag of the executed workflow")
    parser.add_argument("--commit",
                        default='unknown',
                        help="git commit of the executed workflow")
    args = parser.parse_args()

    report = WFReport("Workflow Metagenomics Report",
                      "wf-metagenomics",
                      revision=args.revision,
                      commit=args.commit)

    templ = None
    with open(args.vistempl, "r") as vistempl:
        templ = vistempl.read()

    sample_lineages = {}
    for lineage in natsort.natsorted(args.lineages):
        lineage_name = lineage.split('.')[0]
        with open(lineage, 'r') as lf:
            sample_lineages[lineage_name] = json.load(lf)

    templ = templ.replace("replace_me",
                          json.dumps(sample_lineages).replace('"', '\\"'))
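    # double quotes are escaped so the JSON payload can be embedded
    # inside a quoted string in the HTML/JS template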
    report.template = Template(templ)

    #
    # Plot read counts per barcode
    #
    seq_summary = read_files(args.summaries)
    bc_counts = (pd.DataFrame(seq_summary['sample_name'].value_counts()).
                 sort_index().reset_index().rename(columns={
                     'index': 'sample',
                     'sample_name': 'count'
                 }))
    bc_counts_plot = bars.simple_bar(bc_counts['sample'].astype(str),
                                     bc_counts['count'],
                                     colors=[Colors.cerulean] * len(bc_counts),
                                     title='Number of reads per sample',
                                     plot_width=None)
    bc_counts_plot.xaxis.major_label_orientation = math.pi / 2

    section = report.add_section()
    section.markdown("### Samples")
    section.plot(layout([[bc_counts_plot]], sizing_mode="stretch_width"))

    #
    # Standard read metrics
    #
    for summ in args.summaries:
        section = report.add_section(section=fastcat.full_report(
            [summ],
            header='#### Read stats: {}'.format(summ.split('.')[0])))

    #
    # Standard wf reporting
    #
    report.add_section(section=scomponents.version_table(args.versions))
    report.add_section(section=scomponents.params_table(args.params))

    report.write(args.report)
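
# Example invocation (hypothetical file names):
#   python report.py report.html --summaries *.stats \
#       --lineages lineages/*.json --vistempl template.html \
#       --versions versions/ --params params.json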