Example #1
0
    def frequencies_plot(self, xmin=0, xmax=200):
        """ Add the Jellyfish k-mer frequency line plot as a report section.

        ``xmin`` and ``xmax`` are used as the x-axis limits in the plot
        config. The plotted data is ``self.jellyfish_data``.
        """

        plot_config = dict(
            id='Jellyfish_kmer_plot',
            title='Jellyfish: K-mer plot',
            ylab='Counts',
            xlab='k-mer frequency',
            xDecimals=False,
            xmin=xmin,
            xmax=xmax,
        )
        kmer_plot = linegraph.plot(self.jellyfish_data, plot_config)

        # Long-form help text attached to the section.
        helptext = '''
            A possible way to assess the complexity of a library even in
            absence of a reference sequence is to look at the kmer profile of the reads.
            The idea is to count all the kmers (_i.e._, sequence of length `k`) that occur
            in the reads. In this way it is possible to know how many kmers occur
            `1,2,.., N` times and represent this as a plot.
            This plot tell us for each x, how many k-mers (y-axis) are present in the
            dataset in exactly x-copies.

            In an ideal world (no errors in sequencing, no bias, no  repeated regions)
            this plot should be as close as  possible to a gaussian distribution.
            In reality we will always see a peak for `x=1` (_i.e._, the errors)
            and another peak close to the expected coverage. If the genome is highly
            heterozygous a second peak at half of the coverage can be expected.'''

        self.add_section(
            anchor='jellyfish_kmer_plot',
            description='The K-mer plot lets you estimate library complexity and coverage from k-mer content.',
            helptext=helptext,
            plot=kmer_plot,
        )
Example #2
0
    def mirtrace_complexity_plot(self):
        """ Build the miRTrace miRNA complexity line plot.

        Returns the ``linegraph.plot`` result, or ``None`` (after a debug log
        message) when no sample produced any data.
        """

        plot_data = {}
        for s_name in self.complexity_data:
            try:
                per_sample = self.complexity_data[s_name]
                # Entries are flipped on purpose: the stored value becomes the
                # x coordinate and the stored key becomes the y value.
                plot_data[s_name] = {int(per_sample[key]): int(key) for key in per_sample}
            except KeyError:
                continue

        if not plot_data:
            log.debug('No valid data for miRNA complexity')
            return None

        plot_config = dict(
            id='mirtrace_complexity_plot',
            title='miRTrace: miRNA Complexity Plot',
            ylab='Distinct miRNA Count',
            xlab='Number of Sequencing Reads',
            ymin=0,
            xmin=1,
            xDecimals=False,
            tt_label='<b>Number of Sequencing Reads {point.x}</b>: {point.y} Distinct miRNA Count',
        )
        return linegraph.plot(plot_data, plot_config)
Example #3
0
def _add_hs_penalty(data):
    subtitle = "The \"hybrid selection penalty\" incurred to get 80% of target bases to a given coverage. Can be used with the formula <code>required_aligned_bases = bait_size_bp * desired_coverage * hs_penalty</code>."
    data_clean = defaultdict(dict)
    any_non_zero = False
    for s in data:
        for h in data[s]:
            if h.startswith("HS_PENALTY"):
                data_clean[s][(h.replace("HS_PENALTY_", " ")[:-1])] = data[s][h]
                if data[s][h] > 0:
                    any_non_zero = True

    pconfig = { 'id': 'picard_hybrid_selection_penalty',
                'title': 'Picard: Hybrid Selection Penalty',
                'xlab': 'Fold Coverage',
                'ylab': 'Pct of bases',
                'ymax': 100,
                'ymin': 0,
                'xmin': 0,
                'tt_label': '<b>{point.x}X</b>: {point.y:.2f}%',}

    if any_non_zero:
        return {
            'name': 'HS penalty',
            'anchor': 'picard_hsmetrics_hs_penalty',
            'description': subtitle,
            'plot': linegraph.plot(data_clean, pconfig)
        }
Example #4
0
    def fiveprime_plot(self):
        """Generate a 5' C>T linegraph plot.

        For every sample in ``self.fivepCtoTfreq_data`` the list of
        frequencies is multiplied by 100 (to get %) and keyed by 1-based
        position, then passed to ``linegraph.plot``.

        Returns:
            The value returned by ``linegraph.plot``.
        """

        plot_data = dict()
        for s_name, freqs in self.fivepCtoTfreq_data.items():
            # enumerate(start=1) numbers every value. Zipping the values
            # against range(1, len(values)) is one position short and
            # silently drops the final data point.
            plot_data[s_name] = {
                pos: freq * 100.0 for pos, freq in enumerate(freqs, start=1)
            }

        config = {
            'id': 'fiveprime_misinc_plot',
            'title': 'DamageProfiler: 5\' C>T misincorporation plot',
            'ylab': '% C to T substituted',
            'xlab': 'Nucleotide position from 5\'',
            'tt_label': '{point.y:.2f} % C>T misincorporations at nucleotide position {point.x}',
            'ymin': 0,
            'xmin': 1
        }

        return linegraph.plot(plot_data, config)
Example #5
0
    def chart_retention_dist(self):
        """ Add the BISCUIT cytosine retention distribution section.

        The first dataset is ``self.mdata['retention_dist']`` unchanged. The
        other four take the 'CA', 'CC', 'CG' and 'CT' entry of every sample
        in ``self.mdata['retention_dist_byread']``.
        """

        retention = self.mdata['retention_dist']
        by_read = self.mdata['retention_dist_byread']

        # One dataset per tab, in the same order as 'data_labels' below.
        datasets = [retention]
        for context in ('CA', 'CC', 'CG', 'CT'):
            datasets.append({sid: dd[context] for sid, dd in by_read.items()})

        plot_config = {
            'id': 'biscuit_retention_read_cpa',
            'xlab': 'Number of Retention within Read',
            'title': 'BISCUIT: Retention Distribution',
            'data_labels': [
                {'name': 'CpG retention', 'ylab': 'Fraction of cytosine in CpG context', 'xlab': 'Retention Level (%)'},
                {'name': 'Within-read CpA', 'ylab': 'Number of Reads'},
                {'name': 'Within-read CpC', 'ylab': 'Number of Reads'},
                {'name': 'Within-read CpG', 'ylab': 'Number of Reads'},
                {'name': 'Within-read CpT', 'ylab': 'Number of Reads'},
            ]
        }

        self.add_section(
            name='Number of Retention Distribution',
            anchor='biscuit-retention-read',
            description="This plot shows the distribution of the number of retained cytosine in each read, up to 10.",
            plot=linegraph.plot(datasets, plot_config)
        )
Example #6
0
    def mirtrace_length_plot(self):
        """ Generate the miRTrace Read Length Distribution.

        Converts every key and value of ``self.length_data[sample]`` to int
        (key -> x, value -> y).

        Returns:
            The ``linegraph.plot`` result, or ``None`` (after a debug log
            message) when no sample produced any data.
        """

        data = dict()
        for s_name in self.length_data:
            try:
                data[s_name] = {int(d): int(self.length_data[s_name][d]) for d in self.length_data[s_name]}
            except KeyError:
                pass
        if not data:
            log.debug('No valid data for read length distribution')
            return None

        config = {
            'id': 'mirtrace_length_plot',
            'title': 'miRTrace: Read Length Distribution',
            'ylab': 'Read Count',
            # Fixed spelling of the axis label (was 'Read Lenth (bp)').
            'xlab': 'Read Length (bp)',
            'ymin': 0,
            'xmin': 0,
            'xDecimals': False,
            'tt_label': '<b>Read Length (bp) {point.x}</b>: {point.y} Read Count',
            # Coloured background bands over fixed read-length ranges.
            'xPlotBands': [
                {'from': 40, 'to': 50, 'color': '#ffebd1'},
                {'from': 26, 'to': 40, 'color': '#e2f5ff'},
                {'from': 18, 'to': 26, 'color': '#e5fce0'},
                {'from': 0, 'to': 18, 'color': '#ffffe2'},
            ]
        }

        return linegraph.plot(data, config)
Example #7
0
    def threeprime_plot(self):
        """Generate a 3' G>A linegraph plot.

        For every sample in ``self.threepGtoAfreq_data`` the list of
        frequencies is multiplied by 100 (to get %) and keyed by 1-based
        position, then passed to ``linegraph.plot``.

        Returns:
            The value returned by ``linegraph.plot``.
        """

        plot_data = dict()
        for s_name, freqs in self.threepGtoAfreq_data.items():
            # enumerate(start=1) numbers every value. Zipping the values
            # against range(1, len(values)) is one position short and
            # silently drops the final data point.
            plot_data[s_name] = {
                pos: freq * 100.0 for pos, freq in enumerate(freqs, start=1)
            }

        config = {
            'id': 'threeprime_misinc_plot',
            'title': 'DamageProfiler: 3P G>A misincorporation plot',
            'ylab': '% G to A substituted',
            'xlab': 'Nucleotide position from 3\'',
            'tt_label': '{point.y:.2f} % G>A misincorporations at nucleotide position {point.x}',
            'ymin': 0,
            'xmin': 1
        }

        return linegraph.plot(plot_data, config)
    def parse_bamPEFragmentSizeDistribution(self):
        """Collect bamPEFragmentSize distribution files and plot them.

        Every file found under 'deeptools/bamPEFragmentSizeDistribution' is
        parsed with ``self.parseBamPEFDistributionFile``; the per-sample
        results are merged into
        ``self.deeptools_bamPEFragmentSizeDistribution`` (a later file wins on
        a duplicate sample name, with a warning). A line plot section is added
        when at least one sample was parsed.

        Returns the number of samples collected.
        """
        distributions = dict()
        self.deeptools_bamPEFragmentSizeDistribution = distributions

        for f in self.find_log_files('deeptools/bamPEFragmentSizeDistribution', filehandles=False):
            parsed = self.parseBamPEFDistributionFile(f)
            for s_name, values in parsed.items():
                if s_name in distributions:
                    log.warning("Replacing duplicate sample {}.".format(s_name))
                distributions[s_name] = values
            # Only register files that actually contributed data.
            if parsed:
                self.add_data_source(f, section='bamPEFragmentSizeDistribution')

        if distributions:
            plot_config = dict(
                id='fragment_size_distribution_plot',
                title='deeptools: Fragment Size Distribution Plot',
                ylab='Occurrence',
                xlab='Fragment Size (bp)',
                smooth_points=50,
                xmax=1000,
                xDecimals=False,
                tt_label='<b>Fragment Size (bp) {point.x}</b>: {point.y} Occurrence',
            )
            self.add_section(
                name='Fragment size distribution',
                anchor='fragment_size_distribution',
                description="Distribution of paired-end fragment sizes",
                plot=linegraph.plot(distributions, plot_config)
            )

        return len(distributions)
Example #9
0
    def __init__(self, c_id, mod):
        """ Set up a custom-content module and add its single section.

        ``mod['config']`` supplies the section name / anchor / href /
        description, the plot config (``pconfig``) and the ``plot_type`` that
        selects how ``mod['data']`` is rendered. A missing or unrecognised
        plot type only logs a warning.
        """

        mod_config = mod['config']

        # Section name falls back to a title-cased version of the content ID.
        modname = mod_config.get('section_name', c_id.replace('_', ' ').title())
        if modname is None or modname == '':
            modname = 'Custom Content'

        # Initialise the parent object
        super(MultiqcModule, self).__init__(
            name=modname,
            anchor=mod_config.get('section_anchor', c_id),
            href=mod_config.get('section_href'),
            info=mod_config.get('description')
        )

        pconfig = mod_config.get('pconfig', {})
        if pconfig.get('title') is None:
            pconfig['title'] = modname

        plot_type = mod_config.get('plot_type')

        if plot_type == 'table':
            # Tables keep their row order unless the config says otherwise.
            pconfig.setdefault('sortRows', False)
            headers = mod_config.get('headers')
            self.add_section(plot=table.plot(mod['data'], headers, pconfig))
            self.write_data_file(mod['data'], "multiqc_{}".format(modname.lower().replace(' ', '_')))

        elif plot_type == 'bargraph':
            self.add_section(plot=bargraph.plot(mod['data'], mod_config.get('categories'), pconfig))

        elif plot_type == 'linegraph':
            self.add_section(plot=linegraph.plot(mod['data'], pconfig))

        elif plot_type == 'scatter':
            self.add_section(plot=scatter.plot(mod['data'], pconfig))

        elif plot_type == 'heatmap':
            self.add_section(plot=heatmap.plot(mod['data'], mod_config.get('xcats'), mod_config.get('ycats'), pconfig))

        elif plot_type == 'beeswarm':
            self.add_section(plot=beeswarm.plot(mod['data'], pconfig))

        # Raw HTML and raw image files (as html) are both added as content
        elif plot_type in ('html', 'image'):
            self.add_section(content=mod['data'])

        # Not supplied
        elif plot_type is None:
            log.warning("Plot type not found for content ID '{}'".format(c_id))

        # Not recognised
        else:
            log.warning("Error - custom content plot type '{}' not recognised for content ID {}".format(plot_type, c_id))
Example #10
0
    def bcbio_coverage_avg_chart_deprecated_in_1_0_6(self, names):
        """ Make the bcbio coverage plot from the old-style file
            (the format used before mosdepth integration, deprecated
            since bcbio 1.0.6).

            Only lines starting with "percentage" are read. Each must hold
            three tab-separated fields: "percentage<X>", the percentage of
            bases, and a third field that is not used here.

            Returns the linegraph plot, or None when no data was parsed.
        """

        # Largest X at which any sample still has more than 1% of bases;
        # used as the upper x-axis limit.
        x_threshold = 0
        data = defaultdict(dict)
        for f in self.find_log_files(names):
            s_name = self.clean_s_name(f['fn'], root=None)
            for line in f['f'].split("\n"):
                if not line.startswith("percentage"):
                    continue
                cutoff_reads, bases_pct, _unused = line.split("\t")
                y = float(bases_pct)
                x = int(cutoff_reads.replace("percentage", ""))
                data[s_name][x] = y
                if y > 1.0:
                    x_threshold = max(x_threshold, x)

            # Membership test does not create an entry in the defaultdict,
            # so files without any matching line are not registered.
            if s_name in data:
                self.add_data_source(f)

        if data:
            return linegraph.plot(data, {
                'xlab': 'Coverage (X)',
                # Fixed label typos ('rarget', 'by least').
                "ylab": '% bases in genome or target covered by at least X reads',
                'ymax': 100,
                "xmax": x_threshold,
            })
Example #11
0
 def plot_readlengths(self):
     """ Add the MinIONQC 'Read length output' section.

     Two datasets (reads, gigabases) are built from the 'All reads' entry
     of every sample in ``self.minionqc_raw_data``, followed by another
     pair for each entry of ``self.q_threshold_list``. A threshold that is
     missing from any sample (KeyError) is skipped entirely.
     """
     raw = self.minionqc_raw_data

     datasets = [
         {s_name: d['All reads']['reads'] for s_name, d in raw.items()},
         {s_name: d['All reads']['gigabases'] for s_name, d in raw.items()},
     ]
     labels = [
         {'name': 'All reads: Num reads', 'ylab': '# reads'},
         {'name': 'All reads: Num gigabases', 'ylab': '# gigabases'},
     ]

     for qfilt in self.q_threshold_list:
         try:
             reads = {s_name: d[qfilt]['reads'] for s_name, d in raw.items()}
             gigabases = {s_name: d[qfilt]['gigabases'] for s_name, d in raw.items()}
         except KeyError:
             # Nothing is added for this threshold: neither data nor labels.
             continue
         datasets.extend([reads, gigabases])
         labels.extend([
             {'name': '{}: Num reads'.format(qfilt), 'ylab': '# reads'},
             {'name': '{}: Num gigabases'.format(qfilt), 'ylab': '# gigabases'},
         ])

     plot_config = {
         'id': 'minionqc_read_lengths',
         'title': 'MinIONQC: Output versus read length',
         'categories': True,
         'data_labels': labels,
     }
     self.add_section(
         name='Read length output',
         anchor='minionqc-read-length-output',
         description='Number of reads / bp sequenced at given read length thresholds.',
         plot=linegraph.plot(datasets, pconfig=plot_config)
     )
Example #12
0
 def coverage_lineplot (self):
     """ Add the RNA-SeQC gene body coverage line plot section.

     One dataset is included for each of the high / medium / low coverage
     attributes that is non-empty. No section is added when all three are
     empty.
     """
     tiers = (
         (self.rna_seqc_norm_high_cov, 'High Expressed'),
         (self.rna_seqc_norm_medium_cov, 'Medium Expressed'),
         (self.rna_seqc_norm_low_cov, 'Low Expressed'),
     )

     datasets = []
     labels = []
     for coverage, tier_name in tiers:
         if len(coverage) > 0:
             datasets.append(coverage)
             labels.append({'name': tier_name})

     if len(datasets) == 0:
         return

     plot_config = dict([
         ('id', 'rna_seqc_mean_coverage_plot'),
         ('title', 'RNA-SeQC: Gene Body Coverage'),
         ('ylab', '% Coverage'),
         ('xlab', "Gene Body Percentile (5' -> 3')"),
         ('xmin', 0),
         ('xmax', 100),
         ('tt_label', "<strong>{point.x}% from 5'</strong>: {point.y:.2f}"),
         ('data_labels', labels),
     ])
     self.add_section(
         name='Gene Body Coverage',
         anchor='rseqc-rna_seqc_mean_coverage',
         helptext='The metrics are calculated across the transcripts with tiered expression levels.',
         plot=linegraph.plot(datasets, plot_config)
     )
Example #13
0
    def cutadapt_length_trimmed_plot (self):
        """ Add the Cutadapt trimmed-length line plot section.

        Two datasets are plotted: ``self.cutadapt_length_counts`` and
        ``self.cutadapt_length_obsexp``, labelled 'Counts' and 'Obs/Exp'.
        """

        length_datasets = [self.cutadapt_length_counts, self.cutadapt_length_obsexp]

        plot_config = dict(
            id='cutadapt_plot',
            title='Cutadapt: Lengths of Trimmed Sequences',
            ylab='Counts',
            xlab='Length Trimmed (bp)',
            xDecimals=False,
            ymin=0,
            tt_label='<b>{point.x} bp trimmed</b>: {point.y:.0f}',
            data_labels=[
                {'name': 'Counts', 'ylab': 'Count'},
                {'name': 'Obs/Exp', 'ylab': 'Observed / Expected'},
            ],
        )

        # The continuation lines' leading whitespace is part of the string.
        description = 'This plot shows the number of reads with certain lengths of adapter trimmed. \n\
        Obs/Exp shows the raw counts divided by the number expected due to sequencing errors. A defined peak \n\
        may be related to adapter length. See the \n\
        <a href="http://cutadapt.readthedocs.org/en/latest/guide.html#how-to-read-the-report" target="_blank">cutadapt documentation</a> \n\
        for more information on how these numbers are generated.'

        self.add_section(
            description=description,
            plot=linegraph.plot(length_datasets, plot_config)
        )
Example #14
0
    def sequence_quality_plot (self):
        """ Add the FastQC per-base mean quality line plot section.

        Samples whose data raises a KeyError while being read are skipped.
        When no sample is left, a debug message is logged and ``None`` is
        returned without adding a section.
        """

        mean_quality = {}
        for s_name, report in self.fastqc_data.items():
            try:
                mean_quality[s_name] = {
                    self.avg_bp_from_range(row['base']): row['mean']
                    for row in report['per_base_sequence_quality']
                }
            except KeyError:
                continue

        if not mean_quality:
            log.debug('sequence_quality not found in FastQC reports')
            return None

        # Background bands over fixed Phred score ranges.
        quality_bands = [
            {'from': 28, 'to': 100, 'color': '#c3e6c3'},
            {'from': 20, 'to': 28, 'color': '#e6dcc3'},
            {'from': 0, 'to': 20, 'color': '#e6c3c3'},
        ]
        plot_config = {
            'id': 'fastqc_per_base_sequence_quality_plot',
            'title': 'FastQC: Mean Quality Scores',
            'ylab': 'Phred Score',
            'xlab': 'Position (bp)',
            'ymin': 0,
            'xDecimals': False,
            'tt_label': '<b>Base {point.x}</b>: {point.y:.2f}',
            'colors': self.get_status_cols('per_base_sequence_quality'),
            'yPlotBands': quality_bands,
        }

        section_description = (
            'The mean quality value across each base position in the read. '
            'See the <a href="http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/3%20Analysis%20Modules/2%20Per%20Base%20Sequence%20Quality.html" target="_blank">FastQC help</a>.'
        )
        self.add_section(
            name='Sequence Quality Histograms',
            anchor='fastqc_per_base_sequence_quality',
            description=section_description,
            plot=linegraph.plot(mean_quality, plot_config)
        )
Example #15
0
 def _bcbio_umi_count_plot(self, parsed_data):
     """ Build the 'UMI count distribution' section dict.

     Takes the "umi_counts" entry of every sample in ``parsed_data`` and
     returns a dict with the section name, anchor and line plot.
     """
     umi_counts = {s: info["umi_counts"] for s, info in parsed_data.items()}
     plot_config = dict(xlab="Reads per UMI", ylab="Count", xDecimals=False)
     # The data is passed as a one-element list of datasets.
     umi_plot = linegraph.plot([umi_counts], plot_config)
     return {
         'name': 'UMI count distribution',
         'anchor': 'umi-stats-counts',
         'plot': umi_plot,
     }
Example #16
0
def plot_bhist(samples, file_type, **plot_args):
    """ Create line graph plot of histogram data for BBMap 'bhist' output.

    The 'samples' parameter could be from the bbmap mod_data dictionary:
    samples = bbmap.MultiqcModule.mod_data[file_type]

    Args:
        samples: mapping of sample name -> dict whose 'data' entry maps an
            x value to an indexable row of per-column values.
        file_type: used to build the plot id ('bbmap-<file_type>_plot').
        **plot_args: must contain 'plot_title' (appended to the title) and
            'plot_params' (a dict merged over the default plot config).

    Returns:
        The value returned by ``linegraph.plot``.

    Raises:
        KeyError: if 'plot_title' or 'plot_params' is missing.
    """

    # Every x value seen in any sample, collected directly from the keys and
    # sorted once so each series is filled in ascending x order. No sorting
    # of the (x, row) items themselves is needed.
    all_x = sorted(set().union(*(samples[sample]['data'] for sample in samples)))

    # Each dataset keeps its tab label next to the columns it plots, so the
    # labels always line up with the data.
    datasets = (
        ('Percentage of G+C bases', ((1, 'C'), (2, 'G'))),
        ('Percentage of A+T bases', ((0, 'A'), (3, 'T'))),
        ('Percentage of N bases', ((4, 'N'),)),
    )

    nucleotide_data = []
    for _label, columns in datasets:
        series = {}
        for sample in samples:
            rows = samples[sample]['data']
            for column, column_name in columns:
                # Values are scaled by 100; x values missing for this
                # sample are filled with 0.
                series[sample + '.' + column_name] = {
                    x: rows[x][column] * 100 if x in rows else 0
                    for x in all_x
                }
        nucleotide_data.append(series)

    plot_params = {
        'id': 'bbmap-' + file_type + '_plot',
        'title': 'BBTools: ' + plot_args['plot_title'],
        'xlab': 'Read position',
        'ymin': 0,
        'ymax': 100,
        'data_labels': [{'name': label} for label, _columns in datasets],
    }
    # Caller-supplied parameters override the defaults above.
    plot_params.update(plot_args['plot_params'])

    return linegraph.plot(nucleotide_data, plot_params)
Example #17
0
    def parse_plotFingerprint(self):
        """Collect plotFingerprint output and add its plots.

        Handles both the quality-metrics files and the raw-counts files.
        Parsed samples are merged into
        ``self.deeptools_plotFingerprintOutQualityMetrics`` and
        ``self.deeptools_plotFingerprintOutRawCounts`` (a later file wins on a
        duplicate sample name, with a warning). A section is added for each
        of the two that is non-empty.

        Returns a tuple: (number of quality-metrics samples, number of
        raw-counts samples).
        """
        quality_metrics = dict()
        self.deeptools_plotFingerprintOutQualityMetrics = quality_metrics
        for f in self.find_log_files('deeptools/plotFingerprintOutQualityMetrics'):
            parsed = self.parsePlotFingerprintOutQualityMetrics(f)
            for s_name, values in parsed.items():
                if s_name in quality_metrics:
                    log.warning("Replacing duplicate sample {}.".format(s_name))
                quality_metrics[s_name] = values
            if parsed:
                self.add_data_source(f, section='plotFingerprint')

        raw_counts = dict()
        self.deeptools_plotFingerprintOutRawCounts = raw_counts
        for f in self.find_log_files('deeptools/plotFingerprintOutRawCounts'):
            parsed = self.parsePlotFingerprintOutRawCounts(f)
            for s_name, values in parsed.items():
                if s_name in raw_counts:
                    log.warning("Replacing duplicate sample {}.".format(s_name))
                raw_counts[s_name] = values
            if parsed:
                self.add_data_source(f, section='plotFingerprint')

        if quality_metrics:
            metrics_config = {
                'ymin': 0.0,
                'ymax': 1.0,
                'ylab': 'Value',
                'categories': True,
                'id': 'plotFingerprint_quality_metrics',
                'title': 'Fingerprint quality metrics',
            }
            self.add_section(
                name="Fingerprint quality metrics",
                anchor="plotFingerprint",
                description="Various quality metrics returned by plotFingerprint",
                plot=linegraph.plot(quality_metrics, metrics_config),
            )

        if raw_counts:
            fingerprint_config = {
                'xmin': 0.0,
                'xmax': 1.0,
                'ymin': 0.0,
                'ymax': 1.0,
                'xlab': 'rank',
                'ylab': 'Fraction w.r.t. bin with highest coverage',
                'id': 'deeptools_fingerprint_plot',
                'title': 'Fingerprint',
            }
            self.add_section(
                name="Fingerprint",
                anchor="deeptools_fingerprint",
                description="Signal fingerprint according to plotFingerprint",
                plot=linegraph.plot(raw_counts, fingerprint_config),
            )

        return len(quality_metrics), len(raw_counts)
Example #18
0
    def chart_align_mapq(self):
        """ Add the BISCUIT mapping summary and mapping quality sections.

        Reads ``self.mdata['align_mapq']`` (sample -> {mapq: count}, where
        the key 'unmapped' holds the unmapped count) and adds:

        * a bar plot splitting reads into optimally aligned (mapQ >= 40),
          suboptimally aligned and unaligned;
        * a line plot of the percentage of mapped reads at each mapQ 0-60.
        """
        align_mapq = self.mdata['align_mapq']

        # fraction of optimally mapped reads
        summary = {}
        for sid, dd in align_mapq.items():
            # NOTE(review): 'UAligned' is seeded with 1, not 0, which looks
            # like an off-by-one; left unchanged here - confirm the intent.
            counts = {'OAligned': 0, 'SAligned': 0, 'UAligned': 1}
            for mapq, cnt in dd.items():
                if mapq == 'unmapped':
                    counts['UAligned'] += int(cnt)
                elif int(mapq) >= 40:
                    counts['OAligned'] += int(cnt)
                else:
                    counts['SAligned'] += int(cnt)
            summary[sid] = counts

        self.add_section(
            name = 'Mapping Summary',
            anchor = 'biscuit-mapping',
            description = 'This shows the fraction of optimally aligned reads, which is defined by mapQ >= 40.',
            helptext = 'A good library should have high fraction of reads optimally aligned. Suboptimally aligned reads include both nonunique alignments and imperfect alignments.',
            plot = bargraph.plot(summary, OrderedDict([
                ('OAligned', {'name':'Optimally Aligned Reads'}),
                ('SAligned', {'name':'Suboptimally Aligned Reads'}),
                ('UAligned', {'name':'Unaligned Reads'})
            ]), {'id':'biscuit_mapping_summary',
                 'title':'BISCUIT: Mapping Summary',
                 'ylab':'Number of Reads',
                 'cpswitch_counts_label': '# Reads'
            })
        )

        # Mapping quality together in one plot
        pd_mapping = {}
        for sid, dd in align_mapq.items():
            # Total of all mapped reads (everything except 'unmapped').
            mapped_total = sum(int(cnt) for mapq, cnt in dd.items() if mapq != "unmapped")
            percentages = {}
            for mapq in range(61):
                # A sample with no mapped reads reports 0% everywhere
                # instead of dividing by zero.
                if str(mapq) in dd and mapped_total != 0:
                    percentages[mapq] = float(dd[str(mapq)]) / mapped_total * 100
                else:
                    percentages[mapq] = 0
            pd_mapping[sid] = percentages

        self.add_section(
            name = 'Mapping Quality Distribution',
            anchor = 'biscuit-mapq',
            description = "This plot shows the distribution of primary mapping quality.",
            plot = linegraph.plot(pd_mapping,
                {'id':'biscuit_mapping',
                 'title': 'BISCUIT: Mapping Information',
                 'ymin': 0, 'yLabelFormat': '{value}%',
                 'tt_label': '<strong>Q{point.x}:</strong> {point.y:.2f}% of reads',
                 'name':'Mapping Quality', 'ylab': '% Primary Mapped Reads','xlab': 'Mapping Quality'}))
Example #19
0
    def slamdunkTcPerUTRPosPlot (self):
        """ Add the two Slamdunk per-UTR-position line plot sections.

        The first section plots non-T>C mismatches, the second T>C
        conversions. Each has a plus-strand and a minus-strand dataset.
        """

        def _utr_pconfig(plot_id, title, ylab):
            # Both plots share everything except id, title and y label.
            return {
                'id': plot_id,
                'title': title,
                'ylab': ylab,
                'xlab': 'Position in the static last 250bp window of 3\' UTR',
                'xDecimals': False,
                'ymin': 0,
                'tt_label': '<b>Pos {point.x}</b>: {point.y:.2f} %',
                'data_labels': [{'name': 'UTRs on plus strand', 'ylab': ylab},
                                {'name': 'UTRs on minus strand', 'ylab': ylab}]
            }

        pconfig_nontc = _utr_pconfig(
            'slamdunk_slamdunk_nontcperutrpos_plot',
            'Slamdunk: Non-T>C mutations over 3\' UTR ends',
            'Percent mismatches %',
        )
        pconfig_tc = _utr_pconfig(
            'slamdunk_slamdunk_tcperutrpos_plot',
            'Slamdunk: T>C conversions over 3\' UTR ends',
            'Percent converted %',
        )

        # The second line's leading whitespace is part of each description.
        nontc_description = """This plot shows the distribution of non T&gt;C mismatches across UTR positions for the last 250 bp from the 3\' UTR end
                        (see the <a href="http://t-neumann.github.io/slamdunk/docs.html#tcperutrpos" target="_blank">slamdunk docs</a>)."""
        tc_description = """This plot shows the distribution of T&gt;C conversions across UTR positions for the last 250 bp from the 3\' UTR end
                        (see the <a href="http://t-neumann.github.io/slamdunk/docs.html#tcperutrpos" target="_blank">slamdunk docs</a>)."""

        self.add_section(
            name='Non T&gt;C mismatches over UTR positions',
            anchor='slamdunk_nontcperutrpos',
            description=nontc_description,
            plot=linegraph.plot([self.nontc_per_utrpos_plus, self.nontc_per_utrpos_minus], pconfig_nontc)
        )

        self.add_section(
            name='T&gt;C conversions over UTR positions',
            anchor='tcperutrpos',
            description=tc_description,
            plot=linegraph.plot([self.tc_per_utrpos_plus, self.tc_per_utrpos_minus], pconfig_tc)
        )
Example #20
0
    def slamdunkTcPerReadPosPlot (self):
        """ Add the two Slamdunk per-read-position line plot sections.

        The first section plots non-T>C mismatches, the second T>C
        conversions. Each has a forward-read and a reverse-read dataset.
        """

        def _readpos_pconfig(plot_id, title, ylab):
            # Both plots share everything except id, title and y label.
            return {
                'id': plot_id,
                'title': title,
                'ylab': ylab,
                'xlab': 'Position in read',
                'xDecimals': False,
                'ymin': 0,
                'tt_label': '<b>Pos {point.x}</b>: {point.y:.2f} %',
                'data_labels': [{'name': 'Forward reads +', 'ylab': ylab},
                                {'name': 'Reverse reads -', 'ylab': ylab}]
            }

        pconfig_nontc = _readpos_pconfig(
            'slamdunk_nontcperreadpos_plot',
            'Slamdunk: Non-T>C mismatches over reads',
            'Percent mismatches %',
        )
        pconfig_tc = _readpos_pconfig(
            'slamdunk_tcperreadpos_plot',
            'Slamdunk: T>C conversions over reads',
            'Percent converted %',
        )

        # The second line's leading whitespace is part of each description.
        nontc_description = """This plot shows the distribution of non T&gt;C mismatches across read positions
                        (see the <a href="http://t-neumann.github.io/slamdunk/docs.html#tcperreadpos" target="_blank">slamdunk docs</a>)."""
        tc_description = """This plot shows the distribution of T&gt;C conversions across read positions
                        (see the <a href="http://t-neumann.github.io/slamdunk/docs.html#tcperreadpos" target="_blank">slamdunk docs</a>)."""

        self.add_section(
            name='Non T&gt;C mismatches over read positions',
            anchor='slamdunk_nontcperreadpos',
            description=nontc_description,
            plot=linegraph.plot([self.nontc_per_readpos_plus, self.nontc_per_readpos_minus], pconfig_nontc)
        )

        self.add_section(
            name='T&gt;C conversions over read positions',
            anchor='slamdunk_tcperreadpos',
            description=tc_description,
            plot=linegraph.plot([self.tc_per_readpos_plus, self.tc_per_readpos_minus], pconfig_tc)
        )
Example #21
0
    def length_dist_chart (self):
        """ Build the tag length distribution line plot.

        Plots ``self.tagdir_data['length']`` and returns the
        ``linegraph.plot`` result.
        """
        length_data = self.tagdir_data['length']
        plot_config = dict(
            id='tagLengthDistribution',
            cpswitch=True,
            title='Tag Length Distribution',
            ylab='Fraction of Tags',
            xlab='Tag Length (bp)',
        )
        return linegraph.plot(length_data, plot_config)
Example #22
0
def plot_idhist(samples, file_type, **plot_args):
    """ Create line graph plot of histogram data for BBMap 'idhist' output.

    The 'samples' parameter could be from the bbmap mod_data dictionary:
    samples = bbmap.MultiqcModule.mod_data[file_type]

    Two datasets are produced: 'Reads' (column 0 of each row) and 'Bases'
    (column 1 of each row). Every sample gets a value for every x seen in
    any sample; x values that a sample lacks are filled with 0.

    Args:
        samples: Mapping of sample name to a dict whose 'data' entry maps an
            x value to an indexable row.
        file_type: Suffix used to build the plot id ('bbmap-' + file_type).
        **plot_args: Must provide 'plot_title' (appended to the plot title)
            and 'plot_params' (a dict merged over the default plot config).

    Returns:
        The result of ``linegraph.plot`` for the two datasets.
    """

    # Collect the x values straight from the keys and sort them once, so that
    # every series is built in ascending x order. Sorting all (x, row) items
    # only to throw the order away in a set was wasted work and could fail on
    # rows that are not mutually comparable.
    all_x = sorted({x for sample in samples.values() for x in sample['data']})

    # Dataset name -> {row column index: suffix appended to the sample name}
    columns_to_plot = {
        'Reads': {
            0: 'Count',
        },
        'Bases': {
            1: 'Count',
        }
    }

    plot_data = [
        {
            sample + '.' + column_name: {
                x: samples[sample]['data'][x][column] if x in samples[sample]['data'] else 0
                for x in all_x
            }
            for sample in samples
            for column, column_name in columns.items()
        }
        for columns in columns_to_plot.values()
    ]

    plot_params = {
            'id': 'bbmap-' + file_type,
            'title': 'BBTools: ' + plot_args['plot_title'],
            'xlab': 'Percent identity',
            'ylab': 'Read count',
            'data_labels': [
                {'name': 'Reads', 'ylab': 'Read count'},
                {'name': 'Bases', 'ylab': 'Number of bases'},
            ]

    }
    # Caller-supplied settings take precedence over the defaults above
    plot_params.update(plot_args['plot_params'])

    return linegraph.plot(plot_data, plot_params)
Example #23
0
    def bcbio_coverage_avg_chart(self, names):
        """Parse coverage logs and build the coverage line graphs.

        Each tab-separated line of every matching log file is read as
        ``contig, cutoff_reads, bases_fraction``. Two series are collected per
        sample: percentage of bases per read cutoff, and the summed base
        fraction per contig (lines whose contig is "total" are left out of
        the latter).

        Returns:
            A dict with the keys "coverage_avg_chart" and
            "coverage_avg_per_contig_plot", or an empty dict when nothing
            was parsed.
        """
        coverage = defaultdict(dict)
        contig_sums = defaultdict(dict)
        max_x = 0

        for log_file in self.find_log_files(names):
            sample = self.clean_s_name(log_file['fn'], root=None)
            tabbed_lines = (ln for ln in log_file['f'].split("\n") if "\t" in ln)
            for ln in tabbed_lines:
                contig, cutoff_reads, bases_fraction = ln.split("\t")
                fraction = float(bases_fraction)
                if contig != "total":
                    per_contig = contig_sums[sample]
                    per_contig[contig] = per_contig.get(contig, 0) + fraction
                pct = 100.0 * fraction
                depth = int(cutoff_reads)
                coverage[sample][depth] = pct
                # Track the largest cutoff that still covers more than 1%
                if pct > 1.0 and depth > max_x:
                    max_x = depth

            if sample in coverage:
                self.add_data_source(log_file)

        if not coverage:
            return {}

        return {
            "coverage_avg_chart": linegraph.plot(coverage, {
                'id': "coverage_avg_chart",
                'xlab': 'Coverage (X)',
                "ylab": '% bases in genome or rarget covered by least X reads',
                'ymax': 100,
                "xmax": max_x,
            }),
            "coverage_avg_per_contig_plot": linegraph.plot(contig_sums, {
                'id': "coverage_avg_per_contig_plot",
                'xlab': 'region',
                'ylab': 'average coverage',
                'categories': True
            }),
        }
Example #24
0
    def qualities_plot(self):
        """Build the SnpEff qualities line graph from ``self.snpeff_qualities``."""
        plot_settings = dict(
            smooth_points=200,
            id='snpeff_qualities',
            title='SnpEff: Qualities',
            ylab='Count',
            xlab='Values',
            xDecimals=False,
            ymin=0,
        )
        return linegraph.plot(self.snpeff_qualities, plot_settings)
Example #25
0
    def roc_plot(self):
        """Parse goleft indexcov ROC files and add the scaled coverage section.

        The header line of each file supplies the sample names (columns after
        the first two); every following row supplies a chromosome, a coverage
        value and one value per sample. Rows whose chromosome is rejected by
        ``self._short_chrom`` are dropped. One dataset is plotted per
        chromosome, capped at 50 chromosomes.

        Returns:
            True when a section was added, False when no data was found.
        """
        helptext = 'Lower coverage samples have shorter curves where the proportion of regions covered \n\
        drops off more quickly. This indicates a higher fraction of low coverage regions.'
        max_chroms = 50

        # chromosome -> sample -> {coverage: value}
        data = collections.defaultdict(lambda: collections.defaultdict(dict))
        for fn in self.find_log_files('goleft_indexcov/roc', filehandles=True):
            header = fn['f'].readline()
            sample_names = [self.clean_s_name(x, fn["root"]) for x in header.strip().split()[2:]]
            for line in fn['f']:
                parts = line.rstrip().split()
                if len(parts) <= 2:
                    continue
                chrom, cov = parts[:2]
                if self._short_chrom(chrom) is None:
                    continue
                for val, sample in zip(parts[2:], sample_names):
                    data[chrom][sample][float(cov)] = float(val)

        # Filter to strip out ignored sample names
        for chrom in data:
            data[chrom] = self.ignore_samples(data[chrom])

        if not data:
            return False

        def chrom_sort_key(name):
            # Zero-pad numeric short names so they sort numerically as text;
            # non-numeric names are returned unchanged.
            short = self._short_chrom(name)
            try:
                return "%06d" % short
            except TypeError:
                return short

        chroms = sorted(data.keys(), key=chrom_sort_key)
        log.info("Found goleft indexcov ROC reports for %s samples" % (len(data[chroms[0]])))
        if len(chroms) > max_chroms:
            log.info("Too many chromosomes found: %s, limiting to %s" % (len(chroms), max_chroms))
            chroms = chroms[:max_chroms]

        pconfig = {
            'id': 'goleft_indexcov-roc-plot',
            'title': 'goleft indexcov: ROC - genome coverage per scaled depth by chromosome',
            'xlab': 'Scaled coverage',
            'ylab': 'Proportion of regions covered',
            'ymin': 0, 'ymax': 1.0,
            'xmin': 0, 'xmax': 1.5,
            'data_labels': [{"name": self._short_chrom(c)} for c in chroms]}
        datasets = [data[c] for c in chroms]
        self.add_section (
            name = 'Scaled coverage ROC plot',
            anchor = 'goleft_indexcov-roc',
            description = 'Coverage (ROC) plot that shows genome coverage at at given (scaled) depth.',
            helptext = helptext,
            plot = linegraph.plot(datasets, pconfig)
        )
        return True
Example #26
0
    def adapter_removal_length_dist_plot(self):
        """Add the Adapter Removal read length distribution section.

        The 'all' and 'mate1' series are always plotted; 'mate2' and
        'singleton' are added when any sample is paired, 'collapsed' and
        'collapsed_truncated' when additionally any sample is collapsed, and
        'discarded' is always plotted last.
        """
        lineplot_data = []
        data_labels = []

        def add_series(key, label):
            # Keep the dataset and its tab label in lock-step
            lineplot_data.append(self.len_dist_plot_data[key])
            data_labels.append({'name': label, 'ylab': 'Count'})

        add_series('all', 'All')
        add_series('mate1', 'Mate1')
        if self.__any_paired:
            add_series('mate2', 'Mate2')
            add_series('singleton', 'Singleton')
            if self.__any_collapsed:
                add_series('collapsed', 'Collapsed')
                add_series('collapsed_truncated', 'Collapsed Truncated')
        add_series('discarded', 'Discarded')

        pconfig = {
            'title': 'Adapter Removal: Length Distribution',
            'id': 'ar_length_count_plot',
            'ylab': 'Counts',
            'xlab': 'read length',
            'xDecimals': False,
            'ymin': 0,
            'tt_label': '<b>{point.x} bp trimmed</b>: {point.y:.0f}',
            'data_labels': data_labels
        }

        self.add_section(
            name='Length Distribution Paired End Collapsed',
            anchor='ar_length_count',
            description='The length distribution of reads after processing adapter alignment.',
            plot=linegraph.plot(lineplot_data, pconfig)
        )
Example #27
0
    def n_content_plot(self):
        """Add the FastQC 'Per Base N Content' section.

        Samples without usable 'per_base_n_content' data (a KeyError while
        reading it) are left out. Returns None without adding a section when
        no sample has data.
        """
        data = dict()
        for s_name, report in self.fastqc_data.items():
            try:
                points = {}
                for row in report['per_base_n_content']:
                    position = self.avg_bp_from_range(row['base'])
                    points[position] = row['n-count']
            except KeyError:
                continue
            data[s_name] = points

        if not data:
            log.debug('per_base_n_content not found in FastQC reports')
            return None

        # Background bands, listed as (from, to, colour)
        bands = (
            (20, 100, '#e6c3c3'),
            (5, 20, '#e6dcc3'),
            (0, 5, '#c3e6c3'),
        )
        pconfig = {
            'id': 'fastqc_per_base_n_content_plot',
            'title': 'FastQC: Per Base N Content',
            'ylab': 'Percentage N-Count',
            'xlab': 'Position in Read (bp)',
            'yCeiling': 100,
            'yMinRange': 5,
            'ymin': 0,
            'xmin': 0,
            'xDecimals': False,
            'colors': self.get_status_cols('per_base_n_content'),
            'tt_label': '<b>Base {point.x}</b>: {point.y:.2f}%',
            'yPlotBands': [{'from': lo, 'to': hi, 'color': colour} for lo, hi, colour in bands]
        }

        self.add_section (
            name = 'Per Base N Content',
            anchor = 'fastqc_per_base_n_content',
            description = 'The percentage of base calls at each position for which an `N` was called.',
            helptext = '''
            From the [FastQC help](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/3%20Analysis%20Modules/6%20Per%20Base%20N%20Content.html):

            _If a sequencer is unable to make a base call with sufficient confidence then it will
            normally substitute an `N` rather than a conventional base call. This graph shows the
            percentage of base calls at each position for which an `N` was called._

            _It's not unusual to see a very low proportion of Ns appearing in a sequence, especially
            nearer the end of a sequence. However, if this proportion rises above a few percent
            it suggests that the analysis pipeline was unable to interpret the data well enough to
            make valid base calls._
            ''',
            plot = linegraph.plot(data, pconfig)
        )
Example #28
0
 def freqpoly_plot(data):
     """Make a freqpoly plot of merged read lengths.

     Two datasets are plotted: the absolute counts as given, and the same
     counts divided by each sample's total.

     Args:
         data: Mapping of sample name to a mapping of merged read length to
             count.

     Returns:
         The result of ``linegraph.plot`` for the two datasets.
     """
     rel_data = OrderedDict()
     for key, val in data.items():
         tot = sum(val.values(), 0)
         # A sample whose counts sum to zero has no meaningful relative
         # frequency; report 0 instead of raising ZeroDivisionError.
         rel_data[key] = {k: (v / tot if tot else 0) for k, v in val.items()}
     fplotconfig = {
         'data_labels': [
             {'name': 'Absolute', 'ylab': 'Frequency', 'xlab': 'Merged Read Length'},
             {'name': 'Relative', 'ylab': 'Relative Frequency', 'xlab': 'Merged Read Length'}
             ],
         'id': 'flash_freqpoly_plot', 'title': 'FLASh: Frequency of merged read lengths',
         # One colour per sample, shared by both datasets
         'colors': dict(zip(data.keys(), MultiqcModule.get_colors(len(data))))
         }
     return linegraph.plot([data, rel_data], fplotconfig)
Example #29
0
    def seq_length_dist_plot(self):
        """Add the FastQC 'Sequence Length Distribution' section.

        When no sample has more than one distinct length, an info alert that
        lists the lengths is shown instead of a plot. Returns None in every
        case; no section is added when no sample has data.
        """
        data = dict()
        seq_lengths = set()
        any_sample_varies = False
        for s_name, report in self.fastqc_data.items():
            try:
                dist = {self.avg_bp_from_range(row['length']): row['count'] for row in report['sequence_length_distribution']}
            except KeyError:
                continue
            data[s_name] = dist
            seq_lengths.update(dist)
            if len(dist) > 1:
                any_sample_varies = True

        if not data:
            log.debug('sequence_length_distribution not found in FastQC reports')
            return None

        if any_sample_varies:
            pconfig = {
                'id': 'fastqc_sequence_length_distribution_plot',
                'title': 'FastQC: Sequence Length Distribution',
                'ylab': 'Read Count',
                'xlab': 'Sequence Length (bp)',
                'ymin': 0,
                'yMinTickInterval': 0.1,
                'xDecimals': False,
                'colors': self.get_status_cols('sequence_length_distribution'),
                'tt_label': '<b>{point.x} bp</b>: {point.y}',
            }
            desc =  'The distribution of fragment sizes (read lengths) found. \
                See the <a href="http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/3%20Analysis%20Modules/7%20Sequence%20Length%20Distribution.html" target="_blank">FastQC help</a>.'
            self.add_section (
                name = 'Sequence Length Distribution',
                anchor = 'fastqc_sequence_length_distribution',
                description = desc,
                plot = linegraph.plot(data, pconfig)
            )
            return

        # Every sample has a single length: describe it in text, no plot
        lengths = 'bp , '.join(map(str, seq_lengths))
        desc = 'All samples have sequences of a single length ({}bp).'.format(lengths)
        if len(seq_lengths) > 1:
            desc += ' See the <a href="#general_stats">General Statistics Table</a>.'
        self.add_section (
            name = 'Sequence Length Distribution',
            anchor = 'fastqc_sequence_length_distribution',
            description = '<div class="alert alert-info">{}</div>'.format(desc)
        )
Example #30
0
 def add_readlen_dist_plot(self):
     """Add a section with the read length distribution line graph.

     The series are read from ``self.skewer_readlen_dist``.
     """
     plot_settings = dict(
         id='skewer_read_length_histogram',
         title='Skewer: Read Length Distribution after trimming',
         xDecimals=False,
         ylab='% of Reads',
         xlab='Read Length',
         xmin=0,
         ymax=100,
         ymin=0,
         tt_label='<b>{point.x}</b>: {point.y:.1f}%',
     )
     readlen_plot = linegraph.plot(self.skewer_readlen_dist, plot_settings)
     self.add_section(plot = readlen_plot)
Example #31
0
	def linegraph(self, json):
		"""Build the overlapped-lengths line graph.

		For every key of ``json``, the entries of its "Ov_Histogram" list are
		read as pairs: element 0 becomes the x value, element 1 the y value.
		"""

		# plot configuration (title and axis labels only)
		config = {'title': "HTStream: Overlapped Lengths",
				  'ylab': "Counts", "xlab": "Overlap Lengths"}

		# one line per key, built from that key's histogram pairs
		multi_line = {
			key: {item[0]: item[1] for item in json[key]["Ov_Histogram"]}
			for key in json.keys()
		}

		return linegraph.plot(multi_line, config)
Example #32
0
    def bismark_mbias_plot(self):
        """Add the Bismark M-Bias section.

        The CpG, CHG and CHH series for R1 are always plotted; the R2 series
        are added when the CpG R2 data is non-empty.
        """

        description = '<p>This plot shows the average percentage methylation and coverage across reads. See the \n\
        <a href="https://rawgit.com/FelixKrueger/Bismark/master/Docs/Bismark_User_Guide.html#m-bias-plot" target="_blank">bismark user guide</a> \n\
        for more information on how these numbers are generated.</p>'

        meth = self.bismark_mbias_data["meth"]
        contexts = ("CpG", "CHG", "CHH")

        read_ends = ["R1"]
        datasets = [meth[ctx + "_R1"] for ctx in contexts]
        if len(meth["CpG_R2"]) > 0:
            read_ends.append("R2")
            datasets.extend(meth[ctx + "_R2"] for ctx in contexts)

        # One tab label per dataset, in the same order as `datasets`
        data_labels = [
            {"name": "{} {}".format(ctx, end), "ylab": "% Methylation", "ymax": 100}
            for end in read_ends
            for ctx in contexts
        ]

        pconfig = {
            "id": "bismark_mbias",
            "title": "Bismark: M-Bias",
            "ylab": "% Methylation",
            "xlab": "Position (bp)",
            "xDecimals": False,
            "ymax": 100,
            "ymin": 0,
            "tt_label": "<b>{point.x} bp</b>: {point.y:.1f}%",
            "data_labels": data_labels,
        }

        mbias_plot = linegraph.plot(datasets, pconfig)
        self.add_section(name="M-Bias", anchor="bismark-mbias", description=description, plot=mbias_plot)
Example #33
0
    def parse_groupreadsbyumi_plot(self):
        """Add the GroupReadsByUmi family-size section.

        Plots ``self.fgbio_umi_data`` (counts) and
        ``self.fgbio_umi_data_normed`` (percentages) as two switchable
        datasets.
        """
        data_labels = [
            {'name': 'Counts', 'ylab': 'Number of UMIs'},
            {'name': 'Percentages', 'ylab': 'Percentage of sample'},
        ]
        config = dict(
            id='fgbio-groupreadsbyumi-plot',
            title='fgbio: Family size count',
            ylab='Number of UMIs',
            xlab='Reads supporting UMI',
            xmax=15,
            xDecimals=False,
            data_labels=data_labels,
        )
        datasets = [self.fgbio_umi_data, self.fgbio_umi_data_normed]

        self.add_section(
            name='GroupReadsByUmi statistics',
            anchor='fgbio-groupreadsbyumi',
            description='''During `GroupReadsByUmi` processing, family size count data is generated,
                             showing number of UMIs represented by a certain number of reads.''',
            helptext='''
            This tool groups reads together that appear to have come from the same original molecule.
            Reads are grouped by template, and then templates are sorted by the 5' mapping positions
            of the reads from the template, used from earliest mapping position to latest.
            Reads that have the same end positions are then sub-grouped by UMI sequence.

            The histogram shows tag family size counts or percentages.
            ''',
            plot=linegraph.plot(datasets, config))
Example #34
0
def parse_reports(self):
    """Parse Picard QualityScoreDistribution reports and add their plot.

    Returns:
        The number of samples found; 0 when no report was parsed.
    """
    all_data = read_histogram(
        self,
        "picard/quality_score_distribution",
        "QualityScoreDistribution",
        ["QUALITY", "COUNT_OF_Q"],
        [int, int],
    )
    if not all_data:
        return 0

    # Write parsed data to a file
    self.write_data_file(all_data, "multiqc_picard_quality_score_distribution")

    # Per sample: quality score -> number of bases with that score
    counts_by_sample = {
        s_name: OrderedDict((qual, row["COUNT_OF_Q"]) for qual, row in histogram.items())
        for s_name, histogram in all_data.items()
    }

    plot_settings = dict(
        id="picard_quality_score_distribution",
        title="Picard: Base Quality Distribution",
        ylab="Number of Bases",
        xlab="Base Quality Score",
        xDecimals=False,
        tt_label="<b>base quality{point.x}</b>: {point.y}",
        ymin=0,
    )
    self.add_section(
        name="Base Quality Distribution",
        anchor="picard-quality-score-distribution",
        description="Plot shows the count of each base quality score.",
        plot=linegraph.plot([counts_by_sample], plot_settings),
    )

    # Return the number of detected samples to the parent module
    return len(all_data)
Example #35
0
def _add_target_bases(data):
    """Build the 'Target Region Coverage' section description.

    Every metric whose name starts with "PCT_TARGET" is converted to a point:
    the fold value parsed from the name (prefix "PCT_TARGET_BASES_" removed,
    last character dropped) against the metric multiplied by 100.

    Returns:
        A dict with 'name', 'anchor', 'description' and 'plot' entries.
    """
    subtitle = "The percentage of all target bases with at least <code>x</code> fold coverage."

    data_clean = defaultdict(dict)
    for sample, metrics in data.items():
        for header, value in metrics.items():
            if not header.startswith("PCT_TARGET"):
                continue
            percent = value * 100.0
            fold = int(header.replace("PCT_TARGET_BASES_", "")[:-1])
            data_clean[sample][fold] = percent

    pconfig = dict(
        id='picard_percentage_target_bases',
        title='Percentage of target bases',
        xlab='Fold Coverage',
        ylab='Pct of bases',
        ymax=100,
        ymin=0,
        xmin=0,
        tt_label='<b>{point.x}X</b>: {point.y:.2f}%',
    )
    return {
        'name': 'Target Region Coverage',
        'anchor': 'picard_hsmetrics_target_bases',
        'description': subtitle,
        'plot' : linegraph.plot(data_clean, pconfig)
    }
Example #36
0
    def n_content_plot (self):
        """ Create the HTML for the per base N content plot.

        Samples without usable 'per_base_n_content' data (a KeyError while
        reading it) are left out. When no sample has data, nothing is
        appended to ``self.sections`` and None is returned.
        """

        data = dict()
        for s_name in self.fastqc_data:
            try:
                data[s_name] = {self.avg_bp_from_range(d['base']): d['n-count'] for d in self.fastqc_data[s_name]['per_base_n_content']}
            except KeyError:
                # This sample has no N-content module output; skip it
                pass
        if not data:
            log.debug('per_base_n_content not found in FastQC reports')
            return None

        pconfig = {
            'id': 'fastqc_per_base_n_content_plot',
            'title': 'Per Base N Content',
            'ylab': 'Percentage N-Count',
            'xlab': 'Position in Read (bp)',
            'yCeiling': 100,
            'yMinRange': 5,
            'ymin': 0,
            'xmin': 0,
            'xDecimals': False,
            'colors': self.get_status_cols('per_base_n_content'),
            'tt_label': '<b>Base {point.x}</b>: {point.y:.2f}%',
            'yPlotBands': [
                {'from': 20, 'to': 100, 'color': '#e6c3c3'},
                {'from': 5, 'to': 20, 'color': '#e6dcc3'},
                {'from': 0, 'to': 5, 'color': '#c3e6c3'},
            ]
        }

        # The help link uses target="_blank"; the previous "_bkank" was a typo
        # that named an ordinary window instead of requesting a new one.
        self.sections.append({
            'name': 'Per Base N Content',
            'anchor': 'fastqc_per_base_n_content',
            'content': '<p>The percentage of base calls at each position for which an N was called. ' +
                        'See the <a href="http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/3%20Analysis%20Modules/6%20Per%20Base%20N%20Content.html" target="_blank">FastQC help</a>.</p>' +
                        linegraph.plot(data, pconfig)
        })
Example #37
0
    def seq_dup_levels_plot (self):
        """ Create the HTML for the Sequence Duplication Levels plot.

        For each sample the duplication levels listed in ``self.dup_keys``
        are kept, in that order; levels a sample lacks are omitted. Samples
        without usable 'sequence_duplication_levels' data are left out. When
        no sample has data, nothing is appended to ``self.sections`` and None
        is returned.
        """

        data = dict()
        for s_name in self.fastqc_data:
            try:
                levels = {row['duplication_level']: row['percentage_of_total'] for row in self.fastqc_data[s_name]['sequence_duplication_levels']}
            except KeyError:
                # This sample has no duplication module output; skip it
                continue
            data[s_name] = OrderedDict((k, levels[k]) for k in self.dup_keys if k in levels)
        if not data:
            # Name the module this method actually looks for
            log.debug('sequence_duplication_levels not found in FastQC reports')
            return None

        pconfig = {
            'id': 'fastqc_sequence_duplication_levels_plot',
            'title': 'Sequence Duplication Levels',
            'categories': True,
            'ylab': '% of Library',
            'xlab': 'Sequence Duplication Level',
            'ymax': 100,
            'ymin': 0,
            'yMinTickInterval': 0.1,
            'colors': self.get_status_cols('sequence_duplication_levels'),
            'tt_label': '<b>{point.x}</b>: {point.y:.1f}%',
        }

        # The help link uses target="_blank"; the previous "_bkank" was a typo
        # that named an ordinary window instead of requesting a new one.
        self.sections.append({
            'name': 'Sequence Duplication Levels',
            'anchor': 'fastqc_sequence_duplication_levels',
            'content': '<p>The relative level of duplication found for every sequence. ' +
                        'See the <a href="http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/3%20Analysis%20Modules/8%20Duplicate%20Sequences.html" target="_blank">FastQC help</a>.</p>' +
                        linegraph.plot(data, pconfig)
        })
Example #38
0
    def cutadapt_length_trimmed_plot(self):
        """Add the section showing lengths of trimmed sequences.

        Plots ``self.cutadapt_length_counts`` (raw counts) and
        ``self.cutadapt_length_obsexp`` (observed / expected) as two
        switchable datasets.
        """

        description = 'This plot shows the number of reads with certain lengths of adapter trimmed. \n\
        Obs/Exp shows the raw counts divided by the number expected due to sequencing errors. A defined peak \n\
        may be related to adapter length. See the \n\
        <a href="http://cutadapt.readthedocs.org/en/latest/guide.html#how-to-read-the-report" target="_blank">cutadapt documentation</a> \n\
        for more information on how these numbers are generated.'

        data_labels = [
            {'name': 'Counts', 'ylab': 'Count'},
            {'name': 'Obs/Exp', 'ylab': 'Observed / Expected'},
        ]
        pconfig = dict(
            id='cutadapt_plot',
            title='Cutadapt: Lengths of Trimmed Sequences',
            ylab='Counts',
            xlab='Length Trimmed (bp)',
            xDecimals=False,
            ymin=0,
            tt_label='<b>{point.x} bp trimmed</b>: {point.y:.0f}',
            data_labels=data_labels,
        )
        datasets = [self.cutadapt_length_counts, self.cutadapt_length_obsexp]

        self.add_section(
            description=description,
            plot=linegraph.plot(datasets, pconfig))
def parse_reports(self):
    """Parse Picard QualityScoreDistribution reports and add their plot.

    Returns:
        The number of samples found; 0 when no report was parsed.
    """
    all_data = read_histogram(
        self,
        'picard/quality_score_distribution',
        'QualityScoreDistribution',
        ['QUALITY', 'COUNT_OF_Q'],
        [int, int],
    )
    if not all_data:
        return 0

    # Write parsed data to a file
    self.write_data_file(all_data, 'multiqc_picard_quality_score_distribution')

    # Per sample: quality score -> number of bases with that score
    counts_by_sample = {
        s_name: OrderedDict((qual, row['COUNT_OF_Q']) for qual, row in histogram.items())
        for s_name, histogram in all_data.items()
    }

    plot_settings = dict(
        id='picard_quality_score_distribution',
        title='Picard: Base Quality Distribution',
        ylab='Number of Bases',
        xlab='Base Quality Score',
        xDecimals=False,
        tt_label='<b>base quality{point.x}</b>: {point.y}',
        ymin=0,
    )
    self.add_section(
        name='Base Quality Distribution',
        anchor='picard-quality-score-distribution',
        description='Plot shows the count of each base quality score.',
        plot=linegraph.plot([counts_by_sample], plot_settings))

    # Return the number of detected samples to the parent module
    return len(all_data)
Example #40
0
    def add_rna_transcript_coverage(self):
        """Parse RNA transcript coverage files and add their line graph.

        A later file with an already-seen sample name replaces the earlier
        data (a debug message is logged).

        Returns:
            An empty set when no sample remains after filtering, otherwise
            the keys view of the per-sample data.
        """
        data_by_sample = defaultdict(dict)

        for f in self.find_log_files("dragen/rna_transcript_cov"):
            parsed = parse_rna_transcript_cov(f)
            sample = f["s_name"]
            if sample in data_by_sample:
                log.debug(
                    "Duplicate sample name found! Overwriting: {}".format(
                        f["s_name"]))
            self.add_data_source(f, section="stats")
            data_by_sample[sample] = parsed

        # Filter to strip out ignored sample names:
        data_by_sample = self.ignore_samples(data_by_sample)
        if not data_by_sample:
            return set()

        plot_settings = {
            "id": "dragen_rna_transcript_cov",
            "title": "Dragen: RNA Transcript Coverage",
            "ylab": "Average coverage",
            "xlab": "Transcript position",
            "categories": True,
            "tt_label": "<b>{point.x}</b>: {point.y:.1f}x",
        }
        self.add_section(
            name="RNA Transcript Coverage",
            anchor="rna-transcript-cov",
            description="""
            RNA transcript coverage.  This is the average coverage at the position along the transcripts.
            """,
            plot=linegraph.plot(data_by_sample, pconfig=plot_settings),
        )

        return data_by_sample.keys()
Example #41
0
    def quant_sf(self):
        """
        Collect the QuantSF ratios per sample and add their line graph.

        For every 'salmon/fld' file whose root directory is named
        'libParams', a QuantSFModel is loaded from the path with its last 10
        characters removed, and its ``ratios`` are stored in
        ``self.salmon_quant_sf`` keyed by the absolute root path, indexed by
        position. A section is added only when at least one sample was found.
        """

        for f in self.find_log_files('salmon/fld'):
            if os.path.basename(f['root']) != 'libParams':
                continue

            s_name = os.path.abspath(f['root'])
            # NOTE(review): the 10 characters dropped presumably are the
            # trailing "/libParams" component - confirm against QuantSFModel.
            model_dir = s_name[:-10]

            quantSFModel = QuantSFModel()
            quantSFModel.from_file(model_dir)
            ratios = quantSFModel.ratios

            self.salmon_quant_sf[s_name] = OrderedDict(
                (i, ratios[i]) for i in range(len(ratios))
            )
            self.add_data_source(f, s_name)

        if not self.salmon_quant_sf:
            return

        pconfig_quant = dict(
            smooth_points=500,
            id='salmon_plot',
            title='Salmon: QuantSF Distribution',
            ylab='Ratio of Actual to Effective Transcript Length',
            xlab='Bins',
            ymin=0,
            xmin=0,
            tt_label='<b>{point.x:,.0f} </b>: {point.y:,.3f}',
        )
        self.add_section(plot = linegraph.plot(self.salmon_quant_sf, pconfig_quant))
 def FreqDist_chart(self):
     """ Make the petag.FreqDistribution_1000 plot.

     The x values of ``self.tagdir_data['FreqDistribution']`` are replaced by
     their base-10 logarithm before plotting. Points whose x value cannot be
     converted (non-numeric, zero or negative) are dropped.

     Returns:
         The result of ``linegraph.plot`` for the transformed data.
     """
     # Take a log of the data before plotting so that we can
     # reduce the number of points to plot evenly
     pdata = {}
     for s_name, distribution in self.tagdir_data['FreqDistribution'].items():
         pdata[s_name] = {}
         for x, y in distribution.items():
             try:
                 # Base 10, so the values match the 'Log10(...)' axis label
                 # (math.log would give the natural logarithm instead).
                 pdata[s_name][math.log10(float(x))] = y
             except ValueError:
                 # float() failed or the logarithm is undefined: skip point
                 pass
     pconfig = {
         'id': 'FreqDistribution',
         'title': 'Frequency Distribution',
         'ylab': 'Fraction of Reads',
         'xlab': 'Log10(Distance between regions)',
         'data_labels': ['Reads', 'Percent'],
         'smooth_points': 500,
         'smooth_points_sumcounts': False,
         'yLog': True
     }
     return linegraph.plot(pdata, pconfig)
Example #43
0
def hs_penalty_plot(data):
    """Build the hybrid selection penalty line graph.

    Every metric whose name starts with "HS_PENALTY" becomes a point: the
    fold coverage parsed from the name (e.g. "HS_PENALTY_10X" -> 10) against
    the metric value.

    Args:
        data: Mapping of sample name to a mapping of metric name to value.

    Returns:
        The result of ``linegraph.plot``, or None when no penalty value is
        greater than zero.
    """
    prefix = 'HS_PENALTY'
    data_clean = defaultdict(dict)
    any_non_zero = False
    for s in data:
        for h in data[s]:
            if h.startswith(prefix):
                # Remove the prefix by length: str.lstrip() takes a set of
                # characters, not a prefix, so lstrip('HS_PENALTY_') only
                # worked because the remainder starts with a digit.
                fold = h[len(prefix):].lstrip('_').rstrip('X')
                data_clean[s][int(fold)] = data[s][h]
                if data[s][h] > 0:
                    any_non_zero = True

    pconfig = {
        'id': 'picard_hybrid_selection_penalty',
        'title': 'Picard: Hybrid Selection Penalty',
        'xlab': 'Fold Coverage',
        'ylab': 'Penalty',
        'ymin': 0,
        'xmin': 0,
        'xDecimals': False,
        'tt_label': '<b>{point.x}X</b>: {point.y:.2f}%'
    }

    # An all-zero plot carries no information, so nothing is returned
    if any_non_zero:
        return linegraph.plot(data_clean, pconfig)
Example #44
0
    def rsem_multimapping_plot(self):
        """Add the section with the multimapping rates line graph.

        The series are read from ``self.rsem_multimapping_data``.
        """
        plot_settings = dict(
            id="rsem_multimapping_rates",
            title="RSEM: Multimapping Rates",
            ylab="Counts",
            xlab="Number of alignments",
            xDecimals=False,
            ymin=0,
            tt_label="<b>{point.x} alignments</b>: {point.y:.0f}",
        )
        multimapping_plot = linegraph.plot(self.rsem_multimapping_data, plot_settings)

        self.add_section(
            name="Multimapping rates",
            anchor="rsem_multimapping",
            description="A frequency histogram showing how many reads were aligned to `n` reference regions.",
            helptext="""In an ideal world, every sequence reads would align uniquely to a single location in the
                reference. However, due to factors such as repeititve sequences, short reads and sequencing errors,
                reads can be align to the reference 0, 1 or more times. This plot shows the frequency of each factor
                of multimapping. Good samples should have the majority of reads aligning once.""",
            plot=multimapping_plot,
        )
Example #45
0
def hs_penalty_plot(data):
    """Build the Picard "Hybrid Selection Penalty" line plot.

    Args:
        data: mapping of sample name -> mapping of metric name -> value.
            Only metrics named ``HS_PENALTY_<n>X`` are used.

    Returns:
        The result of ``linegraph.plot`` with one series per sample
        (x = fold coverage ``n``, y = penalty), or ``None`` when no penalty
        value is greater than zero.
    """
    prefix = "HS_PENALTY_"
    data_clean = defaultdict(dict)
    any_non_zero = False
    for s_name, metrics in data.items():
        for header, value in metrics.items():
            if not header.startswith(prefix):
                continue
            # Cut the prefix off by length: str.lstrip() removes a *set of
            # characters*, not a prefix, so it only worked here by coincidence.
            fold_coverage = int(header[len(prefix):].rstrip("X"))
            data_clean[s_name][fold_coverage] = value
            if value > 0:
                any_non_zero = True

    pconfig = {
        "id": "picard_hybrid_selection_penalty",
        "title": "Picard: Hybrid Selection Penalty",
        "xlab": "Fold Coverage",
        "ylab": "Penalty",
        "ymin": 0,
        "xmin": 0,
        "xDecimals": False,
        "tt_label": "<b>{point.x}X</b>: {point.y:.2f}%",
    }

    # A plot where every penalty is zero carries no information
    if not any_non_zero:
        return None
    return linegraph.plot(data_clean, pconfig)
Example #46
0
def _add_target_bases(data):
    """Assemble the "Target Region Coverage" section description.

    For each sample in ``data``, every metric whose name starts with
    ``PCT_TARGET`` is keyed by the integer left after removing
    ``PCT_TARGET_BASES_`` and the last character of the name, and its value
    is multiplied by 100.

    Returns a dict with the section ``name``, ``anchor``, ``description`` and
    the ``plot`` produced by ``linegraph.plot``.
    """
    pct_by_sample = defaultdict(dict)
    for sample in data:
        metrics = data[sample]
        for header in metrics:
            if not header.startswith("PCT_TARGET"):
                continue
            percentage = metrics[header] * 100.0
            fold = int(header.replace("PCT_TARGET_BASES_", "")[:-1])
            pct_by_sample[sample][fold] = percentage

    plot_settings = dict(
        id="picard_percentage_target_bases",
        title="Picard: Percentage of target bases",
        xlab="Fold Coverage",
        ylab="Pct of bases",
        ymax=100,
        ymin=0,
        xmin=0,
        tt_label="<b>{point.x}X</b>: {point.y:.2f}%",
    )

    section = {}
    section["name"] = "Target Region Coverage"
    section["anchor"] = "picard_hsmetrics_target_bases"
    section["description"] = "The percentage of all target bases with at least <code>x</code> fold coverage."
    section["plot"] = linegraph.plot(pct_by_sample, plot_settings)
    return section
Example #47
0
    def rsem_multimapping_plot(self):
        """Add the 'Multimapping rates' section to the report.

        Draws ``self.rsem_multimapping_data`` as a line graph and registers
        it, together with its description and help text, via
        ``self.add_section``.
        """
        # Axis labels, limits and tooltip template for the line graph
        plot_settings = dict(
            id='rsem_multimapping_rates',
            title='RSEM: Multimapping Rates',
            ylab='Counts',
            xlab='Number of alignments',
            xDecimals=False,
            ymin=0,
            tt_label='<b>{point.x} alignments</b>: {point.y:.0f}',
        )
        multimapping_graph = linegraph.plot(self.rsem_multimapping_data, plot_settings)

        section_help = '''In an ideal world, every sequence reads would align uniquely to a single location in the
                reference. However, due to factors such as repeititve sequences, short reads and sequencing errors,
                reads can be align to the reference 0, 1 or more times. This plot shows the frequency of each factor
                of multimapping. Good samples should have the majority of reads aligning once.'''

        self.add_section(
            name='Multimapping rates',
            anchor='rsem_multimapping',
            description='A frequency histogram showing how many reads were aligned to `n` reference regions.',
            helptext=section_help,
            plot=multimapping_graph,
        )
Example #48
0
 def FreqDist_chart(self):
     """Make the petag.FreqDistribution_1000 plot.

     The x values of ``self.tagdir_data["FreqDistribution"]`` are converted
     to log10 before plotting, which matches the "Log10(...)" x-axis label.
     Points whose x value is non-numeric, zero or negative have no
     logarithm and are skipped.

     Returns the result of ``linegraph.plot``.
     """
     # Plot on a log scale so that the requested number of smoothed points
     # is spread evenly along the axis
     pdata = {}
     for s_name, distribution in self.tagdir_data["FreqDistribution"].items():
         pdata[s_name] = {}
         for x, y in distribution.items():
             try:
                 # log10, not the natural log: the axis is labelled Log10
                 pdata[s_name][math.log10(float(x))] = y
             except ValueError:
                 # float() failed or x <= 0: drop the point
                 continue
     pconfig = {
         "id": "FreqDistribution",
         "title": "HOMER: Frequency Distribution",
         "ylab": "Fraction of Reads",
         "xlab": "Log10(Distance between regions)",
         "data_labels": ["Reads", "Percent"],
         "smooth_points": 500,
         "smooth_points_sumcounts": False,
         "yLog": True,
     }
     return linegraph.plot(pdata, pconfig)
Example #49
0
    def per_seq_quality_plot(self):
        """Create the HTML for the per sequence quality score plot.

        For every sample in ``self.fastqc_data`` that has a
        ``per_sequence_quality_scores`` entry, builds a mapping of
        quality -> count. Samples without that entry are skipped.

        Appends a section dict (name, anchor, content) to ``self.sections``.
        Returns ``None``; when no sample has the data, nothing is appended.
        """
        data = dict()
        for s_name in self.fastqc_data:
            try:
                data[s_name] = {d['quality']: d['count'] for d in self.fastqc_data[s_name]['per_sequence_quality_scores']}
            except KeyError:
                # This sample's report has no per-sequence quality module
                pass
        if len(data) == 0:
            log.debug('per_seq_quality not found in FastQC reports')
            return None

        pconfig = {
            'id': 'fastqc_per_sequence_quality_scores_plot',
            'title': 'Per Sequence Quality Scores',
            'ylab': 'Count',
            'xlab': 'Mean Sequence Quality (Phred Score)',
            'ymin': 0,
            'xmin': 0,
            'xDecimals': False,
            'colors': self.get_status_cols('per_sequence_quality_scores'),
            'tt_label': '<b>Phred {point.x}</b>: {point.y} reads',
            # Background bands along the x axis
            'xPlotBands': [
                {'from': 28, 'to': 100, 'color': '#c3e6c3'},
                {'from': 20, 'to': 28, 'color': '#e6dcc3'},
                {'from': 0, 'to': 20, 'color': '#e6c3c3'},
            ]
        }
        # target="_blank" (previously misspelled "_bkank") opens the help page in a new tab
        self.sections.append({
            'name': 'Per Sequence Quality Scores',
            'anchor': 'fastqc_per_sequence_quality_scores',
            'content': '<p>The number of reads with average quality scores. Shows if a subset of reads has poor quality. ' +
                        'See the <a href="http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/3%20Analysis%20Modules/3%20Per%20Sequence%20Quality%20Scores.html" target="_blank">FastQC help</a>.</p>' +
                        linegraph.plot(data, pconfig)
        })
Example #50
0
    def lgdistplot(self, dict_to_use, orientation):
        """Build the read length distribution line graph for one orientation.

        ``dict_to_use`` maps sample name -> mapping whose keys and values are
        both converted with ``int`` (x = key, y = value). ``orientation`` is
        inserted into the plot id and title.

        Returns the result of ``linegraph.plot``, or ``None`` when no sample
        data was collected.
        """
        lengths_by_sample = {}
        for sample in dict_to_use:
            try:
                histogram = dict_to_use[sample]
                lengths_by_sample[sample] = {
                    int(length): int(histogram[length]) for length in histogram
                }
            except KeyError:
                continue

        if not lengths_by_sample:
            log.debug('No valid data for forward read lgdist input!')
            return None

        plot_settings = {
            'id': 'length-distribution-{}'.format(orientation),
            'title': 'DamageProfiler: Read length distribution: {} '.format(orientation),
            'ylab': 'Number of reads',
            'xlab': 'Readlength (bp)',
            'xDecimals': False,
            'tt_label': '{point.y} reads of length {point.x}',
            'ymin': 0,
            'xmin': 0,
        }
        return linegraph.plot(lengths_by_sample, plot_settings)
    def parse_plotEnrichment(self):
        """Collect plotEnrichment output and add its report section.

        Every file found for 'deeptools/plotEnrichment' is parsed with
        ``self.parsePlotEnrichment``; the per-sample results are stored in
        ``self.deeptools_plotEnrichment`` (a later duplicate replaces an
        earlier one, with a warning). When any sample was found, a line graph
        with a counts dataset and a percents dataset is added as a section.

        Returns the number of samples stored.
        """
        self.deeptools_plotEnrichment = dict()
        enrichment = self.deeptools_plotEnrichment

        for log_file in self.find_log_files('deeptools/plotEnrichment'):
            parsed = self.parsePlotEnrichment(log_file)
            for sample_name, features in parsed.items():
                if sample_name in enrichment:
                    log.warning("Replacing duplicate sample {}.".format(sample_name))
                enrichment[sample_name] = features

            if parsed:
                self.add_data_source(log_file, section='plotEnrichment')

        # Nothing parsed: report zero samples and add no section
        if not enrichment:
            return len(enrichment)

        counts = OrderedDict()
        percents = OrderedDict()
        for sample_name, features in enrichment.items():
            counts[sample_name] = OrderedDict()
            percents[sample_name] = OrderedDict()
            for feature, stats in features.items():
                counts[sample_name][feature] = stats['count']
                percents[sample_name][feature] = stats['percent']

        plot_settings = {
            'data_labels': [
                {'name': 'Counts in features', 'ylab': 'Counts in feature'},
                {'name': 'Percents in features', 'ylab': 'Percent of reads in feature'},
            ],
            'id': 'plotEnrichment',
            'title': 'Signal enrichment per feature',
            'ylab': 'Counts in feature',
            'categories': True,
            'ymin': 0.0,
        }
        enrichment_graph = linegraph.plot([counts, percents], pconfig=plot_settings)
        self.add_section(
            name="Signal enrichment per feature",
            description="Signal enrichment per feature according to plotEnrichment",
            anchor="plotEnrichment",
            plot=enrichment_graph,
        )

        return len(enrichment)
Example #52
0
    def chart_retention_cpg_readpos(self):
        """Add the 'Retention vs. Base Position in Read' section.

        Builds four datasets from ``self.mdata``: the '1' and '2' entries of
        'retention_cph_readpos', followed by the '1' and '2' entries of
        'retention_cpg_readpos', and plots them as one multi-dataset line
        graph.
        """
        # Dataset order: CpH read 1, CpH read 2, CpG read 1, CpG read 2
        datasets = [
            {key: per_read[read_no] for key, per_read in self.mdata[table].items()}
            for table in ('retention_cph_readpos', 'retention_cpg_readpos')
            for read_no in ('1', '2')
        ]

        # (button name, y-axis label) for each dataset, in the same order
        label_specs = [
            ('CpH Read 1', 'CpH Retention Rate (%)'),
            ('CpH Read 2', 'CpH Retention Rate (%)'),
            ('CpG Read 1', 'CpG Retention Rate (%)'),
            ('CpG Read 2', 'CpG Retention Rate (%)'),
        ]
        plot_settings = {
            'id': 'biscuit_retention_cytosine',
            'xlab': 'Position in Read',
            'ymin': 0,
            'ymax': 100,
            'yMinRange': 0,
            'yFloor': 0,
            'title': 'BISCUIT: Retention vs. Base Position in Read',
            'data_labels': [
                {'name': name, 'ylab': ylab, 'ymin': 0, 'ymax': 100}
                for name, ylab in label_specs
            ],
        }

        retention_graph = linegraph.plot(datasets, plot_settings)
        self.add_section(
            name='Retention vs. Base Position in Read',
            anchor='biscuit-retention-cytosine',
            description="This plot (aka. mbias plot) shows the distribution of cytosine retention rate in read.",
            plot=retention_graph,
        )
Example #53
0
    def mirtrace_complexity_plot(self):
        """Build the miRTrace miRNA complexity line graph.

        For each sample in ``self.complexity_data`` the stored mapping is
        inverted: its values (as ``int``) become the x coordinates and its
        keys (as ``int``) the y coordinates.

        Returns the result of ``linegraph.plot``, or ``None`` when no sample
        data was collected.
        """
        curves = {}
        for sample in self.complexity_data:
            try:
                points = self.complexity_data[sample]
                curves[sample] = {int(points[key]): int(key) for key in points}
            except KeyError:
                continue

        if not curves:
            log.debug("No valid data for miRNA complexity")
            return None

        plot_settings = {
            "id": "mirtrace_complexity_plot",
            "title": "miRTrace: miRNA Complexity Plot",
            "ylab": "Distinct miRNA Count",
            "xlab": "Number of Sequencing Reads",
            "ymin": 0,
            "xmin": 1,
            "xDecimals": False,
            "tt_label": "<b>Number of Sequencing Reads {point.x}</b>: {point.y} Distinct miRNA Count",
        }
        return linegraph.plot(curves, plot_settings)
Example #54
0
def plot_qchist(samples, file_type, **plot_args):
    """Draw BBMap 'qchist' histogram data as a line graph.

    ``samples`` may be taken from the bbmap mod_data dictionary, e.g.
    ``bbmap.MultiqcModule.mod_data[file_type]``. Only the first column of
    each histogram row is plotted. ``plot_args`` must provide "plot_title"
    and "plot_params"; the latter is merged over the default plot settings.
    """
    # Grand total of the first column over every sample and every x value
    grand_total = sum(
        int(samples[name]["data"][x][0]) for name in samples for x in samples[name]["data"]
    )

    # Walk the pooled rows in ascending order and stop once 99.9% of the
    # grand total has been consumed; only the x values visited are plotted.
    remaining = grand_total * 0.999
    visited_x = set()
    pooled_rows = sorted(chain.from_iterable(samples[name]["data"].items() for name in samples))
    for x_value, columns in pooled_rows:
        visited_x.add(x_value)
        remaining -= columns[0]
        if remaining < 0:
            xmax = x_value
            break
    else:
        xmax = max(visited_x)

    data = {}
    for name in samples:
        histogram = samples[name]["data"]
        series = {}
        for x_value in visited_x:
            count = histogram[x_value][0] if x_value in histogram else 0
            # A zero count becomes 0.1 so the series is not broken on a log axis
            series[x_value] = count + 0.1 if count == 0 else count
        data[name] = series

    plot_params = {
        "id": "bbmap-" + file_type + "_plot",
        "title": "BBTools: " + plot_args["plot_title"],
        "xmax": xmax,
    }
    plot_params.update(plot_args["plot_params"])
    return linegraph.plot(data, plot_params)
Example #55
0
 def preseq_length_trimmed_plot (self):
     """Generate the preseq complexity curve plot.

     Plots ``self.preseq_data`` with an extra dashed "x = y" reference line
     running from (0, 0) to (``self.total_max``, ``self.total_max``). The
     axis labels are built from ``self.axis_label``.

     Returns an introductory HTML paragraph concatenated with the output of
     ``linegraph.plot``.
     """
     pconfig = {
         'id': 'preseq_plot',
         'title': 'Preseq complexity curve',
         'ylab': 'Unique {}'.format(self.axis_label),
         'xlab': 'Total {} (including duplicates)'.format(self.axis_label),
         'ymin': 0,
         'xmin': 0,
         'tt_label': '<b>{point.x:,.0f} total</b>: {point.y:,.0f} unique',
         # Reference diagonal: every read is unique
         'extra_series': [{
             'name': 'x = y',
             'data': [[0, 0], [self.total_max, self.total_max]],
             'dashStyle': 'Dash',
             'lineWidth': 1,
             'color': '#000000',
             'marker': { 'enabled': False },
             'enableMouseTracking': False,
             'showInLegend': False,
         }]
     }
     # The paragraph is closed with </p> (it was previously the invalid tag </o>)
     return "<p>A shallow curve indicates complexity saturation. The dashed line \
             shows a perfectly complex library where total reads = unique reads.</p>" \
              + linegraph.plot(self.preseq_data, pconfig)
Example #56
0
    def mirtrace_complexity_plot(self):
        """Build the miRTrace miRNA complexity line graph.

        For each sample in ``self.complexity_data`` the stored mapping is
        inverted: its values (as ``int``) become the x coordinates and its
        keys (as ``int``) the y coordinates.

        Returns the result of ``linegraph.plot``, or ``None`` when no sample
        data was collected.
        """
        curves = {}
        for sample in self.complexity_data:
            try:
                points = self.complexity_data[sample]
                curves[sample] = {int(points[key]): int(key) for key in points}
            except KeyError:
                continue

        if not curves:
            log.debug('No valid data for miRNA complexity')
            return None

        plot_settings = {
            'id': 'mirtrace_complexity_plot',
            'title': 'miRTrace: miRNA Complexity Plot',
            'ylab': 'Distinct miRNA Count',
            'xlab': 'Number of Sequencing Reads',
            'ymin': 0,
            'xmin': 1,
            'xDecimals': False,
            'tt_label': '<b>Number of Sequencing Reads {point.x}</b>: {point.y} Distinct miRNA Count',
        }
        return linegraph.plot(curves, plot_settings)
Example #57
0
    def lgdistplot(self, dict_to_use, orientation):
        """Build the read length distribution line graph for one orientation.

        ``dict_to_use`` maps sample name -> mapping whose keys and values are
        both converted with ``int`` (x = key, y = value). ``orientation`` is
        inserted into the plot id and title.

        Returns the result of ``linegraph.plot``, or ``None`` when no sample
        data was collected.
        """
        lengths_by_sample = {}
        for sample in dict_to_use:
            try:
                histogram = dict_to_use[sample]
                lengths_by_sample[sample] = {
                    int(length): int(histogram[length]) for length in histogram
                }
            except KeyError:
                continue

        if not lengths_by_sample:
            log.debug("No valid data for forward read lgdist input!")
            return None

        plot_settings = {
            "id": "length-distribution-{}".format(orientation),
            "title": "DamageProfiler: Read length distribution - {} ".format(orientation),
            "ylab": "Number of reads",
            "xlab": "Readlength (bp)",
            "xDecimals": False,
            "tt_label": "{point.y} reads of length {point.x}",
            "ymin": 0,
            "xmin": 0,
        }
        return linegraph.plot(lengths_by_sample, plot_settings)
Example #58
0
	def quality_by_cycle(self, json, read):
		"""Build the HTML for the "Quality by Cycle" figure of one read type.

		For every sample key in ``json`` this computes the mean Q score per
		cycle (line graph), a per-cycle Q score frequency matrix (heatmap)
		and a PASS / QUESTIONABLE / FAIL status, then hands everything to
		``htstream_utils.qual_by_cycle_html``.

		Parameters:
			json: mapping of sample key -> mapping that, under ``read``, holds
				the entries "col_names", "row_names", "shape" and "data".
			read: key selecting the read type inside each sample entry; it is
				also split on "_" to build the plot titles and element ids.

		Returns:
			The value returned by ``htstream_utils.qual_by_cycle_html``.
		"""

		# Here is the most complicated figure implementation in this whole module.
		#	The issues here are that MultiQC had limited options for displaying
		#	multiple figures if it's a heatmap. Also, it doesn't allow you to switch
		#	back and forth between figure types. There are workarounds, however, using
		#	javascript and some clever organization of javascript.

		# words 2 and 3 of the underscore-separated read key, used in the titles
		title_read = " ".join(read.split("_")[1:3])

		# config dictionary for mean Q score line graph
		line_config = {
				  'smooth_points_sumcounts': False,
				  'categories': True,
				  'title': "HTStream: Mean Quality by Cycle (" + title_read + ")",
				  'xlab': "Cycle",
				  'ylab': "Mean Q Score",
				  }

		# config dictionary for heatmaps; 'id' is filled in per sample inside the loop
		heat_pconfig = {'id' : "",
				   'title': "HTStream: Quality by Cycle (" + title_read + ")",
				   'yTitle': 'Q Score',
				   'xTitle': 'Cycle',
				   'square' : False,
				   'datalabels': False,
				   'max': 1.0, 
				   'colstops': [
					        [0, '#FFFFFF'],
					        [0.3, '#1DC802'],
					        [0.6, '#F3F943'],
					        [1, '#E70808']
					           ]
    			  }

		# first three words of the read key, lower-cased and dash-joined; prefix for element ids
		btn_id = "-".join(read.split("_")[:3]).lower()

		line_data = {}
		status_dict = {}
		first = True
		button_list = []


		for key in json.keys():

			# create dictionary for line graph. Again, format is {x: y}
			line_data[key] = {}

			# creates a unique heatmap id that can be queried later by js.
			heat_pconfig["id"] = "htstream_" + btn_id + "_" + key + "_heatmap"

			# creates x and y axis labels for heatmap (categorical)
			x_lab = json[key][read]["col_names"]
			y_lab = json[key][read]["row_names"][::-1] # reversed copy; re-reversed below when paired with the data rows

			data = []

			# create variables for range functions in loops. Represents shape of data
			quality_scores = json[key][read]["shape"][0]
			cycles = json[key][read]["shape"][-1]


			# per-position totals, filled in by the loop below
			total = []
			
			# iterates through positions, creates a list of the sum of counts at each position to be used
			#	to calculate the frequencies for the heatmap. Also calculates the mean Q score for the line graph.

			# number of positions whose mean Q score is above 30
			num_above_q30 = 0

			for pos in range(cycles):
				# one value per data row at this position
				temp = [ score_list[pos] for score_list in json[key][read]["data"] ]
				temp_sum = sum(temp)
				total.append(temp_sum)

				# multiplies the count at this position by the Q score (row name) of its row.
				total_score = sum([(int(p) * int(s)) for p, s in zip(temp, y_lab[::-1])])

				# weighted sum divided by the total count at this position gives the mean Q score
				# NOTE(review): raises ZeroDivisionError if a position has a total of 0 - confirm this cannot occur
				line_data[key][pos] = total_score / temp_sum # total reads

				if line_data[key][pos] > 30:
					num_above_q30 += 1


			# fraction of positions whose mean Q score is above 30 (strictly greater)
			q30_gate = (num_above_q30 / cycles) 

			# below 0.6 -> FAIL, below 0.8 -> QUESTIONABLE, otherwise PASS
			if q30_gate < 0.6:
				status_dict[key] = "FAIL"

			elif q30_gate < 0.8:
				status_dict[key] = "QUESTIONABLE"

			else:
				status_dict[key] = 'PASS'


			# populates the heatmap matrix, walking the data rows from last to first
			for score in range(quality_scores - 1, -1, -1):

				# create empty list for this row. Each value is the count at that position
				#	divided by the total count for that position, giving a frequency.
				data.append([])

				for pos in range(cycles):
					data[-1].append(json[key][read]["data"][score][pos] / total[pos])


			# the first sample processed is shown first and its button is marked as active.
			#	Only its heatmap HTML is kept; for the remaining samples heatmap.plot is called
			#	and its return value discarded (per the original author, to register the data with MultiQC).
			# NOTE(review): heatmap_html is only assigned here, so an empty `json` would make the
			#	final qual_by_cycle_html call fail on an unbound name - confirm callers never pass one.
			if first == True:
				active = "active" # button is default active
				first = False # shuts off the first-sample gate
				heatmap_html = heatmap.plot(data, x_lab, y_lab, heat_pconfig)

			else:
				active = "" # button is default off 
				heatmap.plot(data, x_lab, y_lab, heat_pconfig)


			# html button attributes and text
			name = key
			pid = "htstream_" + btn_id + "_" + key + "_btn"

			button_list.append('<button class="btn btn-default btn-sm {a}" onclick="htstream_div_switch(this)" id="{pid}">{n}</button>\n'.format(a=active, pid=pid, n=name))

	
		# HTML summarising the per-sample status
		status_div = htstream_utils.sample_status(status_dict)

		line_plot = linegraph.plot(line_data, line_config)

		html = htstream_utils.qual_by_cycle_html(read, status_div, line_plot, btn_id, button_list, heatmap_html)

		return html
Example #59
0
	def base_by_cycle(self, json, read):
		"""Build the HTML for the "Base by Cycle" section of one read type.

		For every sample key in ``json`` the first five rows of
		``json[key][read]["data"]`` are turned into per-position percentages
		for A, C, G, T and N. The highest percentage seen in a sample decides
		its status: >= 60 is FAIL, >= 40 is QUESTIONABLE, otherwise PASS.

		Returns a header, the sample status HTML and the line graph HTML
		concatenated into one string.
		"""
		nucleotides = ("A", "C", "G", "T", "N")
		read_label = " ".join(read.split("_")[1:3])

		# settings shared by the whole multi-dataset line graph
		plot_config = {
			'title': "HTStream: Base by Cycle (" + read_label + ")",
			'data_labels': [],
			'smooth_points_sumcounts': False,
			'yCeiling': 100,
			'categories': True,
			'colors': {
				"A": "#B62612",
				"C": "#82A7E0",
				"G": "#0B8E0B",
				"T": "#DE7D00",
				"N": "black",
			},
			'yPlotBands': [
				{'from': 0, 'to': 40, 'color': '#c3e6c3'},
				{'from': 40, 'to': 60, 'color': '#e6dcc3'},
				{'from': 60, 'to': 100, 'color': '#e6c3c3'},
			],
		}

		per_sample_series = []
		sample_flags = {}

		for sample in json:
			read_stats = json[sample][read]
			counts = read_stats["data"]
			cycle_names = read_stats["col_names"]

			# one line per nucleotide, each mapping position -> percentage
			series = {base: {} for base in nucleotides}

			# highest percentage reached by any nucleotide in this sample
			peak = 0

			for cycle in range(len(cycle_names)):
				column = [counts[row][cycle] for row in range(5)]
				total = column[0] + column[1] + column[2] + column[3] + column[4]
				percents = [(value / total) * 100 for value in column]

				peak = max(peak, max(percents))

				for base, pct in zip(nucleotides, percents):
					series[base][cycle] = pct

			# a dominant nucleotide marks a region of low complexity
			sample_flags[sample] = 'FAIL' if peak >= 60 else ('QUESTIONABLE' if peak >= 40 else 'PASS')

			# settings for this sample's dataset within the multi-dataset graph
			plot_config["data_labels"].append({
				'name': sample,
				'ylab': 'Percentage',
				'xlab': 'Cycle',
				'yCeiling': 100,
				'categories': True,
				'smooth_points_sumcounts': False,
			})

			per_sample_series.append(series)

		header_html = '<h4> Base by Cycle: ' + read_label + '</h4>'
		status_html = htstream_utils.sample_status(sample_flags)
		graph_html = linegraph.plot(per_sample_series, plot_config)

		return header_html + status_html + graph_html
Example #60
0
    def add_readlen_data(self):
        """Add ngsderive read length results to the general stats and report.

        Reads ``self.readlen`` (sample -> mapping with the keys "Evidence",
        "MajorityPctDetected" and "ConsensusReadLength"), adds two columns to
        the general statistics table and a "Read length" section whose line
        graph has a percentage dataset and a count dataset.
        """
        data = {}
        for sample, readlen in self.readlen.items():
            data[sample] = {
                "evidence": readlen.get("Evidence"),
                # stored as a fraction, shown as a percentage with 2 decimals
                "majoritypctdetected": round(float(readlen.get("MajorityPctDetected")) * 100.0, 2),
                "consensusreadlength": int(readlen.get("ConsensusReadLength")),
            }

        headers = OrderedDict()
        headers["consensusreadlength"] = {
            "title": "Read Length (bp)",
            "description": "Predicted read length from ngsderive.",
            # Integer with thousands separator. The previous "{:,.d}" is not a
            # valid format spec (a "." must be followed by a precision) and
            # raises ValueError when applied to a value.
            "format": "{:,d}",
        }
        headers["majoritypctdetected"] = {
            "title": "Read Length: % Supporting",
            "description": "Percentage of reads which were measured at the predicted read length.",
            "min": 0,
            "max": 100,
            "suffix": "%",
            "hidden": True,
        }
        self.general_stats_addcols(data, headers)

        # linedata[0] holds percentages, linedata[1] raw counts; the order
        # matches "data_labels" in pconfig below
        linedata = [{}, {}]

        for sample, d in data.items():
            # Build dict of count data from "length=count;length=count;..." evidence
            count_data = {}
            for parts in d.get("evidence").split(";"):
                (k, v) = parts.split("=")
                count_data[int(k)] = int(v)
            linedata[1][sample] = count_data

            # Build dict of percentage data
            total_reads = sum(count_data.values())
            linedata[0][sample] = {
                readlen: (count / total_reads) * 100.0
                for readlen, count in count_data.items()
            }

        # Config for the plot
        pconfig = {
            "id": "ngsderive_readlen_plot",
            "title": "ngsderive: Read Length",
            "xlab": "Read Length",
            "ylab": "% Evidence for Read Length",
            "data_labels": [
                {
                    "name": "Percentages",
                    "ylab": "% Evidence for Read Length"
                },
                {
                    "name": "Counts",
                    "ylab": "Number of reads"
                },
            ],
        }

        self.add_section(
            name="Read length",
            anchor="ngsderive-readlen",
            description=
            """Predicted read length provided by ngsderive. For more information, please see
            [the documentation](https://stjudecloud.github.io/ngsderive/subcommands/readlen/).""",
            plot=linegraph.plot(linedata, pconfig),
        )