Example #1
0
    def qorts_genebodycoverage_plot(self):
        """ Make a beeswarm plot of the GeneBodyCoverage values """

        keys = [
            'GeneBodyCoverage_Overall_Mean', 'GeneBodyCoverage_Overall_Median',
            'GeneBodyCoverage_LowExpress_Mean',
            'GeneBodyCoverage_LowExpress_Median',
            'GeneBodyCoverage_UMQuartile_Mean',
            'GeneBodyCoverage_UMQuartile_Median'
        ]
        cats = OrderedDict()
        for k in keys:
            # Build a human readable title, e.g.
            # 'GeneBodyCoverage_LowExpress_Mean' -> 'Low Express Mean'
            name = k.replace('GeneBodyCoverage_', '')
            name = name.replace('_', ' ')
            # Raw strings required: '\g' is an invalid escape sequence in a
            # normal string (DeprecationWarning since Python 3.6, will become
            # a SyntaxError in a future release).
            name = re.sub(r"([a-z])([A-Z])", r"\g<1> \g<2>", name)
            cats[k] = {
                'title': name,
                'min': 0,  # coverage values are fractions in [0, 1]
                'max': 1,
            }

        # Config for the plot
        pconfig = {
            'id': 'qorts_gene_body_coverage',
            'title': 'QoRTs: Gene Body Coverage'
        }

        self.add_section(name='Gene Body Coverage',
                         plot=beeswarm.plot(self.qorts_data, cats, pconfig))
Example #2
0
    def bam_statplot(self):
        """Add a beeswarm section summarising RNA-SeQC read-count metrics."""
        pconfig = {"id": "rna_seqc_bam_stat_beeswarm", "title": "RNA-SeQC: Read metrics"}
        metric_names = (
            "Total Read Number",
            "Alternative Alignments",
            "Chimeric Reads",
            "Duplicate Reads",
            "End 1 Mapped Reads",
            "End 2 Mapped Reads",
            "End 1 Mismatches",
            "End 2 Mismatches",
            "End 1 Sense",
            "End 2 Sense",
            "Ambiguous Reads",
            "High Quality Reads",
            "Low Quality Reads",
            "Mapped Duplicate Reads",
            "Mapped Reads",
            "Mapped Unique Reads",
            "Non-Globin Reads",
            "Non-Globin Duplicate Reads",
            "rRNA Reads",
            "Unique Mapping, Vendor QC Passed Reads",
        )
        # Every metric shares the read_count key so the plot scales uniformly
        keys = OrderedDict(
            (name, {"title": name, "shared_key": "read_count", "suffix": config.read_count_prefix})
            for name in metric_names
        )

        self.add_section(
            name="Read Counts",
            anchor="rna_seqc_bam_stat",
            description="Number of reads ({}) falling into different categories.".format(config.read_count_desc),
            helptext="Note that many of these statistics are only available from RNA-SeQC v2.x",
            plot=beeswarm.plot(self.rna_seqc_metrics, keys, pconfig),
        )
Example #3
0
    def bam_statplot(self):
        """Add a beeswarm section with RNA-SeQC read-count metrics."""
        metric_names = [
            'Total Read Number', 'Alternative Alignments', 'Chimeric Reads',
            'Duplicate Reads', 'End 1 Mapped Reads', 'End 2 Mapped Reads',
            'End 1 Mismatches', 'End 2 Mismatches', 'End 1 Sense',
            'End 2 Sense', 'Ambiguous Reads', 'High Quality Reads',
            'Low Quality Reads', 'Mapped Duplicate Reads', 'Mapped Reads',
            'Mapped Unique Reads', 'Non-Globin Reads',
            'Non-Globin Duplicate Reads', 'rRNA Reads',
            'Unique Mapping, Vendor QC Passed Reads'
        ]
        # One beeswarm column per metric; shared_key keeps scales comparable
        keys = OrderedDict(
            (name, {
                'title': name,
                'shared_key': 'read_count',
                'suffix': config.read_count_prefix
            })
            for name in metric_names
        )
        pconfig = {
            'id': 'rna_seqc_bam_stat_beeswarm',
            'title': 'RNA-SeQC: Read metrics'
        }

        self.add_section(
            name='Read Counts',
            anchor='rna_seqc_bam_stat',
            description='Number of reads ({}) falling into different categories.'.format(config.read_count_desc),
            helptext='Note that many of these statistics are only available from RNA-SeQC v2.x',
            plot=beeswarm.plot(self.rna_seqc_metrics, keys, pconfig))
    def __init__(self, c_id, mod):
        """Initialise a custom-content module and add its plot/content section.

        Args:
            c_id: ID string for this custom content block (used for anchors).
            mod: dict with 'config' (section/plot options) and 'data' keys.
        """
        modname = mod['config'].get('section_name', c_id.replace('_', ' ').title())
        if modname is None or modname == '':
            modname = 'Custom Content'

        # Initialise the parent object
        super(MultiqcModule, self).__init__(
            name = modname,
            anchor = mod['config'].get('section_anchor', c_id),
            href = mod['config'].get('section_href'),
            info = mod['config'].get('description')
        )

        pconfig = mod['config'].get('pconfig', {})
        if pconfig.get('title') is None:
            pconfig['title'] = modname

        # Hoist the lookup repeated by every dispatch branch below
        plot_type = mod['config'].get('plot_type')

        # Table
        if plot_type == 'table':
            pconfig['sortRows'] = pconfig.get('sortRows', False)
            headers = mod['config'].get('headers')
            self.add_section( plot = table.plot(mod['data'], headers, pconfig) )
            self.write_data_file( mod['data'], "multiqc_{}".format(modname.lower().replace(' ', '_')) )

        # Bar plot
        elif plot_type == 'bargraph':
            self.add_section( plot = bargraph.plot(mod['data'], mod['config'].get('categories'), pconfig) )

        # Line plot
        elif plot_type == 'linegraph':
            self.add_section( plot = linegraph.plot(mod['data'], pconfig) )

        # Scatter plot
        elif plot_type == 'scatter':
            self.add_section( plot = scatter.plot(mod['data'], pconfig) )

        # Heatmap
        elif plot_type == 'heatmap':
            self.add_section( plot = heatmap.plot(mod['data'], mod['config'].get('xcats'), mod['config'].get('ycats'), pconfig) )

        # Beeswarm plot
        elif plot_type == 'beeswarm':
            self.add_section( plot = beeswarm.plot(mod['data'], pconfig) )

        # Raw HTML
        elif plot_type == 'html':
            self.add_section( content = mod['data'] )

        # Raw image file as html
        elif plot_type == 'image':
            self.add_section( content = mod['data'] )

        # Not supplied - identity comparison, not '== None' (PEP 8)
        elif plot_type is None:
            log.warning("Plot type not found for content ID '{}'".format(c_id))

        # Not recognised
        else:
            log.warning("Error - custom content plot type '{}' not recognised for content ID {}".format(mod['config'].get('plot_type'), c_id))
Example #5
0
    def __init__(self, c_id, mod):
        """Initialise a custom-content module and add its plot/content section.

        Args:
            c_id: ID string for this custom content block (used for anchors).
            mod: dict with 'config' (section/plot options) and 'data' keys.
        """
        modname = mod['config'].get('section_name', c_id.replace('_', ' ').title())
        if modname is None or modname == '':
            modname = 'Custom Content'

        # Initialise the parent object
        super(MultiqcModule, self).__init__(
            name = modname,
            anchor = mod['config'].get('section_anchor', c_id),
            href = mod['config'].get('section_href'),
            info = mod['config'].get('description')
        )

        pconfig = mod['config'].get('pconfig', {})
        if pconfig.get('title') is None:
            pconfig['title'] = modname

        # Hoist the lookup repeated by every dispatch branch below
        plot_type = mod['config'].get('plot_type')

        # Table
        if plot_type == 'table':
            pconfig['sortRows'] = pconfig.get('sortRows', False)
            headers = mod['config'].get('headers')
            self.add_section( plot = table.plot(mod['data'], headers, pconfig) )
            self.write_data_file( mod['data'], "multiqc_{}".format(modname.lower().replace(' ', '_')) )

        # Bar plot
        elif plot_type == 'bargraph':
            self.add_section( plot = bargraph.plot(mod['data'], mod['config'].get('categories'), pconfig) )

        # Line plot
        elif plot_type == 'linegraph':
            self.add_section( plot = linegraph.plot(mod['data'], pconfig) )

        # Scatter plot
        elif plot_type == 'scatter':
            self.add_section( plot = scatter.plot(mod['data'], pconfig) )

        # Heatmap
        elif plot_type == 'heatmap':
            self.add_section( plot = heatmap.plot(mod['data'], mod['config'].get('xcats'), mod['config'].get('ycats'), pconfig) )

        # Beeswarm plot
        elif plot_type == 'beeswarm':
            self.add_section( plot = beeswarm.plot(mod['data'], pconfig) )

        # Raw HTML
        elif plot_type == 'html':
            self.add_section( content = mod['data'] )

        # Raw image file as html
        elif plot_type == 'image':
            self.add_section( content = mod['data'] )

        # Not supplied - identity comparison, not '== None' (PEP 8)
        elif plot_type is None:
            log.warning("Plot type not found for content ID '{}'".format(c_id))

        # Not recognised
        else:
            log.warning("Error - custom content plot type '{}' not recognised for content ID {}".format(mod['config'].get('plot_type'), c_id))
Example #6
0
    def comp_qm_geometry_descriptor_chart(self,
                                          geometry_descriptor="bonds",
                                          min=0.0,
                                          max=2.0):
        """ Make the geometry_descriptor section table and plots.

        Args:
            geometry_descriptor: key into self.comp_qm_data ('bonds', ...).
            min / max: shared value range for the table colour scale and the
                beeswarm axis.  NOTE: these parameter names shadow builtins,
                but are kept for backwards compatibility with callers.
        """
        data = self.comp_qm_data[geometry_descriptor]
        sample_keys = list(data.keys())
        # hack to get metric keys from the first data set
        metric_keys = list(data[sample_keys[0]].keys())

        def _build_headers(lo_key, hi_key):
            # The table colour scale is driven by 'dmin'/'dmax' while the
            # beeswarm plot reads 'min'/'max', hence the parameterised keys.
            headers = OrderedDict()
            for key in metric_keys:
                headers[key] = {
                    'title': key,
                    'description': 'Estimated Mulliken Charge',
                    'suffix': '',
                    'scale': 'Spectral',  # 'RdBu',
                    lo_key: min,
                    hi_key: max,
                    'ceiling': max,
                    'floor': min,  # known issue with a negative range
                    'format': '{:,.2f}',
                    'shared_key': 'torsion_range'
                }
            return headers

        table_headers = _build_headers('dmin', 'dmax')
        log.debug(str(table_headers))

        # Config for the plot
        config = {
            'namespace': 'Geometry',
            'id': 'comp_qm_geometry',
            'title': 'Geometry'
        }
        self.add_section(name=geometry_descriptor,
                         anchor='comp_qm_geometry',
                         plot=table.plot(data, table_headers, config))
        # Bug fix: the beeswarm section previously reused the table anchor
        # 'comp_qm_geometry'; anchors must be unique within a report (compare
        # comp_qm_mulliken_chart, which uses distinct *_table / *_beeswarm
        # anchors for its two sections).
        self.add_section(name=geometry_descriptor,
                         anchor='comp_qm_geometry_beeswarm',
                         plot=beeswarm.plot(data, _build_headers('min', 'max'),
                                            config))
Example #7
0
    def comp_qm_mulliken_chart(self):
        """ Make the mulliken section table and plots """
        mulliken = self.comp_qm_data['mulliken']
        first_sample = list(mulliken.keys())[0]
        # hack to get metric keys from the first data set
        metric_keys = list(mulliken[first_sample].keys())

        def _make_header(key, lo_key, hi_key):
            # colour scale is set with dmin rather than min? see table.py
            # c_scale = mqc_colour.mqc_colour_scale(header['scale'], header['dmin'], header['dmax'])
            return {
                'title': key,
                'description': 'Mulliken Charge',
                'suffix': '',
                'scale': 'Spectral',  # 'RdBu',
                lo_key: -2.0,
                hi_key: 2.0,
                'ceiling': 2.0,
                'floor': -2.0,  # known issue with a negative range
                'format': '{:,.2f}',
                'shared_key': 'mulliken_range'
            }

        # Table headers: colour range via 'dmin'/'dmax'
        headers = OrderedDict(
            (k, _make_header(k, 'dmin', 'dmax')) for k in metric_keys)
        log.debug(str(headers))

        # Config for the plot
        config = {
            'namespace': 'comp_qm',
            'id': 'comp_qm_mulliken',
            'title': 'Mulliken Charges',
        }
        self.add_section(name='Mulliken Charges',
                         anchor='comp_qm_mulliken_table',
                         plot=table.plot(mulliken, headers, config))

        # Beeswarm headers: the plot reads 'min'/'max' instead of 'dmin'/'dmax'
        headers = OrderedDict(
            (k, _make_header(k, 'min', 'max')) for k in metric_keys)
        self.add_section(name='Mulliken Charges: Beeswarm plot',
                         anchor='comp_qm_mulliken_beeswarm',
                         plot=beeswarm.plot(mulliken, headers, config))
Example #8
0
    def __init__(self, c_id, mod):
        """Initialise a custom-content module and append its plot HTML.

        Args:
            c_id: ID string for this custom content block (used for anchors).
            mod: dict with 'config' (section/plot options) and 'data' keys.
        """
        modname = mod['config'].get('section_name',
                                    c_id.replace('_', ' ').title())

        # Initialise the parent object
        super(MultiqcModule,
              self).__init__(name=modname,
                             anchor=mod['config'].get('section_anchor', c_id),
                             href=mod['config'].get('section_href'),
                             info=mod['config'].get('description'))

        pconfig = mod['config'].get('pconfig', {})
        if pconfig.get('title') is None:
            pconfig['title'] = modname

        # Hoist the lookup repeated by every dispatch branch below
        plot_type = mod['config'].get('plot_type')

        # Table
        if plot_type == 'table':
            pconfig['sortRows'] = pconfig.get('sortRows', False)
            self.intro += table.plot(mod['data'], None, pconfig)

        # Bar plot
        elif plot_type == 'bargraph':
            self.intro += bargraph.plot(mod['data'],
                                        mod['config'].get('categories'),
                                        pconfig)

        # Line plot
        elif plot_type == 'linegraph':
            self.intro += linegraph.plot(mod['data'], pconfig)

        # Scatter plot
        elif plot_type == 'scatter':
            self.intro += scatter.plot(mod['data'], pconfig)

        # Heatmap
        elif plot_type == 'heatmap':
            self.intro += heatmap.plot(mod['data'], mod['config'].get('xcats'),
                                       mod['config'].get('ycats'), pconfig)

        # Beeswarm plot
        elif plot_type == 'beeswarm':
            self.intro += beeswarm.plot(mod['data'], pconfig)

        # Not supplied - identity comparison, not '== None' (PEP 8)
        elif plot_type is None:
            log.warning("Plot type not found for content ID '{}'".format(c_id))

        # Not recognised
        else:
            log.warning(
                "Error - custom content plot type '{}' not recognised for content ID {}"
                .format(mod['config'].get('plot_type'), c_id))
Example #9
0
    def bismark_methlyation_chart(self):
        """ Make the methylation plot """

        # Shared column config: percentage values in [0, 100]
        shared = {'max': 100, 'min': 0, 'suffix': '%', 'decimalPlaces': 1}
        keys = OrderedDict()
        for data_key, col_title in (('percent_cpg_meth', 'Methylated CpG'),
                                    ('percent_chg_meth', 'Methylated CHG'),
                                    ('percent_chh_meth', 'Methylated CHH')):
            column = dict(shared)
            column['title'] = col_title
            keys[data_key] = column

        return beeswarm.plot(self.bismark_data['methextract'], keys,
                             {'id': 'bismark-methylation-dp'})
Example #10
0
    def bismark_methlyation_chart(self):
        """Add the cytosine methylation beeswarm section."""

        # Shared column config: percentage values in [0, 100]
        shared = {"max": 100, "min": 0, "suffix": "%", "decimalPlaces": 1}
        keys = OrderedDict()
        for data_key, col_title in (("percent_cpg_meth", "Methylated CpG"),
                                    ("percent_chg_meth", "Methylated CHG"),
                                    ("percent_chh_meth", "Methylated CHH")):
            column = dict(shared)
            column["title"] = col_title
            keys[data_key] = column

        self.add_section(
            name="Cytosine Methylation",
            anchor="bismark-methylation",
            plot=beeswarm.plot(self.bismark_data["methextract"], keys, {"id": "bismark-methylation-dp"}),
        )
Example #11
0
    def mirtop_beeswarm_section(self, stat_string):
        """ Generate more detailed beeswarm plots, for a given stat type.

        Args:
            stat_string: substring selecting which per-sample stat keys
                (e.g. 'count') are included in the plot.
        """
        log.info("Plotting " + stat_string + " section.")
        # Filter each sample's stats down to the keys matching stat_string.
        # Single-pass dict comprehensions replace the previous two-step
        # key-list + dict(generator) construction.
        section_data = {
            sample_name: {k: v for k, v in viewitems(sample_data)
                          if stat_string in k}
            for sample_name, sample_data in viewitems(self.mirtop_data)
        }

        # Create comprehensive beeswarm plots of all stats
        self.add_section(name='Read ' + stat_string + 's',
                         anchor='mirtop-stats-' + stat_string,
                         description="Detailed summary stats",
                         plot=beeswarm.plot(section_data))
Example #12
0
    def bismark_methlyation_chart (self):
        """ Make the methylation plot """

        # Config for the plot: every column is a percentage in [0, 100]
        shared = {'max': 100, 'min': 0, 'suffix': '%', 'decimalPlaces': 1}
        column_titles = (
            ('percent_cpg_meth', 'Methylated CpG'),
            ('percent_chg_meth', 'Methylated CHG'),
            ('percent_chh_meth', 'Methylated CHH'),
        )
        keys = OrderedDict()
        for data_key, col_title in column_titles:
            column = dict(shared)
            column['title'] = col_title
            keys[data_key] = column

        self.add_section (
            name = 'Cytosine Methylation',
            anchor = 'bismark-methylation',
            plot = beeswarm.plot(self.bismark_data['methextract'], keys, {'id': 'bismark-methylation-dp'})
        )
Example #13
0
    def parse_samtools_stats(self):
        """ Find Samtools stats logs and parse their data """

        self.samtools_stats = dict()
        for f in self.find_log_files('samtools/stats'):
            # Only the 'SN' (summary numbers) lines are of interest here
            parsed_data = dict()
            for line in f['f'].splitlines():
                if not line.startswith("SN"):
                    continue
                sections = line.split("\t")
                # "SN\traw total sequences:\t1000" -> raw_total_sequences = 1000.0
                field = sections[1].strip()[:-1].replace(' ', '_')
                parsed_data[field] = float(sections[2].strip())

            if len(parsed_data) > 0:
                # Work out some percentages relative to the raw sequence total
                if 'raw_total_sequences' in parsed_data:
                    total = parsed_data['raw_total_sequences']
                    for k in list(parsed_data.keys()):
                        if k.startswith('reads_') and k != 'raw_total_sequences' and total > 0:
                            parsed_data['{}_percent'.format(k)] = (parsed_data[k] / total) * 100

                if f['s_name'] in self.samtools_stats:
                    log.debug("Duplicate sample name found! Overwriting: {}".format(f['s_name']))
                self.add_data_source(f, section='stats')
                self.samtools_stats[f['s_name']] = parsed_data

        # Filter to strip out ignored sample names
        self.samtools_stats = self.ignore_samples(self.samtools_stats)

        if len(self.samtools_stats) > 0:
            # Write parsed report data to a file
            self.write_data_file(self.samtools_stats, 'multiqc_samtools_stats')

            # General Stats Table columns
            stats_headers = OrderedDict()
            stats_headers['error_rate'] = {
                'title': 'Error rate',
                'description': 'Error rate using CIGAR',
                'min': 0,
                'max': 100,
                'suffix': '%',
                'scale': 'OrRd',
                'format': '{:,.2f}',
                'modify': lambda x: x * 100.0,
            }
            stats_headers['non-primary_alignments'] = {
                'title': '{} Non-Primary'.format(config.read_count_prefix),
                'description': 'Non-primary alignments ({})'.format(config.read_count_desc),
                'min': 0,
                'scale': 'PuBu',
                'modify': lambda x: x * config.read_count_multiplier,
                'shared_key': 'read_count',
            }
            stats_headers['reads_mapped'] = {
                'title': '{} Reads Mapped'.format(config.read_count_prefix),
                'description': 'Reads Mapped in the bam file ({})'.format(config.read_count_desc),
                'min': 0,
                'modify': lambda x: x * config.read_count_multiplier,
                'shared_key': 'read_count',
            }
            stats_headers['reads_mapped_percent'] = {
                'title': '% Mapped',
                'description': '% Mapped Reads',
                'max': 100,
                'min': 0,
                'suffix': '%',
                'scale': 'RdYlGn',
            }
            stats_headers['raw_total_sequences'] = {
                'title': '{} Total seqs'.format(config.read_count_prefix),
                'description': 'Total sequences in the bam file ({})'.format(config.read_count_desc),
                'min': 0,
                'modify': lambda x: x * config.read_count_multiplier,
                'shared_key': 'read_count',
            }
            self.general_stats_addcols(self.samtools_stats, stats_headers, 'Samtools Stats')

            # Make bargraph plot of mapped/unmapped reads
            self.alignment_section(self.samtools_stats)

            # Make dot plot of counts. Shared templates for read- and
            # base-count columns, both displayed in millions.
            reads = {
                'min': 0,
                'modify': lambda x: float(x) / 1000000.0,
                'suffix': 'M reads',
                'decimalPlaces': 2,
                'shared_key': 'read_count',
            }
            bases = {
                'min': 0,
                'modify': lambda x: float(x) / 1000000.0,
                'suffix': 'M bases',
                'decimalPlaces': 2,
                'shared_key': 'base_count',
            }
            keys = OrderedDict()
            keys['raw_total_sequences'] = dict(reads, title='Total sequences')
            keys['reads_mapped_and_paired'] = dict(
                reads, title='Mapped &amp; paired',
                description='Paired-end technology bit set + both mates mapped')
            keys['reads_properly_paired'] = dict(
                reads, title='Properly paired',
                description='Proper-pair bit set')
            keys['reads_duplicated'] = dict(
                reads, title='Duplicated',
                description='PCR or optical duplicate bit set')
            keys['reads_QC_failed'] = dict(reads, title='QC Failed')
            keys['reads_MQ0'] = dict(
                reads, title='Reads MQ0',
                description='Reads mapped and MQ=0')
            keys['bases_mapped_(cigar)'] = dict(
                bases, title='Mapped bases (cigar)',
                description='Mapped bases (cigar)')
            keys['bases_trimmed'] = dict(bases, title='Bases Trimmed')
            keys['bases_duplicated'] = dict(bases, title='Duplicated bases')
            keys['pairs_on_different_chromosomes'] = dict(
                reads, title='Diff chromosomes',
                description='Pairs on different chromosomes')
            keys['pairs_with_other_orientation'] = dict(
                reads, title='Other orientation',
                description='Pairs with other orientation')
            keys['inward_oriented_pairs'] = dict(
                reads, title='Inward pairs',
                description='Inward oriented pairs')
            keys['outward_oriented_pairs'] = dict(
                reads, title='Outward pairs',
                description='Outward oriented pairs')

            self.add_section(
                name='Alignment metrics',
                anchor='samtools-stats',
                description="This module parses the output from <code>samtools stats</code>. All numbers in millions.",
                plot=beeswarm.plot(self.samtools_stats, keys, {'id': 'samtools-stats-dp'}))

        # Return the number of logs that were found
        return len(self.samtools_stats)
Example #14
0
    def add_cc_section(self, c_id, mod):
        """Add one custom-content section to this module.

        Args:
            c_id: ID string for this custom content block (used as anchor).
            mod: dict with 'config' (section/plot options) and 'data' keys.
        """
        section_name = mod["config"].get("section_name",
                                         c_id.replace("_", " ").title())
        if section_name is None or section_name == "":
            section_name = "Custom Content"

        section_description = mod["config"].get("description", "")

        pconfig = mod["config"].get("pconfig", {})
        if pconfig.get("title") is None:
            pconfig["title"] = section_name

        plot = None
        content = None

        # Hoist the lookup repeated by every dispatch branch below
        plot_type = mod["config"].get("plot_type")

        # Table
        if plot_type == "table":
            pconfig["sortRows"] = pconfig.get("sortRows", False)
            headers = mod["config"].get("headers")
            plot = table.plot(mod["data"], headers, pconfig)
            self.write_data_file(
                mod["data"],
                "multiqc_{}".format(section_name.lower().replace(" ", "_")))

        # Bar plot
        elif plot_type == "bargraph":
            plot = bargraph.plot(mod["data"], mod["config"].get("categories"),
                                 pconfig)

        # Line plot
        elif plot_type == "linegraph":
            plot = linegraph.plot(mod["data"], pconfig)

        # Scatter plot
        elif plot_type == "scatter":
            plot = scatter.plot(mod["data"], pconfig)

        # Heatmap
        elif plot_type == "heatmap":
            plot = heatmap.plot(mod["data"], mod["config"].get("xcats"),
                                mod["config"].get("ycats"), pconfig)

        # Beeswarm plot
        elif plot_type == "beeswarm":
            plot = beeswarm.plot(mod["data"], pconfig)

        # Raw HTML
        elif plot_type == "html":
            content = mod["data"]

        # Raw image file as html
        elif plot_type == "image":
            content = mod["data"]

        # Not supplied - identity comparison, not '== None' (PEP 8)
        elif plot_type is None:
            log.warning("Plot type not found for content ID '{}'".format(c_id))

        # Not recognised
        else:
            log.warning(
                "Error - custom content plot type '{}' not recognised for content ID {}"
                .format(mod["config"].get("plot_type"), c_id))

        # Don't use exactly the same title / description text as the main module
        if section_name == self.name:
            section_name = None
        if section_description == self.info:
            section_description = ""

        self.add_section(name=section_name,
                         anchor=c_id,
                         description=section_description,
                         plot=plot,
                         content=content)
Example #15
0
    def add_cc_section(self, c_id, mod):
        """Add one custom-content section to this module.

        Args:
            c_id: ID string for this custom content block (used as anchor).
            mod: dict with 'config' (section/plot options) and 'data' keys.
        """
        section_name = mod['config'].get('section_name',
                                         c_id.replace('_', ' ').title())
        if section_name is None or section_name == '':
            section_name = 'Custom Content'

        section_description = mod['config'].get('description', '')

        pconfig = mod['config'].get('pconfig', {})
        if pconfig.get('title') is None:
            pconfig['title'] = section_name

        plot = None
        content = None

        # Hoist the lookup repeated by every dispatch branch below
        plot_type = mod['config'].get('plot_type')

        # Table
        if plot_type == 'table':
            pconfig['sortRows'] = pconfig.get('sortRows', False)
            headers = mod['config'].get('headers')
            plot = table.plot(mod['data'], headers, pconfig)
            self.write_data_file(
                mod['data'],
                "multiqc_{}".format(section_name.lower().replace(' ', '_')))

        # Bar plot
        elif plot_type == 'bargraph':
            plot = bargraph.plot(mod['data'], mod['config'].get('categories'),
                                 pconfig)

        # Line plot
        elif plot_type == 'linegraph':
            plot = linegraph.plot(mod['data'], pconfig)

        # Scatter plot
        elif plot_type == 'scatter':
            plot = scatter.plot(mod['data'], pconfig)

        # Heatmap
        elif plot_type == 'heatmap':
            plot = heatmap.plot(mod['data'], mod['config'].get('xcats'),
                                mod['config'].get('ycats'), pconfig)

        # Beeswarm plot
        elif plot_type == 'beeswarm':
            plot = beeswarm.plot(mod['data'], pconfig)

        # Raw HTML
        elif plot_type == 'html':
            content = mod['data']

        # Raw image file as html
        elif plot_type == 'image':
            content = mod['data']

        # Not supplied - identity comparison, not '== None' (PEP 8)
        elif plot_type is None:
            log.warning("Plot type not found for content ID '{}'".format(c_id))

        # Not recognised
        else:
            log.warning(
                "Error - custom content plot type '{}' not recognised for content ID {}"
                .format(mod['config'].get('plot_type'), c_id))

        # Don't use exactly the same title / description text as the main module
        if section_name == self.name:
            section_name = None
        if section_description == self.info:
            section_description = ''

        self.add_section(name=section_name,
                         anchor=c_id,
                         description=section_description,
                         plot=plot,
                         content=content)
Exemple #16
0
    def parse_samtools_stats(self):
        """Find Samtools stats logs and parse their data.

        Parses the ``SN`` (summary numbers) section of ``samtools stats``
        output into ``self.samtools_stats``, derives percentages relative to
        the raw total sequence count, adds columns to the General Statistics
        table, and renders a bar graph and a beeswarm plot.

        Returns the number of samples found.
        """

        self.samtools_stats = dict()
        for f in self.find_log_files("samtools/stats"):
            parsed_data = dict()
            for line in f["f"].splitlines():
                # Only the summary-number lines are of interest here
                if not line.startswith("SN"):
                    continue
                sections = line.split("\t")
                # Strip the trailing colon from the field name, eg. "raw total sequences:"
                field = sections[1].strip()[:-1]
                field = field.replace(" ", "_")
                value = float(sections[2].strip())
                parsed_data[field] = value

            if len(parsed_data) > 0:
                # Work out some percentages relative to the raw total
                if "raw_total_sequences" in parsed_data:
                    for k in list(parsed_data.keys()):
                        if (
                            k.startswith("reads_")
                            and k != "raw_total_sequences"
                            and parsed_data["raw_total_sequences"] > 0
                        ):
                            parsed_data["{}_percent".format(k)] = (
                                parsed_data[k] / parsed_data["raw_total_sequences"]
                            ) * 100

                if f["s_name"] in self.samtools_stats:
                    log.debug("Duplicate sample name found! Overwriting: {}".format(f["s_name"]))
                self.add_data_source(f, section="stats")
                self.samtools_stats[f["s_name"]] = parsed_data

        # Filter to strip out ignored sample names
        self.samtools_stats = self.ignore_samples(self.samtools_stats)

        if len(self.samtools_stats) > 0:

            # Write parsed report data to a file
            self.write_data_file(self.samtools_stats, "multiqc_samtools_stats")

            # General Stats Table
            stats_headers = OrderedDict()
            stats_headers["error_rate"] = {
                "title": "Error rate",
                "description": "Error rate: mismatches (NM) / bases mapped (CIGAR)",
                "min": 0,
                "max": 100,
                "suffix": "%",
                "scale": "OrRd",
                "format": "{:,.2f}",
                "modify": lambda x: x * 100.0,
            }
            stats_headers["non-primary_alignments"] = {
                "title": "{} Non-Primary".format(config.read_count_prefix),
                "description": "Non-primary alignments ({})".format(config.read_count_desc),
                "min": 0,
                "scale": "PuBu",
                "modify": lambda x: x * config.read_count_multiplier,
                "shared_key": "read_count",
            }
            stats_headers["reads_mapped"] = {
                "title": "{} Reads Mapped".format(config.read_count_prefix),
                "description": "Reads Mapped in the bam file ({})".format(config.read_count_desc),
                "min": 0,
                "modify": lambda x: x * config.read_count_multiplier,
                "shared_key": "read_count",
            }
            stats_headers["reads_mapped_percent"] = {
                "title": "% Mapped",
                "description": "% Mapped Reads",
                "max": 100,
                "min": 0,
                "suffix": "%",
                "scale": "RdYlGn",
            }
            stats_headers["reads_properly_paired_percent"] = {
                "title": "% Proper Pairs",
                "description": "% Properly Paired Reads",
                "max": 100,
                "min": 0,
                "suffix": "%",
                "scale": "RdYlGn",
                # Hide the column for single-end data, ie. when no sample has
                # any paired-and-mapped reads. Use .get() so samples whose logs
                # lacked this SN field don't raise a KeyError here.
                "hidden": max(
                    (x.get("reads_mapped_and_paired", 0) for x in self.samtools_stats.values()), default=0
                )
                == 0,
            }
            stats_headers["reads_MQ0_percent"] = {
                "title": "% MapQ 0 Reads",
                "description": "% of Reads that are Ambiguously Placed (MapQ=0)",
                "max": 100,
                "min": 0,
                "suffix": "%",
                "scale": "OrRd",
                "hidden": True,
            }
            stats_headers["raw_total_sequences"] = {
                "title": "{} Total seqs".format(config.read_count_prefix),
                "description": "Total sequences in the bam file ({})".format(config.read_count_desc),
                "min": 0,
                "modify": lambda x: x * config.read_count_multiplier,
                "shared_key": "read_count",
            }
            self.general_stats_addcols(self.samtools_stats, stats_headers)

            # Make bargraph plot of mapped/unmapped reads
            self.alignment_section(self.samtools_stats)

            # Make dot plot of counts
            keys = OrderedDict()
            # Shared column defaults: read counts and base counts in millions
            reads = {
                "min": 0,
                "modify": lambda x: float(x) / 1000000.0,
                "suffix": "M reads",
                "decimalPlaces": 2,
                "shared_key": "read_count",
            }
            bases = {
                "min": 0,
                "modify": lambda x: float(x) / 1000000.0,
                "suffix": "M bases",
                "decimalPlaces": 2,
                "shared_key": "base_count",
            }
            keys["raw_total_sequences"] = dict(reads, **{"title": "Total sequences"})
            keys["reads_mapped_and_paired"] = dict(
                reads,
                **{"title": "Mapped &amp; paired", "description": "Paired-end technology bit set + both mates mapped"},
            )
            keys["reads_properly_paired"] = dict(
                reads, **{"title": "Properly paired", "description": "Proper-pair bit set"}
            )
            keys["reads_duplicated"] = dict(
                reads, **{"title": "Duplicated", "description": "PCR or optical duplicate bit set"}
            )
            keys["reads_QC_failed"] = dict(reads, **{"title": "QC Failed"})
            keys["reads_MQ0"] = dict(reads, **{"title": "Reads MQ0", "description": "Reads mapped and MQ=0"})
            keys["bases_mapped_(cigar)"] = dict(
                bases, **{"title": "Mapped bases (CIGAR)", "description": "Mapped bases (CIGAR)"}
            )
            keys["bases_trimmed"] = dict(bases, **{"title": "Bases Trimmed"})
            keys["bases_duplicated"] = dict(bases, **{"title": "Duplicated bases"})
            keys["pairs_on_different_chromosomes"] = dict(
                reads, **{"title": "Diff chromosomes", "description": "Pairs on different chromosomes"}
            )
            keys["pairs_with_other_orientation"] = dict(
                reads, **{"title": "Other orientation", "description": "Pairs with other orientation"}
            )
            keys["inward_oriented_pairs"] = dict(
                reads, **{"title": "Inward pairs", "description": "Inward oriented pairs"}
            )
            keys["outward_oriented_pairs"] = dict(
                reads, **{"title": "Outward pairs", "description": "Outward oriented pairs"}
            )

            self.add_section(
                name="Alignment metrics",
                anchor="samtools-stats",
                description="This module parses the output from <code>samtools stats</code>. All numbers in millions.",
                plot=beeswarm.plot(self.samtools_stats, keys, {"id": "samtools-stats-dp"}),
            )

        # Return the number of logs that were found
        return len(self.samtools_stats)
Exemple #17
0
    def parse_samtools_flagstats(self):
        """Locate samtools flagstat logs, parse them, and build report output.

        Fills ``self.samtools_flagstat``, adds a mapped-reads column to the
        General Statistics table and appends a beeswarm-plot section.
        Returns the number of samples found.
        """

        self.samtools_flagstat = dict()
        for f in self.find_log_files(config.sp['samtools']['flagstat']):
            parsed_data = parse_single_report(f['f'])
            if len(parsed_data) == 0:
                continue
            if f['s_name'] in self.samtools_flagstat:
                log.debug("Duplicate sample name found! Overwriting: {}".format(f['s_name']))
            self.add_data_source(f, section='flagstat')
            self.samtools_flagstat[f['s_name']] = parsed_data

        if len(self.samtools_flagstat) > 0:

            # Dump the parsed data for downstream use
            self.write_data_file(self.samtools_flagstat, 'multiqc_samtools_flagstat')

            # Single column in the General Statistics table
            flagstats_headers = {
                'mapped_passed': {
                    'title': 'M Reads Mapped',
                    'description': 'Reads Mapped in the bam file',
                    'min': 0,
                    'modify': lambda x: x / 1000000,
                    'shared_key': 'read_count',
                },
            }
            self.general_stats_addcols(self.samtools_flagstat, flagstats_headers, 'Samtools Flagstat')

            # Column defaults shared by every key: read counts in millions
            reads = {
                'min': 0,
                'modify': lambda x: float(x) / 1000000.0,
                'suffix': 'M reads',
                'decimalPlaces': 2,
                'shared_key': 'read_count'
            }

            # Dot-plot columns, in display order. Secondary / supplementary
            # alignment columns are only shown when at least one sample has a
            # non-zero count for them.
            keys = OrderedDict()
            keys['flagstat_total'] = dict(reads, title='Total Reads')
            keys['total_passed'] = dict(reads, title='Total Passed QC')
            keys['mapped_passed'] = dict(reads, title='Mapped')

            if any(v.get('secondary_passed') for v in self.samtools_flagstat.values()):
                keys['secondary_passed'] = dict(reads, title='Secondary Alignments')

            if any(v.get('supplementary_passed') for v in self.samtools_flagstat.values()):
                keys['supplementary_passed'] = dict(reads, title='Supplementary Alignments')

            keys['duplicates_passed'] = dict(reads, title='Duplicates')
            keys['paired in sequencing_passed'] = dict(reads, title='Paired in Sequencing')
            keys['properly paired_passed'] = dict(reads, title='Properly Paired')
            keys['with itself and mate mapped_passed'] = dict(
                reads, title='Self and mate mapped',
                description='Reads with itself and mate mapped')
            keys['singletons_passed'] = dict(reads, title='Singletons')
            keys['with mate mapped to a different chr_passed'] = dict(
                reads, title='Mate mapped to diff chr',
                description='Mate mapped to different chromosome')
            keys['with mate mapped to a different chr (mapQ >= 5)_passed'] = dict(
                reads, title='Diff chr (mapQ >= 5)',
                description='Mate mapped to different chromosome (mapQ >= 5)')

            self.sections.append({
                'name': 'Samtools Flagstat',
                'anchor': 'samtools-flagstat',
                'content': '<p>This module parses the output from <code>samtools flagstat</code>. All numbers in millions.</p>' +
                            beeswarm.plot(self.samtools_flagstat, keys, {'id': 'samtools-flagstat-dp'})
            })

        # Return the number of logs that were found
        return len(self.samtools_flagstat)
def parse_reports(self):
    """Find RSeQC bam_stat reports and parse their data.

    Populates ``self.bam_stat_data``, adds a proper-pairs column to the
    General Statistics table (paired-end data only) and renders a beeswarm
    plot. Returns the number of samples found.
    """

    # Set up vars
    self.bam_stat_data = dict()
    # Regexes keyed by the field name they populate in the parsed data dict
    regexes = {
        'total_records':
        r"Total records:\s*(\d+)",
        'qc_failed':
        r"QC failed:\s*(\d+)",
        'optical_pcr_duplicate':
        r"Optical/PCR duplicate:\s*(\d+)",
        'non_primary_hits':
        r"Non primary hits\s*(\d+)",
        'unmapped_reads':
        r"Unmapped reads:\s*(\d+)",
        'mapq_lt_mapq_cut_non-unique':
        r"mapq < mapq_cut \(non-unique\):\s*(\d+)",
        'mapq_gte_mapq_cut_unique':
        r"mapq >= mapq_cut \(unique\):\s*(\d+)",
        'read_1':
        r"Read-1:\s*(\d+)",
        'read_2':
        r"Read-2:\s*(\d+)",
        'reads_map_to_sense':
        r"Reads map to '\+':\s*(\d+)",
        'reads_map_to_antisense':
        r"Reads map to '-':\s*(\d+)",
        'non-splice_reads':
        r"Non-splice reads:\s*(\d+)",
        'splice_reads':
        r"Splice reads:\s*(\d+)",
        'reads_mapped_in_proper_pairs':
        r"Reads mapped in proper pairs:\s*(\d+)",
        'proper-paired_reads_map_to_different_chrom':
        r"Proper-paired reads map to different chrom:\s*(\d+)",
    }

    # Initiate PE check - flipped to True if any sample has Read-2 records
    is_paired_end = False

    # Go through files and parse data using regexes
    for f in self.find_log_files('rseqc/bam_stat'):
        d = dict()
        for k, r in regexes.items():
            r_search = re.search(r, f['f'], re.MULTILINE)
            if r_search:
                d[k] = int(r_search.group(1))

        # Calculate some percentages (guard against a zero total)
        if 'total_records' in d and d['total_records'] > 0:
            t = float(d['total_records'])
            if 'mapq_gte_mapq_cut_unique' in d:
                d['unique_percent'] = (float(d['mapq_gte_mapq_cut_unique']) /
                                       t) * 100.0
            if 'reads_mapped_in_proper_pairs' in d:
                d['proper_pairs_percent'] = (
                    float(d['reads_mapped_in_proper_pairs']) / t) * 100.0

        if len(d) > 0:
            if f['s_name'] in self.bam_stat_data:
                log.debug(
                    "Duplicate sample name found! Overwriting: {}".format(
                        f['s_name']))
            self.add_data_source(f, section='bam_stat')
            # Check if SE or PE. Use .get() - the Read-2 regex may not have
            # matched at all, which previously raised a KeyError.
            if d.get('read_2', 0) != 0:
                is_paired_end = True
            self.bam_stat_data[f['s_name']] = d

    # Filter to strip out ignored sample names
    self.bam_stat_data = self.ignore_samples(self.bam_stat_data)

    if len(self.bam_stat_data) > 0:
        # Write to file
        self.write_data_file(self.bam_stat_data, 'multiqc_rseqc_bam_stat')

        # Add to general stats table
        self.general_stats_headers['proper_pairs_percent'] = {
            'title': '% Proper Pairs',
            'description': '% Reads mapped in proper pairs',
            'max': 100,
            'min': 0,
            'suffix': '%',
            'scale': 'RdYlGn'
        }
        for s_name in self.bam_stat_data:
            if s_name not in self.general_stats_data:
                self.general_stats_data[s_name] = dict()

            # Only write if PE, i.e. there is something to write.
            # NOTE: this update must run for every sample, not only for
            # samples newly added to general_stats_data above.
            if is_paired_end:
                self.general_stats_data[s_name].update(
                    self.bam_stat_data[s_name])

        # Make dot plot of counts
        pconfig = {'id': 'rseqc_bam_stat'}
        keys = OrderedDict()
        defaults = {
            'min': 0,
            'shared_key': 'read_count',
            'decimalPlaces': 2,
            'modify': lambda x: float(x) / 1000000.0,
        }
        keys['total_records'] = dict(defaults, **{'title': 'Total records'})
        keys['qc_failed'] = dict(defaults, **{'title': 'QC failed'})
        keys['optical_pcr_duplicate'] = dict(
            defaults, **{
                'title': 'Duplicates',
                'description': 'Optical/PCR duplicate'
            })
        keys['non_primary_hits'] = dict(defaults,
                                        **{'title': 'Non primary hit'})
        keys['unmapped_reads'] = dict(
            defaults, **{
                'title': 'Unmapped',
                'description': 'Unmapped reads'
            })
        # Key must match the parsed-data field name from the regexes dict
        # above ('mapq_lt_mapq_cut_non-unique'), else the column is empty
        keys['mapq_lt_mapq_cut_non-unique'] = dict(
            defaults, **{
                'title': 'Non-unique',
                'description': 'mapq < mapq_cut (non-unique)'
            })
        keys['mapq_gte_mapq_cut_unique'] = dict(
            defaults, **{
                'title': 'Unique',
                'description': 'mapq >= mapq_cut (unique)'
            })
        if is_paired_end:
            keys['read_1'] = dict(defaults, **{'title': 'Read-1'})
            keys['read_2'] = dict(defaults, **{'title': 'Read-2'})
        keys['reads_map_to_sense'] = dict(
            defaults, **{
                'title': '+ve strand',
                'description': "Reads map to '+'"
            })
        keys['reads_map_to_antisense'] = dict(
            defaults, **{
                'title': '-ve strand',
                'description': "Reads map to '-'"
            })
        keys['non-splice_reads'] = dict(defaults,
                                        **{'title': 'Non-splice reads'})
        keys['splice_reads'] = dict(defaults, **{'title': 'Splice reads'})
        if is_paired_end:
            keys['reads_mapped_in_proper_pairs'] = dict(
                defaults, **{
                    'title': 'Proper pairs',
                    'description': 'Reads mapped in proper pairs'
                })
            keys['proper-paired_reads_map_to_different_chrom'] = dict(
                defaults, **{
                    'title': 'Different chrom',
                    'description': 'Proper-paired reads map to different chrom'
                })

        self.add_section(name='Bam Stat',
                         anchor='rseqc-bam_stat',
                         description='All numbers reported in millions.',
                         plot=beeswarm.plot(self.bam_stat_data, keys, pconfig))

    # Return number of samples found
    return len(self.bam_stat_data)
Exemple #19
0
def parse_reports(self):
    """Find bamtools stats reports and parse their data.

    Populates ``self.bamtools_stats_data``, adds duplicate/mapped percentage
    columns to the General Statistics table and renders a beeswarm plot.
    Returns the number of samples found.
    """

    # Set up vars
    self.bamtools_stats_data = dict()
    # Regexes keyed by the field name they populate; *_pct variants capture
    # the percentage shown in parentheses after the raw count
    regexes = {
        'total_reads': r"Total reads:\s*(\d+)",
        'mapped_reads': r"Mapped reads:\s*(\d+)",
        'mapped_reads_pct': r"Mapped reads:\s*\d+\s+\(([\d\.]+)%\)",
        'forward_strand': r"Forward strand:\s*(\d+)",
        'forward_strand_pct': r"Forward strand:\s*\d+\s+\(([\d\.]+)%\)",
        'reverse_strand': r"Reverse strand:\s*(\d+)",
        'reverse_strand_pct': r"Reverse strand:\s*\d+\s+\(([\d\.]+)%\)",
        'failed_qc': r"Failed QC:\s*(\d+)",
        'failed_qc_pct': r"Failed QC:\s*\d+\s+\(([\d\.]+)%\)",
        'duplicates': r"Duplicates:\s*(\d+)",
        'duplicates_pct': r"Duplicates:\s*\d+\s+\(([\d\.]+)%\)",
        'paired_end': r"Paired-end reads:\s*(\d+)",
        'paired_end_pct': r"Paired-end reads:\s*\d+\s+\(([\d\.]+)%\)",
        'proper_pairs': r"'Proper-pairs'\s*(\d+)",
        'proper_pairs_pct': r"'Proper-pairs'\s*\d+\s+\(([\d\.]+)%\)",
        'both_mapped': r"Both pairs mapped:\s*(\d+)",
        'both_mapped_pct': r"Both pairs mapped:\s*\d+\s+\(([\d\.]+)%\)",
        'read_1': r"Read 1:\s*(\d+)",
        'read_2': r"Read 2:\s*(\d+)",
        'singletons': r"Singletons:\s*(\d+)",
        'singletons_pct': r"Singletons:\s*\d+\s+\(([\d\.]+)%\)",
    }

    # Go through files and parse data using regexes
    for f in self.find_log_files('bamtools/stats'):
        d = dict()
        for k, r in regexes.items():
            r_search = re.search(r, f['f'], re.MULTILINE)
            if r_search:
                d[k] = float(r_search.group(1))

        if len(d) > 0:
            if f['s_name'] in self.bamtools_stats_data:
                log.debug("Duplicate sample name found! Overwriting: {}".format(f['s_name']))
            self.add_data_source(f, section='stats')
            self.bamtools_stats_data[f['s_name']] = d

    # Filter to strip out ignored sample names
    self.bamtools_stats_data = self.ignore_samples(self.bamtools_stats_data)

    if len(self.bamtools_stats_data) > 0:

        # Write to file
        self.write_data_file(self.bamtools_stats_data, 'multiqc_bamtools_stats')

        # Add to general stats table
        self.general_stats_headers['duplicates_pct'] = {
            'title': '% Duplicates',
            'description': '% Duplicate Reads',
            'max': 100,
            'min': 0,
            'suffix': '%',
            'scale': 'OrRd'
        }
        self.general_stats_headers['mapped_reads_pct'] = {
            'title': '% Mapped',
            'description': '% Mapped Reads',
            'max': 100,
            'min': 0,
            'suffix': '%',
            'scale': 'RdYlGn'
        }
        for s_name in self.bamtools_stats_data:
            if s_name not in self.general_stats_data:
                self.general_stats_data[s_name] = dict()
            self.general_stats_data[s_name].update(self.bamtools_stats_data[s_name])

        # Make dot plot of counts
        keys = OrderedDict()
        # Percentage columns
        defaults = {
            'min': 0,
            'max': 100,
            'decimalPlaces': 2,
            'suffix': '%'
        }
        # Raw count columns, displayed in millions
        num_defaults = {
            'min': 0,
            'modify': lambda x: float(x) / 1000000.0,
            'decimalPlaces': 2
        }

        keys['total_reads'] = dict(num_defaults, **{'title': 'Total reads', 'description': 'Total reads (millions)'})
        keys['mapped_reads_pct'] = dict(defaults, **{'title': 'Mapped reads'})
        keys['forward_strand_pct'] = dict(defaults, **{'title': 'Forward strand'})
        keys['reverse_strand_pct'] = dict(defaults, **{'title': 'Reverse strand'})
        keys['failed_qc_pct'] = dict(defaults, **{'title': 'Failed QC'})
        keys['duplicates_pct'] = dict(defaults, **{'title': 'Duplicates'})
        keys['paired_end_pct'] = dict(defaults, **{'title': 'Paired-end', 'description': 'Paired-end reads'})
        keys['proper_pairs_pct'] = dict(defaults, **{'title': 'Proper-pairs'})
        keys['both_mapped_pct'] = dict(defaults, **{'title': 'Both mapped', 'description': 'Both pairs mapped'})
        keys['read_1'] = dict(num_defaults, **{'title': 'Read 1', 'description': 'Read 1 (millions)'})
        keys['read_2'] = dict(num_defaults, **{'title': 'Read 2', 'description': 'Read 2 (millions)'})
        keys['singletons_pct'] = dict(defaults, **{'title': 'Singletons'})

        self.add_section(
            name='Bamtools Stats',
            anchor='bamtools-stats',
            plot=beeswarm.plot(self.bamtools_stats_data, keys)
        )

    # Return number of samples found
    return len(self.bamtools_stats_data)
Exemple #20
0
def parse_reports(self):
    """Find bamtools stats reports and parse their data.

    Populates ``self.bamtools_stats_data``, adds duplicate/mapped percentage
    columns to the General Statistics table and renders a beeswarm plot.
    Returns the number of samples found.
    """

    # Set up vars
    self.bamtools_stats_data = dict()
    # Regexes keyed by the field name they populate; *_pct variants capture
    # the percentage shown in parentheses after the raw count
    regexes = {
        "total_reads": r"Total reads:\s*(\d+)",
        "mapped_reads": r"Mapped reads:\s*(\d+)",
        "mapped_reads_pct": r"Mapped reads:\s*\d+\s+\(([\d\.]+)%\)",
        "forward_strand": r"Forward strand:\s*(\d+)",
        "forward_strand_pct": r"Forward strand:\s*\d+\s+\(([\d\.]+)%\)",
        "reverse_strand": r"Reverse strand:\s*(\d+)",
        "reverse_strand_pct": r"Reverse strand:\s*\d+\s+\(([\d\.]+)%\)",
        "failed_qc": r"Failed QC:\s*(\d+)",
        "failed_qc_pct": r"Failed QC:\s*\d+\s+\(([\d\.]+)%\)",
        "duplicates": r"Duplicates:\s*(\d+)",
        "duplicates_pct": r"Duplicates:\s*\d+\s+\(([\d\.]+)%\)",
        "paired_end": r"Paired-end reads:\s*(\d+)",
        "paired_end_pct": r"Paired-end reads:\s*\d+\s+\(([\d\.]+)%\)",
        "proper_pairs": r"'Proper-pairs'\s*(\d+)",
        "proper_pairs_pct": r"'Proper-pairs'\s*\d+\s+\(([\d\.]+)%\)",
        "both_mapped": r"Both pairs mapped:\s*(\d+)",
        "both_mapped_pct": r"Both pairs mapped:\s*\d+\s+\(([\d\.]+)%\)",
        "read_1": r"Read 1:\s*(\d+)",
        "read_2": r"Read 2:\s*(\d+)",
        "singletons": r"Singletons:\s*(\d+)",
        "singletons_pct": r"Singletons:\s*\d+\s+\(([\d\.]+)%\)",
    }

    # Go through files and parse data using regexes
    for f in self.find_log_files("bamtools/stats"):
        d = dict()
        for k, r in regexes.items():
            r_search = re.search(r, f["f"], re.MULTILINE)
            if r_search:
                d[k] = float(r_search.group(1))

        if len(d) > 0:
            if f["s_name"] in self.bamtools_stats_data:
                log.debug("Duplicate sample name found! Overwriting: {}".format(f["s_name"]))
            self.add_data_source(f, section="stats")
            self.bamtools_stats_data[f["s_name"]] = d

    # Filter to strip out ignored sample names
    self.bamtools_stats_data = self.ignore_samples(self.bamtools_stats_data)

    if len(self.bamtools_stats_data) > 0:

        # Write to file
        self.write_data_file(self.bamtools_stats_data, "multiqc_bamtools_stats")

        # Add to general stats table
        self.general_stats_headers["duplicates_pct"] = {
            "title": "% Duplicates",
            "description": "% Duplicate Reads",
            "max": 100,
            "min": 0,
            "suffix": "%",
            "scale": "OrRd",
        }
        self.general_stats_headers["mapped_reads_pct"] = {
            "title": "% Mapped",
            "description": "% Mapped Reads",
            "max": 100,
            "min": 0,
            "suffix": "%",
            "scale": "RdYlGn",
        }
        for s_name in self.bamtools_stats_data:
            if s_name not in self.general_stats_data:
                self.general_stats_data[s_name] = dict()
            self.general_stats_data[s_name].update(self.bamtools_stats_data[s_name])

        # Make dot plot of counts
        keys = OrderedDict()
        # Percentage columns / raw count columns (displayed in millions)
        defaults = {"min": 0, "max": 100, "decimalPlaces": 2, "suffix": "%"}
        num_defaults = {"min": 0, "modify": lambda x: float(x) / 1000000.0, "decimalPlaces": 2}

        keys["total_reads"] = dict(num_defaults, **{"title": "Total reads", "description": "Total reads (millions)"})
        keys["mapped_reads_pct"] = dict(defaults, **{"title": "Mapped reads"})
        keys["forward_strand_pct"] = dict(defaults, **{"title": "Forward strand"})
        keys["reverse_strand_pct"] = dict(defaults, **{"title": "Reverse strand"})
        keys["failed_qc_pct"] = dict(defaults, **{"title": "Failed QC"})
        keys["duplicates_pct"] = dict(defaults, **{"title": "Duplicates"})
        keys["paired_end_pct"] = dict(defaults, **{"title": "Paired-end", "description": "Paired-end reads"})
        keys["proper_pairs_pct"] = dict(defaults, **{"title": "Proper-pairs"})
        keys["both_mapped_pct"] = dict(defaults, **{"title": "Both mapped", "description": "Both pairs mapped"})
        # Keys must match the parsed-data field names from the regexes dict
        # above ("read_1"/"read_2"); the previous "bt_read_1"/"bt_read_2"
        # keys never matched any data, leaving those plot columns empty
        keys["read_1"] = dict(num_defaults, **{"title": "Read 1", "description": "Read 1 (millions)"})
        keys["read_2"] = dict(num_defaults, **{"title": "Read 2", "description": "Read 2 (millions)"})
        keys["singletons_pct"] = dict(defaults, **{"title": "Singletons"})

        self.add_section(
            name="Bamtools Stats", anchor="bamtools-stats", plot=beeswarm.plot(self.bamtools_stats_data, keys)
        )

    # Return number of samples found
    return len(self.bamtools_stats_data)
Exemple #21
0
def main():
    """Entry point: parse CLI options, set up the log file, read the CIDR QC
    spreadsheet and build a MultiQC-style beeswarm plot of selected metrics."""

    #
    # Usage statement
    #
    usage_text = (
        'Reads an excel spreadsheet of CIDR derived QC data and creates a multiqc-report like swarm plot.\n'
        '\n'
        '    Usage:\n'
        '        csi_cidr_stats.py -i qc_report_file -o output_html_file \n'
        '\n'
        '    Example:\n'
        '        csi_cidr_stats.py -i Holland_Release_Set_10_QC_Report.xlsx -o csi_cidr_stats.html\n'
    )

    arg_parser = argparse.ArgumentParser(description=usage_text,
                                         formatter_class=RawTextHelpFormatter)
    arg_parser.add_argument(
        '-i', '--infile',
        required=True, nargs='?', type=argparse.FileType('r'), default=None,
        help='Input CIDR QC Report Excel file, e.g. "Holland_Release_Set_10_QC_Report.xlsx"')
    arg_parser.add_argument(
        '-o', '--outfile',
        required=False, nargs='?', type=argparse.FileType('w'), default=None,
        help='Output QC Report HTML file, e.g. "Batch10_stats.html"')
    arg_parser.add_argument(
        '-t', '--testmode',
        required=False, action='store_true', default=False,
        help='Run in test mode')

    args = arg_parser.parse_args()
    qc_file = args.infile
    html_out = args.outfile      # currently unused downstream
    test_mode = args.testmode    # currently unused downstream

    #####################################
    #
    # Set up the variables and the log file
    #
    #####################################

    # Set up the log file (append mode, so repeated runs accumulate)
    run_date = str(datetime.datetime.now()).split()[0].replace('-', '')
    global log
    log = open('csi_cidr_stats.log', 'a')
    log.write('\n' + str(datetime.datetime.now()) + '\n')
    log.write(' '.join(sys.argv) + '\n')
    log.write('csi_cidr_stats.py version ' + __version__ + '\n\n')
    log.flush()

    ####################################
    #
    # Import Excel file
    #
    ####################################
    # QC metric columns to pull from the spreadsheet
    wanted_columns = [
        'SM_TAG', 'VERIFYBAM_AVG_DP', 'TOTAL_READS',
        'PCT_PF_READS_ALIGNED_PAIR', 'PF_HQ_ERROR_RATE_PAIR',
        'PF_HQ_ALIGNED_Q20_BASES_PAIR', 'UNMAPPED_READS',
        'MEAN_TARGET_COVERAGE', 'ZERO_CVG_TARGETS_PCT', 'PCT_EXC_MAPQ',
        'PCT_EXC_BASEQ', 'PCT_TARGET_BASES_1X', 'PCT_TARGET_BASES_2X',
        'PCT_TARGET_BASES_10X', 'PCT_TARGET_BASES_20X', 'PCT_TARGET_BASES_30X',
        'PCT_TARGET_BASES_40X', 'PCT_TARGET_BASES_50X', 'PCT_TARGET_BASES_100X'
    ]

    qc_data = import_cidr_stats(qc_file, wanted_columns)
    swarm_plot = beeswarm.plot(qc_data)
    print(type(swarm_plot))

    ######################################
    #
    # Close out and clean up
    #
    ######################################
    send_update("\ncsi_cidr_stats.py successfully completed", log)
    send_update(str(datetime.datetime.now()) + '\n', log)
    log.close()
Exemple #22
0
    def parse_samtools_stats(self):
        """ Find Samtools stats logs and parse their data.

        Parses the 'SN' (Summary Numbers) lines of `samtools stats` output,
        derives percentage fields, then adds the data to the general stats
        table, an alignment bar graph and a beeswarm plot of counts.

        Returns:
            int: the number of samples with parsed stats data.
        """

        self.samtools_stats = dict()
        for f in self.find_log_files('samtools/stats'):
            parsed_data = dict()
            for line in f['f'].splitlines():
                # Only the Summary Numbers section is of interest here
                if not line.startswith("SN"):
                    continue
                sections = line.split("\t")
                field = sections[1].strip()[:-1]  # drop the trailing colon
                field = field.replace(' ', '_')
                value = float(sections[2].strip())
                parsed_data[field] = value

            if len(parsed_data) > 0:
                # Work out some percentages
                # (hoist the total out of the loop - it doesn't change)
                total_seqs = parsed_data.get('raw_total_sequences', 0)
                if total_seqs > 0:
                    for k in list(parsed_data.keys()):
                        if k.startswith('reads_'):
                            parsed_data['{}_percent'.format(k)] = (parsed_data[k] / total_seqs) * 100
                # Use .get() so a truncated log missing either field doesn't raise KeyError
                total_alignments = parsed_data.get('reads_mapped', 0) + parsed_data.get('non-primary_alignments', 0)
                if total_alignments > 0:
                    parsed_data['non-primary_alignments_percent'] = (parsed_data.get('non-primary_alignments', 0) / total_alignments) * 100

                if f['s_name'] in self.samtools_stats:
                    log.debug("Duplicate sample name found! Overwriting: {}"
                              .format(f['s_name']))
                self.add_data_source(f, section='stats')
                self.samtools_stats[f['s_name']] = parsed_data

        # Filter to strip out ignored sample names
        self.samtools_stats = self.ignore_samples(self.samtools_stats)

        # Read-count number format; plain integer when no unit scaling is applied
        self.read_format = '{:,.1f}&nbsp;' + config.read_count_prefix
        if config.read_count_multiplier == 1:
            self.read_format = '{:,.0f}'

        if len(self.samtools_stats) > 0:

            # Write parsed report data to a file
            self.write_data_file(self.samtools_stats, 'multiqc_samtools_stats')

            # General Stats Table
            stats_headers = OrderedDict()
            stats_headers['error_rate'] = {
                'title': 'Error rate',
                'description': 'Error rate: mismatches (NM) / bases mapped (CIGAR)',
                'min': 0,
                'max': 100,
                'suffix': '%',
                'scale': 'OrRd',
                'format': '{:,.2f}',
                'modify': lambda x: x * 100.0  # samtools reports a fraction
            }
            stats_headers['reads_mapped'] = {
                'title': 'Mapped',
                'description': 'Reads mapped in the bam file ({})'.format(config.read_count_desc),
                'min': 0,
                'modify': lambda x: x * config.read_count_multiplier,
                'shared_key': 'read_count',
                'format': self.read_format,
            }
            stats_headers['reads_mapped_percent'] = {
                'title': 'Mapped',
                'description': '% Mapped reads',
                'max': 100,
                'min': 0,
                'suffix': '%',
                'scale': 'RdYlGn'
            }
            stats_headers['reads_properly_paired_percent'] = {
                'title': 'Pair',
                'description': '% Properly paired reads',
                'max': 100,
                'min': 0,
                'suffix': '%',
                'scale': 'RdYlGn',
                # Hide the column when no sample has paired reads (e.g. single-end data).
                # .get() guards against logs that lack the field entirely.
                'hidden': max(x.get('reads_mapped_and_paired', 0) for x in self.samtools_stats.values()) == 0
            }
            stats_headers['non-primary_alignments'] = {
                # NOTE: the original `'2ry'.format(config.read_count_prefix)` was a
                # no-op - the string has no placeholder - so the literal is used.
                'title': '2ry',
                'description': 'Non-primary alignments ({})'.format(config.read_count_desc),
                'min': 0,
                'scale': 'OrRd',
                'modify': lambda x: x * config.read_count_multiplier,
                'shared_key': 'read_count',
                'format': self.read_format,
            }
            stats_headers['non-primary_alignments_percent'] = {
                'title': '2ry',
                'description': '% Non-primary alignments',
                'max': 100,
                'min': 0,
                'suffix': '%',
                'scale': 'OrRd',
            }
            stats_headers['reads_MQ0_percent'] = {
                'title': 'MQ0',
                'description': '% Reads that are ambiguously placed (MQ=0)',
                'max': 100,
                'min': 0,
                'suffix': '%',
                'scale': 'OrRd',
                'hidden': True
            }
            stats_headers['raw_total_sequences'] = {
                'title': 'Reads',
                'description': 'Total sequences in the bam file ({})'.format(config.read_count_desc),
                'min': 0,
                'modify': lambda x: x * config.read_count_multiplier,
                'shared_key': 'read_count',
                'format': self.read_format,
            }
            self.general_stats_addcols(self.samtools_stats, stats_headers, 'Samtools Stats')

            # Make bargraph plot of mapped/unmapped reads
            self.alignment_section(self.samtools_stats)

            # Make dot plot of counts
            keys = OrderedDict()
            reads = {
                'min': 0,
                'modify': lambda x: float(x) / 1000000.0,
                'suffix': 'M reads',
                'decimalPlaces': 2,
                'shared_key': 'read_count'
            }
            bases = {
                'min': 0,
                'modify': lambda x: float(x) / 1000000.0,
                'suffix': 'M bases',
                'decimalPlaces': 2,
                'shared_key': 'base_count'
            }
            keys['raw_total_sequences'] = dict(reads, **{'title': 'Total sequences'})
            keys['reads_mapped_and_paired'] = dict(reads, **{'title': 'Mapped &amp; paired', 'description': 'Paired-end technology bit set + both mates mapped' })
            keys['reads_properly_paired'] = dict(reads, **{'title': 'Properly paired', 'description': 'Proper-pair bit set'})
            keys['reads_duplicated'] = dict(reads, **{'title': 'Duplicated', 'description': 'PCR or optical duplicate bit set'})
            keys['reads_QC_failed'] = dict(reads, **{'title': 'QC Failed'})
            keys['reads_MQ0'] = dict(reads, **{'title': 'Reads MQ0', 'description': 'Reads mapped and MQ=0'})
            keys['bases_mapped_(cigar)'] = dict(bases, **{'title': 'Mapped bases (CIGAR)', 'description': 'Mapped bases (CIGAR)'})
            keys['bases_trimmed'] = dict(bases, **{'title': 'Bases Trimmed'})
            keys['bases_duplicated'] = dict(bases, **{'title': 'Duplicated bases'})
            keys['pairs_on_different_chromosomes'] = dict(reads, **{'title': 'Diff chromosomes', 'description': 'Pairs on different chromosomes'})
            keys['pairs_with_other_orientation'] = dict(reads, **{'title': 'Other orientation', 'description': 'Pairs with other orientation'})
            keys['inward_oriented_pairs'] = dict(reads, **{'title': 'Inward pairs', 'description': 'Inward oriented pairs'})
            keys['outward_oriented_pairs'] = dict(reads, **{'title': 'Outward pairs', 'description': 'Outward oriented pairs'})

            self.add_section (
                name = 'Alignment metrics',
                anchor = 'samtools-stats',
                description = "This module parses the output from <code>samtools stats</code>. All numbers in millions.",
                plot = beeswarm.plot(self.samtools_stats, keys, {'id': 'samtools-stats-dp'})
            )

        # Return the number of logs that were found
        return len(self.samtools_stats)
Exemple #23
0
 def beechart(self, s_name):
     """Build a beeswarm plot from ``self.data2``.

     Args:
         s_name: unused; kept for backward-compatible call signatures.

     Returns:
         The plot object returned by ``beeswarm.plot``.
     """
     # Removed a leftover debug print ('gg', self.data2) that polluted stdout.
     return beeswarm.plot(self.data2)
Exemple #24
0
    def chart_qc_cv(self):
        '''
        Charts _cv_table.txt - sequencing depth mean and coefficient of
        variation (CoV) across genome / CpG selections.
        Inputs:
            No inputs
        Returns:
            No returns, generates Sequencing Depth - Whole Genome chart
        '''

        # (category name in the parsed data, short key suffix used in the plot)
        cats = [
            ('all_base', 'a_b'),
            ('q40_base', 'q_b'),
            ('all_base_botgc', 'a_b_b'),
            ('q40_base_botgc', 'q_b_b'),
            ('all_base_topgc', 'a_b_t'),
            ('q40_base_topgc', 'q_b_t'),
            ('all_cpg', 'a_c'),
            ('q40_cpg', 'q_c'),
            ('all_cpg_botgc', 'a_c_b'),
            ('q40_cpg_botgc', 'q_c_b'),
            ('all_cpg_topgc', 'a_c_t'),
            ('q40_cpg_topgc', 'q_c_t')
        ]

        pd = OrderedDict()
        for s_name, dd in self.mdata['qc_cv'].items():
            data = OrderedDict()
            for cat, key in cats:
                if cat in dd:
                    # mu == -1 is the sentinel for "not computed"
                    if dd[cat]['mu'] != -1:
                        data['mu_'+key] = dd[cat]['mu']
                        data['cv_'+key] = dd[cat]['cv']
            if len(data) > 0:
                pd[s_name] = data

        # FIX: '{:,3f}' is an invalid format spec (precision needs a dot);
        # '{:,.3f}' gives thousands separators with 3 decimal places.
        shared_mean = {'min': 0, 'format': '{:,.3f}', 'minRange': 10}
        shared_cofv = {'min': 0, 'format': '{:,.3f}', 'minRange': 50}

        pheader = OrderedDict()
        pheader['mu_a_b'] = dict(shared_mean, **{'title': 'All Genome Mean', 'description': 'Mean Sequencing Depth for All Reads'})
        pheader['mu_q_b'] = dict(shared_mean, **{'title': 'Q40 Genome Mean', 'description': 'Mean Sequencing Depth for Q40 Reads'})
        pheader['mu_a_b_b'] = dict(shared_mean, **{'title': 'Low GC All Gen. Mean', 'description': 'Mean Sequencing Depth for All Reads in Low GC-Content Regions'})
        pheader['mu_q_b_b'] = dict(shared_mean, **{'title': 'Low GC Q40 Gen. Mean', 'description': 'Mean Sequencing Depth for Q40 Reads in Low GC-Content Regions'})
        pheader['mu_a_b_t'] = dict(shared_mean, **{'title': 'High GC All Gen. Mean', 'description': 'Mean Sequencing Depth for All Reads in High GC-Content Regions'})
        pheader['mu_q_b_t'] = dict(shared_mean, **{'title': 'High GC Q40 Gen. Mean', 'description': 'Mean Sequencing Depth for Q40 Reads in High GC-Content Regions'})
        pheader['cv_a_b'] = dict(shared_cofv, **{'title': 'All Genome CoV', 'description': 'Sequencing Depth CoV for All Reads'})
        pheader['cv_q_b'] = dict(shared_cofv, **{'title': 'Q40 Genome CoV', 'description': 'Sequencing Depth CoV for Q40 Reads'})
        pheader['cv_a_b_b'] = dict(shared_cofv, **{'title': 'Low GC All Gen. CoV', 'description': 'Sequencing Depth CoV for All Reads in Low GC-Content Regions'})
        pheader['cv_q_b_b'] = dict(shared_cofv, **{'title': 'Low GC Q40 Gen. CoV', 'description': 'Sequencing Depth CoV for Q40 Reads in Low GC-Content Regions'})
        pheader['cv_a_b_t'] = dict(shared_cofv, **{'title': 'High GC All Gen. CoV', 'description': 'Sequencing Depth CoV for All Reads in High GC-Content Regions'})
        pheader['cv_q_b_t'] = dict(shared_cofv, **{'title': 'High GC Q40 Gen. CoV', 'description': 'Sequencing Depth CoV for Q40 Reads in High GC-Content Regions'})

        pheader['mu_a_c'] = dict(shared_mean, **{'title': 'All CpGs Mean', 'description': 'Mean Sequencing Depth for All CpGs'})
        pheader['mu_q_c'] = dict(shared_mean, **{'title': 'Q40 CpGs Mean', 'description': 'Mean Sequencing Depth for Q40 CpGs'})
        pheader['mu_a_c_b'] = dict(shared_mean, **{'title': 'Low GC All CpGs Mean', 'description': 'Mean Sequencing Depth for All CpGs in Low GC-Content Regions'})
        pheader['mu_q_c_b'] = dict(shared_mean, **{'title': 'Low GC Q40 CpGs Mean', 'description': 'Mean Sequencing Depth for Q40 CpGs in Low GC-Content Regions'})
        pheader['mu_a_c_t'] = dict(shared_mean, **{'title': 'High GC All CpGs Mean', 'description': 'Mean Sequencing Depth for All CpGs in High GC-Content Regions'})
        pheader['mu_q_c_t'] = dict(shared_mean, **{'title': 'High GC Q40 CpGs Mean', 'description': 'Mean Sequencing Depth for Q40 CpGs in High GC-Content Regions'})
        pheader['cv_a_c'] = dict(shared_cofv, **{'title': 'All CpGs CoV', 'description': 'Sequencing Depth CoV for All CpGs'})
        pheader['cv_q_c'] = dict(shared_cofv, **{'title': 'Q40 CpGs CoV', 'description': 'Sequencing Depth CoV for Q40 CpGs'})
        pheader['cv_a_c_b'] = dict(shared_cofv, **{'title': 'Low GC All CpGs CoV', 'description': 'Sequencing Depth CoV for All CpGs in Low GC-Content Regions'})
        pheader['cv_q_c_b'] = dict(shared_cofv, **{'title': 'Low GC Q40 CpGs CoV', 'description': 'Sequencing Depth CoV for Q40 CpGs in Low GC-Content Regions'})
        pheader['cv_a_c_t'] = dict(shared_cofv, **{'title': 'High GC All CpGs CoV', 'description': 'Sequencing Depth CoV for All CpGs in High GC-Content Regions'})
        pheader['cv_q_c_t'] = dict(shared_cofv, **{'title': 'High GC Q40 CpGs CoV', 'description': 'Sequencing Depth CoV for Q40 CpGs in High GC-Content Regions'})

        pconfig = {
            'id': 'biscuit_seq_depth',
            'table_title': 'BISCUIT: Sequencing Depth',
            'sortRows': False
        }

        if len(pd) > 0:
            self.add_section(
                name = 'Sequencing Depth Statistics',
                anchor = 'biscuit-seq-depth',
                description = '''
                    Shows the sequence depth mean and uniformity measured by the Coefficient of Variation
                    (`CoV`, defined as `stddev/mean`).
                ''',
                helptext = '''
                    The plot shows coverage across different selections:

                    * _Genome_ (Gen.) - Statistics for all bases across the entire genome
                    * _CpGs_ - Statistics for CpGs
                    * _All_ - Statistics for any mapped bases/CpGs
                    * _Q40_ - Statistics only those bases/CpGs with mapping quality `MAPQ >= 40`
                    * _High GC_ - Bases / CpGs that overlap with the top 10% of 100bp windows for GC-content
                    * _Low GC_ - Bases / CpGs that overlap with the bottom 10% of 100bp windows for GC-content

                ''',
                plot = beeswarm.plot(pd, pheader, pconfig)
            )
def parse_reports(self):
    """ Find bamtools stats reports and parse their data.

    Extracts counts and percentages with regexes, adds two columns to the
    general stats table and draws a beeswarm plot of the metrics.

    Returns:
        int: the number of samples with parsed bamtools stats data.
    """

    # Set up vars
    self.bamtools_stats_data = dict()
    # Each regex captures either the raw count or the bracketed percentage
    regexes = {
        'total_reads': r"Total reads:\s*(\d+)",
        'mapped_reads': r"Mapped reads:\s*(\d+)",
        'mapped_reads_pct': r"Mapped reads:\s*\d+\s+\(([\d\.]+)%\)",
        'forward_strand': r"Forward strand:\s*(\d+)",
        'forward_strand_pct': r"Forward strand:\s*\d+\s+\(([\d\.]+)%\)",
        'reverse_strand': r"Reverse strand:\s*(\d+)",
        'reverse_strand_pct': r"Reverse strand:\s*\d+\s+\(([\d\.]+)%\)",
        'failed_qc': r"Failed QC:\s*(\d+)",
        'failed_qc_pct': r"Failed QC:\s*\d+\s+\(([\d\.]+)%\)",
        'duplicates': r"Duplicates:\s*(\d+)",
        'duplicates_pct': r"Duplicates:\s*\d+\s+\(([\d\.]+)%\)",
        'paired_end': r"Paired-end reads:\s*(\d+)",
        'paired_end_pct': r"Paired-end reads:\s*\d+\s+\(([\d\.]+)%\)",
        'proper_pairs': r"'Proper-pairs'\s*(\d+)",
        'proper_pairs_pct': r"'Proper-pairs'\s*\d+\s+\(([\d\.]+)%\)",
        'both_mapped': r"Both pairs mapped:\s*(\d+)",
        'both_mapped_pct': r"Both pairs mapped:\s*\d+\s+\(([\d\.]+)%\)",
        'read_1': r"Read 1:\s*(\d+)",
        'read_2': r"Read 2:\s*(\d+)",
        'singletons': r"Singletons:\s*(\d+)",
        'singletons_pct': r"Singletons:\s*\d+\s+\(([\d\.]+)%\)",
    }

    # Go through files and parse data using regexes
    for f in self.find_log_files('bamtools/stats'):
        d = dict()
        for k, r in regexes.items():
            r_search = re.search(r, f['f'], re.MULTILINE)
            if r_search:
                d[k] = float(r_search.group(1))

        if len(d) > 0:
            if f['s_name'] in self.bamtools_stats_data:
                log.debug(
                    "Duplicate sample name found! Overwriting: {}".format(
                        f['s_name']))
            self.add_data_source(f, section='stats')
            self.bamtools_stats_data[f['s_name']] = d

    # Filter to strip out ignored sample names
    self.bamtools_stats_data = self.ignore_samples(self.bamtools_stats_data)

    if len(self.bamtools_stats_data) > 0:

        # Write to file
        self.write_data_file(self.bamtools_stats_data,
                             'multiqc_bamtools_stats')

        # Add to general stats table
        self.general_stats_headers['duplicates_pct'] = {
            'title': '% Duplicates',
            'description': '% Duplicate Reads',
            'max': 100,
            'min': 0,
            'suffix': '%',
            'scale': 'OrRd'
        }
        self.general_stats_headers['mapped_reads_pct'] = {
            'title': '% Mapped',
            'description': '% Mapped Reads',
            'max': 100,
            'min': 0,
            'suffix': '%',
            'scale': 'RdYlGn'
        }
        for s_name in self.bamtools_stats_data:
            if s_name not in self.general_stats_data:
                self.general_stats_data[s_name] = dict()
            self.general_stats_data[s_name].update(
                self.bamtools_stats_data[s_name])

        # Make dot plot of counts
        keys = OrderedDict()
        defaults = {'min': 0, 'max': 100, 'decimalPlaces': 2, 'suffix': '%'}
        num_defaults = {
            'min': 0,
            'modify': lambda x: float(x) / 1000000.0,
            'decimalPlaces': 2
        }

        keys['total_reads'] = dict(num_defaults, **{'title': 'Total reads', 'description': 'Total reads (millions)'})
        keys['mapped_reads_pct'] = dict(defaults, **{'title': 'Mapped reads'})
        keys['forward_strand_pct'] = dict(defaults, **{'title': 'Forward strand'})
        keys['reverse_strand_pct'] = dict(defaults, **{'title': 'Reverse strand'})
        keys['failed_qc_pct'] = dict(defaults, **{'title': 'Failed QC'})
        keys['duplicates_pct'] = dict(defaults, **{'title': 'Duplicates'})
        keys['paired_end_pct'] = dict(defaults, **{'title': 'Paired-end', 'description': 'Paired-end reads'})
        keys['proper_pairs_pct'] = dict(defaults, **{'title': 'Proper-pairs'})
        keys['both_mapped_pct'] = dict(defaults, **{'title': 'Both mapped', 'description': 'Both pairs mapped'})
        # FIX: the parsed data stores these counts under 'read_1' / 'read_2'
        # (see the regexes dict above); the previous 'bt_read_1' / 'bt_read_2'
        # keys never matched any data, so the columns were always empty.
        keys['read_1'] = dict(num_defaults, **{'title': 'Read 1', 'description': 'Read 1 (millions)'})
        keys['read_2'] = dict(num_defaults, **{'title': 'Read 2', 'description': 'Read 2 (millions)'})
        keys['singletons_pct'] = dict(defaults, **{'title': 'Singletons'})

        self.add_section(name='Bamtools Stats',
                         anchor='bamtools-stats',
                         plot=beeswarm.plot(self.bamtools_stats_data, keys))

    # Return number of samples found
    return len(self.bamtools_stats_data)
Exemple #26
0
    def chart_qc_cv(self):
        """
        Charts _cv_table.txt - sequencing depth mean and coefficient of
        variation (CoV) across genome / CpG selections.
        Inputs:
            No inputs
        Returns:
            No returns, generates Sequencing Depth - Whole Genome chart
        """

        # (category name in the parsed data, short key suffix used in the plot)
        cats = [
            ("all_base", "a_b"),
            ("q40_base", "q_b"),
            ("all_base_botgc", "a_b_b"),
            ("q40_base_botgc", "q_b_b"),
            ("all_base_topgc", "a_b_t"),
            ("q40_base_topgc", "q_b_t"),
            ("all_cpg", "a_c"),
            ("q40_cpg", "q_c"),
            ("all_cpg_botgc", "a_c_b"),
            ("q40_cpg_botgc", "q_c_b"),
            ("all_cpg_topgc", "a_c_t"),
            ("q40_cpg_topgc", "q_c_t"),
        ]

        pd = OrderedDict()
        for s_name, dd in self.mdata["qc_cv"].items():
            data = OrderedDict()
            for cat, key in cats:
                if cat in dd:
                    # mu == -1 is the sentinel for "not computed"
                    if dd[cat]["mu"] != -1:
                        data["mu_" + key] = dd[cat]["mu"]
                        data["cv_" + key] = dd[cat]["cv"]
            if len(data) > 0:
                pd[s_name] = data

        # FIX: "{:,3f}" is an invalid format spec (precision needs a dot);
        # "{:,.3f}" gives thousands separators with 3 decimal places.
        shared_mean = {"min": 0, "format": "{:,.3f}", "minRange": 10}
        shared_cofv = {"min": 0, "format": "{:,.3f}", "minRange": 50}

        pheader = OrderedDict()
        pheader["mu_a_b"] = dict(shared_mean, **{"title": "All Genome Mean", "description": "Mean Sequencing Depth for All Reads"})
        pheader["mu_q_b"] = dict(shared_mean, **{"title": "Q40 Genome Mean", "description": "Mean Sequencing Depth for Q40 Reads"})
        pheader["mu_a_b_b"] = dict(shared_mean, **{"title": "Low GC All Gen. Mean", "description": "Mean Sequencing Depth for All Reads in Low GC-Content Regions"})
        pheader["mu_q_b_b"] = dict(shared_mean, **{"title": "Low GC Q40 Gen. Mean", "description": "Mean Sequencing Depth for Q40 Reads in Low GC-Content Regions"})
        pheader["mu_a_b_t"] = dict(shared_mean, **{"title": "High GC All Gen. Mean", "description": "Mean Sequencing Depth for All Reads in High GC-Content Regions"})
        pheader["mu_q_b_t"] = dict(shared_mean, **{"title": "High GC Q40 Gen. Mean", "description": "Mean Sequencing Depth for Q40 Reads in High GC-Content Regions"})
        pheader["cv_a_b"] = dict(shared_cofv, **{"title": "All Genome CoV", "description": "Sequencing Depth CoV for All Reads"})
        pheader["cv_q_b"] = dict(shared_cofv, **{"title": "Q40 Genome CoV", "description": "Sequencing Depth CoV for Q40 Reads"})
        pheader["cv_a_b_b"] = dict(shared_cofv, **{"title": "Low GC All Gen. CoV", "description": "Sequencing Depth CoV for All Reads in Low GC-Content Regions"})
        pheader["cv_q_b_b"] = dict(shared_cofv, **{"title": "Low GC Q40 Gen. CoV", "description": "Sequencing Depth CoV for Q40 Reads in Low GC-Content Regions"})
        pheader["cv_a_b_t"] = dict(shared_cofv, **{"title": "High GC All Gen. CoV", "description": "Sequencing Depth CoV for All Reads in High GC-Content Regions"})
        pheader["cv_q_b_t"] = dict(shared_cofv, **{"title": "High GC Q40 Gen. CoV", "description": "Sequencing Depth CoV for Q40 Reads in High GC-Content Regions"})

        pheader["mu_a_c"] = dict(shared_mean, **{"title": "All CpGs Mean", "description": "Mean Sequencing Depth for All CpGs"})
        pheader["mu_q_c"] = dict(shared_mean, **{"title": "Q40 CpGs Mean", "description": "Mean Sequencing Depth for Q40 CpGs"})
        pheader["mu_a_c_b"] = dict(shared_mean, **{"title": "Low GC All CpGs Mean", "description": "Mean Sequencing Depth for All CpGs in Low GC-Content Regions"})
        pheader["mu_q_c_b"] = dict(shared_mean, **{"title": "Low GC Q40 CpGs Mean", "description": "Mean Sequencing Depth for Q40 CpGs in Low GC-Content Regions"})
        pheader["mu_a_c_t"] = dict(shared_mean, **{"title": "High GC All CpGs Mean", "description": "Mean Sequencing Depth for All CpGs in High GC-Content Regions"})
        pheader["mu_q_c_t"] = dict(shared_mean, **{"title": "High GC Q40 CpGs Mean", "description": "Mean Sequencing Depth for Q40 CpGs in High GC-Content Regions"})
        pheader["cv_a_c"] = dict(shared_cofv, **{"title": "All CpGs CoV", "description": "Sequencing Depth CoV for All CpGs"})
        pheader["cv_q_c"] = dict(shared_cofv, **{"title": "Q40 CpGs CoV", "description": "Sequencing Depth CoV for Q40 CpGs"})
        pheader["cv_a_c_b"] = dict(shared_cofv, **{"title": "Low GC All CpGs CoV", "description": "Sequencing Depth CoV for All CpGs in Low GC-Content Regions"})
        pheader["cv_q_c_b"] = dict(shared_cofv, **{"title": "Low GC Q40 CpGs CoV", "description": "Sequencing Depth CoV for Q40 CpGs in Low GC-Content Regions"})
        pheader["cv_a_c_t"] = dict(shared_cofv, **{"title": "High GC All CpGs CoV", "description": "Sequencing Depth CoV for All CpGs in High GC-Content Regions"})
        pheader["cv_q_c_t"] = dict(shared_cofv, **{"title": "High GC Q40 CpGs CoV", "description": "Sequencing Depth CoV for Q40 CpGs in High GC-Content Regions"})

        pconfig = {"id": "biscuit_seq_depth", "table_title": "BISCUIT: Sequencing Depth", "sortRows": False}

        if len(pd) > 0:
            self.add_section(
                name="Sequencing Depth Statistics",
                anchor="biscuit-seq-depth",
                description="""
                    Shows the sequence depth mean and uniformity measured by the Coefficient of Variation
                    (`CoV`, defined as `stddev/mean`).
                """,
                helptext="""
                    The plot shows coverage across different selections:

                    * _Genome_ (Gen.) - Statistics for all bases across the entire genome
                    * _CpGs_ - Statistics for CpGs
                    * _All_ - Statistics for any mapped bases/CpGs
                    * _Q40_ - Statistics only those bases/CpGs with mapping quality `MAPQ >= 40`
                    * _High GC_ - Bases / CpGs that overlap with the top 10% of 100bp windows for GC-content
                    * _Low GC_ - Bases / CpGs that overlap with the bottom 10% of 100bp windows for GC-content

                """,
                plot=beeswarm.plot(pd, pheader, pconfig),
            )
Exemple #27
0
    def parse_samtools_flagstats(self):
        """ Find Samtools flagstat logs and parse their data """

        self.samtools_flagstat = dict()
        for f in self.find_log_files("samtools/flagstat"):
            parsed_data = parse_single_report(f["f"])
            if len(parsed_data) > 0:
                if f["s_name"] in self.samtools_flagstat:
                    log.debug(
                        "Duplicate sample name found! Overwriting: {}".format(
                            f["s_name"]))
                self.add_data_source(f, section="flagstat")
                self.samtools_flagstat[f["s_name"]] = parsed_data

        # Filter to strip out ignored sample names
        self.samtools_flagstat = self.ignore_samples(self.samtools_flagstat)

        if len(self.samtools_flagstat) > 0:

            # Write parsed report data to a file (restructure first)
            self.write_data_file(self.samtools_flagstat,
                                 "multiqc_samtools_flagstat")

            # General Stats Table
            flagstats_headers = dict()
            flagstats_headers["flagstat_total"] = {
                "title":
                "{} Reads".format(config.read_count_prefix),
                "description":
                "Total reads in the bam file ({})".format(
                    config.read_count_desc),
                "min":
                0,
                "modify":
                lambda x: x * config.read_count_multiplier,
                "shared_key":
                "read_count",
                "placement":
                100.0,
                "hidden":
                True,
            }
            flagstats_headers["mapped_passed"] = {
                "title":
                "{} Reads Mapped".format(config.read_count_prefix),
                "description":
                "Reads Mapped in the bam file ({})".format(
                    config.read_count_desc),
                "min":
                0,
                "modify":
                lambda x: x * config.read_count_multiplier,
                "shared_key":
                "read_count",
                "placement":
                101.0,
            }
            self.general_stats_addcols(self.samtools_flagstat,
                                       flagstats_headers)

            # Make dot plot of counts
            keys = OrderedDict()
            reads = {
                "min": 0,
                "modify": lambda x: float(x) * config.read_count_multiplier,
                "suffix": "{} reads".format(config.read_count_prefix),
                "decimalPlaces": 2,
                "shared_key": "read_count",
            }
            keys["flagstat_total"] = dict(reads, title="Total Reads")
            keys["total_passed"] = dict(reads, title="Total Passed QC")
            keys["mapped_passed"] = dict(reads, title="Mapped")

            if any(
                    v.get("secondary_passed")
                    for v in self.samtools_flagstat.values()):
                keys["secondary_passed"] = dict(reads,
                                                title="Secondary Alignments")

            if any(
                    v.get("supplementary_passed")
                    for v in self.samtools_flagstat.values()):
                keys["supplementary_passed"] = dict(
                    reads, title="Supplementary Alignments")

            keys["duplicates_passed"] = dict(reads, title="Duplicates")
            keys["paired in sequencing_passed"] = dict(
                reads, title="Paired in Sequencing")
            keys["properly paired_passed"] = dict(reads,
                                                  title="Properly Paired")
            keys["with itself and mate mapped_passed"] = dict(
                reads,
                title="Self and mate mapped",
                description="Reads with itself and mate mapped")
            keys["singletons_passed"] = dict(reads, title="Singletons")
            keys["with mate mapped to a different chr_passed"] = dict(
                reads,
                title="Mate mapped to diff chr",
                description="Mate mapped to different chromosome")
            keys["with mate mapped to a different chr (mapQ >= 5)_passed"] = dict(
                reads,
                title="Diff chr (mapQ >= 5)",
                description="Mate mapped to different chromosome (mapQ >= 5)")

            self.add_section(
                name="Samtools Flagstat",
                anchor="samtools-flagstat",
                description=
                "This module parses the output from <code>samtools flagstat</code>. All numbers in millions.",
                plot=beeswarm.plot(self.samtools_flagstat, keys,
                                   {"id": "samtools-flagstat-dp"}),
            )

        # Return the number of logs that were found
        return len(self.samtools_flagstat)
Exemple #28
0
    def parse_samtools_flagstats(self):
        """Locate samtools flagstat logs, parse them, and build the report section."""
        self.samtools_flagstat = dict()
        for f in self.find_log_files('samtools/flagstat'):
            data = parse_single_report(f['f'])
            if not data:
                continue
            if f['s_name'] in self.samtools_flagstat:
                log.debug("Duplicate sample name found! Overwriting: {}".format(f['s_name']))
            self.add_data_source(f, section='flagstat')
            self.samtools_flagstat[f['s_name']] = data

        # Drop any samples the user asked to ignore
        self.samtools_flagstat = self.ignore_samples(self.samtools_flagstat)

        if len(self.samtools_flagstat) > 0:
            # Persist the parsed values alongside the report
            self.write_data_file(self.samtools_flagstat, 'multiqc_samtools_flagstat')

            # Single General Statistics column: passed-QC mapped read count
            headers = {
                'mapped_passed': {
                    'title': '{} Reads Mapped'.format(config.read_count_prefix),
                    'description': 'Reads Mapped in the bam file ({})'.format(config.read_count_desc),
                    'min': 0,
                    'modify': lambda x: x * config.read_count_multiplier,
                    'shared_key': 'read_count',
                    'placement': 100.0,
                },
            }
            self.general_stats_addcols(self.samtools_flagstat, headers, 'Samtools Flagstat')

            # Shared column config for the beeswarm plot
            defaults = {
                'min': 0,
                'modify': lambda x: float(x) * config.read_count_multiplier,
                'suffix': '{} reads'.format(config.read_count_prefix),
                'decimalPlaces': 2,
                'shared_key': 'read_count',
            }

            keys = OrderedDict()
            keys['flagstat_total'] = dict(defaults, title='Total Reads')
            keys['total_passed'] = dict(defaults, title='Total Passed QC')
            keys['mapped_passed'] = dict(defaults, title='Mapped')

            # Secondary / supplementary columns only appear if any sample has them
            if any(v.get('secondary_passed') for v in self.samtools_flagstat.values()):
                keys['secondary_passed'] = dict(defaults, title='Secondary Alignments')
            if any(v.get('supplementary_passed') for v in self.samtools_flagstat.values()):
                keys['supplementary_passed'] = dict(defaults, title='Supplementary Alignments')

            keys['duplicates_passed'] = dict(defaults, title='Duplicates')
            keys['paired in sequencing_passed'] = dict(defaults, title='Paired in Sequencing')
            keys['properly paired_passed'] = dict(defaults, title='Properly Paired')
            keys['with itself and mate mapped_passed'] = dict(
                defaults,
                title='Self and mate mapped',
                description='Reads with itself and mate mapped')
            keys['singletons_passed'] = dict(defaults, title='Singletons')
            keys['with mate mapped to a different chr_passed'] = dict(
                defaults,
                title='Mate mapped to diff chr',
                description='Mate mapped to different chromosome')
            keys['with mate mapped to a different chr (mapQ >= 5)_passed'] = dict(
                defaults,
                title='Diff chr (mapQ >= 5)',
                description='Mate mapped to different chromosome (mapQ >= 5)')

            self.add_section(
                name='Samtools Flagstat',
                anchor='samtools-flagstat',
                description='This module parses the output from <code>samtools flagstat</code>. All numbers in millions.',
                plot=beeswarm.plot(self.samtools_flagstat, keys, {'id': 'samtools-flagstat-dp'}),
            )

        # Return the number of logs that were found
        return len(self.samtools_flagstat)
Exemple #29
0
def parse_reports(self):
    """Find RSeQC bam_stat reports and parse their data.

    Populates ``self.bam_stat_data`` (one dict of counts per sample), writes
    it to a data file, adds a proper-pairs column to the general stats table
    (values are only filled in for paired-end data), and draws a beeswarm
    plot of the counts. Returns the number of samples found.
    """

    # Set up vars
    self.bam_stat_data = dict()
    # One regex per bam_stat output line; group(1) captures the integer count
    regexes = {
        "total_records": r"Total records:\s*(\d+)",
        "qc_failed": r"QC failed:\s*(\d+)",
        "optical_pcr_duplicate": r"Optical/PCR duplicate:\s*(\d+)",
        "non_primary_hits": r"Non primary hits\s*(\d+)",
        "unmapped_reads": r"Unmapped reads:\s*(\d+)",
        "mapq_lt_mapq_cut_non-unique": r"mapq < mapq_cut \(non-unique\):\s*(\d+)",
        "mapq_gte_mapq_cut_unique": r"mapq >= mapq_cut \(unique\):\s*(\d+)",
        "read_1": r"Read-1:\s*(\d+)",
        "read_2": r"Read-2:\s*(\d+)",
        "reads_map_to_sense": r"Reads map to '\+':\s*(\d+)",
        "reads_map_to_antisense": r"Reads map to '-':\s*(\d+)",
        "non-splice_reads": r"Non-splice reads:\s*(\d+)",
        "splice_reads": r"Splice reads:\s*(\d+)",
        "reads_mapped_in_proper_pairs": r"Reads mapped in proper pairs:\s*(\d+)",
        "proper-paired_reads_map_to_different_chrom": r"Proper-paired reads map to different chrom:\s*(\d+)",
    }

    # Paired-end flag: set True if any sample has Read-2 records
    is_paired_end = False

    # Go through files and parse data using regexes
    for f in self.find_log_files("rseqc/bam_stat"):
        d = dict()
        for k, r in regexes.items():
            r_search = re.search(r, f["f"], re.MULTILINE)
            if r_search:
                d[k] = int(r_search.group(1))

        # Calculate some percentages
        if "total_records" in d:
            t = float(d["total_records"])
            if "mapq_gte_mapq_cut_unique" in d:
                d["unique_percent"] = (float(d["mapq_gte_mapq_cut_unique"]) / t) * 100.0
            if "reads_mapped_in_proper_pairs" in d:
                d["proper_pairs_percent"] = (float(d["reads_mapped_in_proper_pairs"]) / t) * 100.0

        if len(d) > 0:
            if f["s_name"] in self.bam_stat_data:
                log.debug("Duplicate sample name found! Overwriting: {}".format(f["s_name"]))
            self.add_data_source(f, section="bam_stat")
            # Check if SE or PE. Use .get(): the "Read-2" line can be
            # missing entirely from single-end reports, and a direct
            # d["read_2"] lookup would raise a KeyError.
            if d.get("read_2", 0) != 0:
                is_paired_end = True
            self.bam_stat_data[f["s_name"]] = d

    # Filter to strip out ignored sample names
    self.bam_stat_data = self.ignore_samples(self.bam_stat_data)

    if len(self.bam_stat_data) > 0:
        # Write to file
        self.write_data_file(self.bam_stat_data, "multiqc_rseqc_bam_stat")

        # Add to general stats table
        self.general_stats_headers["proper_pairs_percent"] = {
            "title": "% Proper Pairs",
            "description": "% Reads mapped in proper pairs",
            "max": 100,
            "min": 0,
            "suffix": "%",
            "scale": "RdYlGn",
        }
        for s_name in self.bam_stat_data:
            if s_name not in self.general_stats_data:
                self.general_stats_data[s_name] = dict()
            # Only write values if PE, i.e. there is something to write.
            # NOTE: this update used to be nested inside the `not in` check
            # above, so samples already present in the table from another
            # section never received these values.
            if is_paired_end:
                self.general_stats_data[s_name].update(self.bam_stat_data[s_name])

        # Make dot plot of counts
        pconfig = {"id": "rseqc_bam_stat"}
        keys = OrderedDict()
        defaults = {
            "min": 0,
            "shared_key": "read_count",
            "decimalPlaces": 2,
            "modify": lambda x: float(x) / 1000000.0,  # counts shown in millions
        }
        keys["total_records"] = dict(defaults, **{"title": "Total records"})
        keys["qc_failed"] = dict(defaults, **{"title": "QC failed"})
        keys["optical_pcr_duplicate"] = dict(
            defaults, **{"title": "Duplicates", "description": "Optical/PCR duplicate"}
        )
        keys["non_primary_hits"] = dict(defaults, **{"title": "Non primary hit"})
        keys["unmapped_reads"] = dict(defaults, **{"title": "Unmapped", "description": "Unmapped reads"})
        # Key must match the regex key above ("mapq_lt_mapq_cut_non-unique");
        # the truncated "mapq_lt_mapq_cut_non" never matched any parsed data,
        # so the Non-unique column was silently empty.
        keys["mapq_lt_mapq_cut_non-unique"] = dict(
            defaults, **{"title": "Non-unique", "description": "mapq < mapq_cut (non-unique)"}
        )
        keys["mapq_gte_mapq_cut_unique"] = dict(
            defaults, **{"title": "Unique", "description": "mapq >= mapq_cut (unique)"}
        )
        if is_paired_end:
            keys["read_1"] = dict(defaults, **{"title": "Read-1"})
            keys["read_2"] = dict(defaults, **{"title": "Read-2"})
        keys["reads_map_to_sense"] = dict(defaults, **{"title": "+ve strand", "description": "Reads map to '+'"})
        keys["reads_map_to_antisense"] = dict(defaults, **{"title": "-ve strand", "description": "Reads map to '-'"})
        keys["non-splice_reads"] = dict(defaults, **{"title": "Non-splice reads"})
        keys["splice_reads"] = dict(defaults, **{"title": "Splice reads"})
        if is_paired_end:
            keys["reads_mapped_in_proper_pairs"] = dict(
                defaults, **{"title": "Proper pairs", "description": "Reads mapped in proper pairs"}
            )
            keys["proper-paired_reads_map_to_different_chrom"] = dict(
                defaults, **{"title": "Different chrom", "description": "Proper-paired reads map to different chrom"}
            )

        self.add_section(
            name="Bam Stat",
            anchor="rseqc-bam_stat",
            description="All numbers reported in millions.",
            plot=beeswarm.plot(self.bam_stat_data, keys, pconfig),
        )

    # Return number of samples found
    return len(self.bam_stat_data)
Exemple #30
0
    def parse_samtools_flagstats(self):
        """Find Samtools flagstat logs and parse their data.

        NOTE(review): this variant globs TSV files from a hard-coded path
        instead of using ``self.find_log_files``, so each ``f`` is a plain
        path string rather than a MultiQC file dict.
        """
        import os  # local import: only needed here to derive sample names from paths

        self.samtools_flagstat = dict()
        # for f in self.find_log_files('mapping/flagstats'):
        for f in glob('mapping/flagstats/*.tsv'):
            # Sample name = file name without the .tsv extension.
            # (f is a path string here, so the previous f['s_name'] lookups
            # raised a TypeError as soon as a file was found.)
            s_name = os.path.splitext(os.path.basename(f))[0]
            log.debug("Parsing flagstat file: {}".format(f))
            # Close the handle when done instead of leaking it
            with open(f) as fh:
                parsed_data = parse_single_report(fh)
            if len(parsed_data) > 0:
                if s_name in self.samtools_flagstat:
                    log.debug("Duplicate sample name found! Overwriting: {}".format(s_name))
                # Pass keyword args: there is no MultiQC file dict to hand over
                self.add_data_source(s_name=s_name, source=f, section='flagstat')
                self.samtools_flagstat[s_name] = parsed_data

        # Filter to strip out ignored sample names
        self.samtools_flagstat = self.ignore_samples(self.samtools_flagstat)

        if len(self.samtools_flagstat) > 0:

            # Write parsed report data to a file (restructure first)
            self.write_data_file(self.samtools_flagstat, 'multiqc_samtools_flagstat')

            # General Stats Table: one column for passed-QC mapped reads
            flagstats_headers = dict()
            flagstats_headers['mapped_passed'] = {
                'title': '{} Reads Mapped'.format(config.read_count_prefix),
                'description': 'Reads Mapped in the bam file ({})'.format(config.read_count_desc),
                'min': 0,
                'modify': lambda x: x * config.read_count_multiplier,
                'shared_key': 'read_count',
                'placement': 100.0
            }
            self.general_stats_addcols(self.samtools_flagstat, flagstats_headers, 'Samtools Flagstat')

            # Make dot plot of counts; `reads` is the shared column config
            keys = OrderedDict()
            reads = {
                'min': 0,
                'modify': lambda x: float(x) * config.read_count_multiplier,
                'suffix': '{} reads'.format(config.read_count_prefix),
                'decimalPlaces': 2,
                'shared_key': 'read_count'
            }
            keys['flagstat_total'] = dict(reads, title='Total Reads')
            keys['total_passed'] = dict(reads, title='Total Passed QC')
            keys['mapped_passed'] = dict(reads, title='Mapped')

            # Only show secondary / supplementary columns if any sample has them
            if any(v.get('secondary_passed') for v in self.samtools_flagstat.values()):
                keys['secondary_passed'] = dict(reads, title='Secondary Alignments')

            if any(v.get('supplementary_passed') for v in self.samtools_flagstat.values()):
                keys['supplementary_passed'] = dict(reads, title='Supplementary Alignments')

            keys['duplicates_passed'] = dict(reads, title='Duplicates')
            keys['paired in sequencing_passed'] = dict(reads, title='Paired in Sequencing')
            keys['properly paired_passed'] = dict(reads, title='Properly Paired')
            keys['with itself and mate mapped_passed'] = dict(
                reads, title='Self and mate mapped',
                description='Reads with itself and mate mapped')
            keys['singletons_passed'] = dict(reads, title='Singletons')
            keys['with mate mapped to a different chr_passed'] = dict(
                reads, title='Mate mapped to diff chr',
                description='Mate mapped to different chromosome')
            keys['with mate mapped to a different chr (mapQ >= 5)_passed'] = dict(
                reads, title='Diff chr (mapQ >= 5)',
                description='Mate mapped to different chromosome (mapQ >= 5)')

            self.add_section(
                name='Samtools Flagstat',
                anchor='samtools-flagstat',
                description='This module parses the output from <code>samtools flagstat</code>. All numbers in millions.',
                plot=beeswarm.plot(self.samtools_flagstat, keys, {'id': 'samtools-flagstat-dp'})
            )

        # Return the number of logs that were found
        return len(self.samtools_flagstat)