Example #1
    def chart_qc_cpg_dist(self): 

        # cpg distribution
        if len(self.mdata['qc_cpg_dist']) == 0:
            return 

        hdr = OrderedDict()
        pd = OrderedDict()
        sid = 'Genome'
        dd = list(self.mdata['qc_cpg_dist'].values())[0]
        pd[sid] = OrderedDict()
        for ctg in ['ExonicCpGs', 'RepeatCpGs', 'GenicCpGs', 'CGICpGs']:
            ctg1 = ctg.replace('CpGs','')
            hdr[ctg1] = {'max':100,'min':0,'suffix':'%'}
            pd[sid][ctg1] = float(dd[ctg]['uc']) / dd['TotalCpGs']['uc'] * 100

        hdr['Exonic']['description'] = 'Exonic CpGs'
        hdr['Repeat']['description'] = 'Repeat-Masked CpGs'
        hdr['Genic']['description']  = 'Genic CpGs'
        hdr['CGI']['description']    = 'CpG Island CpGs'
        for sid, dd in self.mdata['qc_cpg_dist'].items():
            pd[sid] = OrderedDict()
            for ctg in ['ExonicCpGs', 'RepeatCpGs', 'GenicCpGs', 'CGICpGs']:
                ctg1 = ctg.replace('CpGs','')
                pd[sid][ctg1] = float(dd[ctg]['uc']) / dd['TotalCpGs']['uc'] * 100

        self.add_section(
            name = 'CpG Coverage vs Genomic Features',
            anchor = 'biscuit-coverage-cpg-dist',
            description = "The top row shows how CpGs breaks down to different categories. Each other row shows the how CpGs uniquely covered by the given data breaks down to these categories. It is the fraction of CpGs in the given category out of all CpGs covered by the data.",
            plot = table.plot(pd, hdr)
        )

        pd = dict([(sid, dd['cgi_coverage']) for sid, dd in self.mdata['qc_cpg_dist'].items()])
        self.add_section(
            name = 'CpG Island Coverage',
            anchor = 'biscuit-coverage-cgi',
            description = "Each row shows the percentage of CpG islands (out of all CpG islands in the genome) that are covered in different numbers of CpGs. Coverage is based on reads with mapQ >= 40.",
            plot = table.plot(pd, OrderedDict([
                ('one', {'title':'>=1', 
                 'suffix':'%', 'description':'CpG islands with at least one CpG covered'}),
                ('three', {'title':'>=3',
                 'suffix':'%', 'description':'CpG islands with at least three CpGs covered'}),
                ('five', {'title':'>=5', 
                 'suffix':'%', 'description':'CpG islands with at least five CpGs covered'}),
                ('ten', {'title':'>=10', 
                 'suffix':'%', 'description':'CpG islands with at least ten CpGs covered'}),
            ]), {'id':'cgi-cov-table'})
        )
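Note: every snippet in this collection converges on the same call, table.plot(data, headers, pconfig) from multiqc.plots. A minimal, self-contained sketch of that signature (assuming it runs inside a MultiQC module where the report configuration is already initialised; all names are illustrative):

    from collections import OrderedDict
    from multiqc.plots import table

    # Two samples, one percentage column
    data = {
        'sample_1': {'Exonic': 42.1},
        'sample_2': {'Exonic': 39.8},
    }
    headers = OrderedDict()
    headers['Exonic'] = {
        'title': 'Exonic',
        'description': 'Exonic CpGs',
        'min': 0, 'max': 100, 'suffix': '%',
    }
    html = table.plot(data, headers, {'id': 'demo-table', 'namespace': 'Demo'})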
Example #2
    def make_basic_table(self, file_type):
        """  Create table of key-value items in 'file_type'.
        """

        table_data = {sample: items['kv']
                for sample, items
                in self.mod_data[file_type].items()
        }
        table_headers = {column_header: {
                    'title': column_header,
                    'description': description,
                }
                for column_header, description
                in file_types[file_type]['kv_descriptions'].items()
        }
        tconfig = {
            'namespace': 'BBTools'
        }
        for sample in table_data:
            for key, value in table_data[sample].items():
                try:
                    table_data[sample][key] = float(value)
                except ValueError:
                    pass
        return table.plot(table_data, table_headers, tconfig)
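The comprehension above assumes a module-level file_types registry in which each kv_descriptions value is a plain description string (contrast with Example #7, where it is a (description, header_options) tuple). A hypothetical shape, for illustration only:

    file_types = {
        'stats': {
            'kv_descriptions': {
                'Reads': 'Total number of input reads',
                'Mapped': 'Number of reads mapped to the reference',
            },
        },
    }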
Example #3
    def index_metrics_details_table(self,data):
        headers = OrderedDict()
        headers['% Read Identified (PF)'] = {
            'title': '% Reads Identified (PF)',
            'description': 'The percentage of passing filter (PF) reads mapped to this index.',
            'suffix': '%',
        }
        headers['Index 1 (I7)'] = {
            'title': 'Index 1 (I7)',
            'description': 'The sequence for the first Index Read.'
        }
        headers['Index 2 (I5)'] = {
            'title': 'Index 2 (I5)',
            'description': 'The sequence for the second Index Read.'
        }

        table_config = {
            'namespace': 'interop',
            'id': 'interop-indexmetrics-details-table',
            'table_title': 'Index Read Statistics Details',
            'col1_header': 'Run - Sample - Lane',
        }

        tdata = {}
        for s_name in data:
            for key in data[s_name]['details']:
                tdata["{} - {}".format(s_name,key)]=data[s_name]['details'][key]

        return table.plot(tdata, headers, table_config)
Example #4
    def table_qfiltered(self):
        """ Table showing stats for q-filtered reads """

        description = 'MinIONQC statistics for quality filtered reads. ' + \
                        'Quality thresholds used: {}.'.format(', '.join(list(self.q_threshold_list)))
        if len(self.q_threshold_list) > 1:
            description += '''
            <div class="alert alert-warning">
              <span class="glyphicon glyphicon-warning-sign"></span>
              <strong>Warning!</strong> More than one quality threshold was present.
            </div>
            '''
            log.warning('More than one quality threshold was present. Thresholds: {}.'.format(', '.join(list(self.q_threshold_list))))

        self.add_section (
            name = 'Stats: Quality filtered reads',
            anchor = 'minionqc-stats-qFilt',
            description = description,
            plot = table.plot(
                self.qfilt_data,
                self.headers_to_use(),
                {
                    'namespace': 'MinIONQC',
                    'id': 'minionqc-stats-qFilt-table',
                    'table_title': 'MinIONQC Stats: Quality filtered reads'
                }
            )
        )
Example #5
    def chart_retention_rate_byread(self):

        mdata_byread = {}
        for sid, dd in self.mdata['retention_rate_byread'].items():
            sid = sid.replace('_totalReadConversionRate','')
            mdata_byread[sid] = dd

        mdata_bybase = {}
        for sid, dd in self.mdata['retention_rate_bybase'].items():
            sid = sid.replace('_totalBaseConversionRate','')
            mdata_bybase[sid] = dd

        pdata = {}
        for sid, dd in mdata_byread.items():
            pdata[sid] = dict(list(dd.items()) + list(mdata_bybase[sid].items()))

        pheader = OrderedDict()
        pheader['ca'] = {'title':'r.CpA', 'format':'{:,.2g}', 'min':0, 'max':100, 'description':'CpA', 'suffix':'%'}
        pheader['cc'] = {'title':'r.CpC', 'format':'{:,.2g}', 'min':0, 'max':100, 'description':'CpC', 'suffix':'%'}
        pheader['cg'] = {'title':'r.CpG', 'format':'{:,.2g}', 'min':0, 'max':100, 'description':'CpG', 'suffix':'%'}
        pheader['ct'] = {'title':'r.CpT', 'format':'{:,.2g}', 'min':0, 'max':100, 'description':'CpT', 'suffix':'%'}
        pheader['bca'] = {'title':'b.CpA', 'format':'{:,.2g}', 'min':0, 'max':100, 'description':'CpA', 'suffix':'%'}
        pheader['bcc'] = {'title':'b.CpC', 'format':'{:,.2g}', 'min':0, 'max':100, 'description':'CpC', 'suffix':'%'}
        pheader['bcg'] = {'title':'b.CpG', 'format':'{:,.2g}', 'min':0, 'max':100, 'description':'CpG', 'suffix':'%'}
        pheader['bct'] = {'title':'b.CpT', 'format':'{:,.2g}', 'min':0, 'max':100, 'description':'CpT', 'suffix':'%'}

        self.add_section(
            name = 'Cytosine Retention',
            anchor = 'biscuit-retention',
            description = "This plot shows cytosine retention rate. `r.` stands for read-averaging and `b.` stands for 'base-averaging.",
            helptext = "**Cytosine retention rate** is `1.0 - cytosine conversion rate`. Assuming full (complete but not over) bisulfite conversion, **cytosine retention rate** is the average cytosine modification (including 5mC, 5hmC etc) rate.",
            plot = table.plot(pdata, pheader))
Example #6
    def __init__(self, c_id, mod):

        modname = mod['config'].get('section_name', c_id.replace('_', ' ').title())
        if modname == '' or modname is None:
            modname = 'Custom Content'

        # Initialise the parent object
        super(MultiqcModule, self).__init__(
            name = modname,
            anchor = mod['config'].get('section_anchor', c_id),
            href = mod['config'].get('section_href'),
            info = mod['config'].get('description')
        )

        pconfig = mod['config'].get('pconfig', {})
        if pconfig.get('title') is None:
            pconfig['title'] = modname

        # Table
        if mod['config'].get('plot_type') == 'table':
            pconfig['sortRows'] = pconfig.get('sortRows', False)
            headers = mod['config'].get('headers')
            self.add_section( plot = table.plot(mod['data'], headers, pconfig) )
            self.write_data_file( mod['data'], "multiqc_{}".format(modname.lower().replace(' ', '_')) )

        # Bar plot
        elif mod['config'].get('plot_type') == 'bargraph':
            self.add_section( plot = bargraph.plot(mod['data'], mod['config'].get('categories'), pconfig) )

        # Line plot
        elif mod['config'].get('plot_type') == 'linegraph':
            self.add_section( plot = linegraph.plot(mod['data'], pconfig) )

        # Scatter plot
        elif mod['config'].get('plot_type') == 'scatter':
            self.add_section( plot = scatter.plot(mod['data'], pconfig) )

        # Heatmap
        elif mod['config'].get('plot_type') == 'heatmap':
            self.add_section( plot = heatmap.plot(mod['data'], mod['config'].get('xcats'), mod['config'].get('ycats'), pconfig) )

        # Beeswarm plot
        elif mod['config'].get('plot_type') == 'beeswarm':
            self.add_section( plot = beeswarm.plot(mod['data'], pconfig) )

        # Raw HTML
        elif mod['config'].get('plot_type') == 'html':
            self.add_section( content = mod['data'] )

        # Raw image file as html
        elif mod['config'].get('plot_type') == 'image':
            self.add_section( content = mod['data'] )

        # Not supplied
        elif mod['config'].get('plot_type') is None:
            log.warning("Plot type not found for content ID '{}'".format(c_id))

        # Not recognised
        else:
            log.warning("Error - custom content plot type '{}' not recognised for content ID {}".format(mod['config'].get('plot_type'), c_id))
Example #7
    def make_basic_table(self, file_type):
        """  Create table of key-value items in 'file_type'.
        """

        table_data = {sample: items['kv']
                for sample, items
                in self.mod_data[file_type].items()
        }
        table_headers = {}
        for column_header, (description, header_options) in file_types[file_type]['kv_descriptions'].items():
            table_headers[column_header] = {
                    'rid': '{}_{}_bbmstheader'.format(file_type, column_header),
                    'title': column_header,
                    'description': description,
            }
            table_headers[column_header].update(header_options)

        tconfig = {
            'id': file_type + '_bbm_table',
            'namespace': 'BBTools'
        }
        for sample in table_data:
            for key, value in table_data[sample].items():
                try:
                    table_data[sample][key] = float(value)
                except ValueError:
                    pass
        return table.plot(table_data, table_headers, tconfig)
Example #8
    def prokka_table(self):
        """ Make basic table of the annotation stats """

        # Specify the order of the different possible categories
        headers = OrderedDict()
        headers['organism'] = {
                'title': 'Organism',
                'description': 'Organism name',
        }
        headers['contigs'] = {
                'title': '# contigs',
                'description': 'Number of contigs in assembly',
                'format': '{:,d}',
        }
        headers['bases'] = {
                'title': '# bases',
                'description': 'Number of nucleotide bases in assembly',
                'format': '{:,d}',
        }
        headers['CDS'] = {
                'title': '# CDS',
                'description': 'Number of annotated CDS',
                'format': '{:,d}',
        }
        headers['rRNA'] = {
                'title': '# rRNA',
                'description': 'Number of annotated rRNA',
                'format': '{:,d}',
        }
        headers['tRNA'] = {
                'title': '# tRNA',
                'description': 'Number of annotated tRNA',
                'format': '{:,d}',
        }
        headers['tmRNA'] = {
                'title': '# tmRNA',
                'description': 'Number of annotated tmRNA',
                'format': '{:,d}',
        }
        headers['misc_RNA'] = {
                'title': '# misc RNA',
                'description': 'Number of annotated misc. RNA',
                'format': '{:,d}',
        }
        headers['sig_peptide'] = {
                'title': '# sig_peptide',
                'description': 'Number of annotated sig_peptide',
                'format': '{:,d}',
        }
        headers['repeat_region'] = {
                'title': '# CRISPR arrays',
                'description': 'Number of annotated CRISPR arrays',
                'format': '{:,d}',
        }
        table_config = {
            'namespace': 'prokka',
            'min': 0,
        }

        return table.plot(self.prokka, headers, table_config)
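The count columns above use the '{:,d}' format spec, which renders integers with thousands separators (assuming the parsed Prokka values are ints; the original '{:i}' is not a valid Python format spec). A quick standalone check:

    assert '{:,d}'.format(1234567) == '1,234,567'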
Example #9
    def clusterflow_commands_table (self):
        """ Make a table of the Cluster Flow commands """

        # I wrote this when I was tired. Sorry if it's incomprehensible.

        desc = '''Every Cluster Flow run will have many different commands.
            MultiQC splits these by whitespace, collects by the tool name
            and shows the first command found. Any terms not found in <em>all</em> subsequent
            calls are replaced with <code>[variable]</code>
            <em>(typically input and output filenames)</em>. Each column is for one Cluster Flow run.'''

        # Loop through pipelines
        tool_cmds = OrderedDict()
        headers = dict()
        for pipeline_id, commands in self.clusterflow_commands.items():
            headers[pipeline_id] = {'scale': False}
            self.var_html = '<span style="background-color:#dedede; color:#999;">[variable]</span>'
            tool_cmd_parts = OrderedDict()
            for cmd in commands:
                s = cmd.split()
                tool = self._guess_cmd_name(s)
                if tool not in tool_cmd_parts.keys():
                    tool_cmd_parts[tool] = list()
                tool_cmd_parts[tool].append(s)


            for tool, cmds in tool_cmd_parts.items():
                cons_cmd = self._replace_variable_chunks(cmds)
                # Try again with first two blocks if all variable
                variable_count = cons_cmd.count(self.var_html)
                if variable_count == len(cmds[0]) - 1 and len(cmds[0]) > 2:
                    for subcmd in set([x[1] for x in cmds]):
                        sub_cons_cmd = self._replace_variable_chunks([cmd for cmd in cmds if cmd[1] == subcmd])
                        sub_tool = "{} {}".format(tool, subcmd)
                        if sub_tool not in tool_cmds:
                            tool_cmds[sub_tool] = dict()
                        tool_cmds[sub_tool][pipeline_id] = '<code style="white-space:nowrap;">{}</code>'.format(" ".join(sub_cons_cmd))
                else:
                    if tool not in tool_cmds:
                        tool_cmds[tool] = dict()
                    tool_cmds[tool][pipeline_id] = '<code style="white-space:nowrap;">{}</code>'.format(" ".join(cons_cmd) )

        table_config = {
            'namespace': 'Cluster Flow',
            'id': 'clusterflow-commands-table',
            'table_title': 'Cluster Flow Commands',
            'col1_header': 'Tool',
            'sortRows': False,
            'no_beeswarm': True
        }
        self.add_section (
            name = 'Commands',
            anchor = 'clusterflow-commands',
            description = desc,
            plot = table.plot(tool_cmds, headers, table_config)
        )
Example #10
    def run_metrics_summary_table(self, data):

        headers = OrderedDict()
        headers['Yield'] = {
            'rid': 'summary_Yield',
            'title': '{}p Yield'.format(config.base_count_prefix),
            'description': 'The number of bases sequenced ({} base pairs over all "usable cycles")'.format(config.base_count_desc),
            'scale': 'PuOr',
            'shared_key': 'base_count',
            'modify': lambda x: (x*1000000000.0) * config.base_count_multiplier, # number is already in gigabases
        }
        headers['Aligned'] = {
            'rid': 'summary_Aligned',
            'title': 'Aligned (%)',
            'description': 'The percentage of the sample that aligned to the PhiX genome',
            'min': 0,
            'max': 100,
            'suffix': '%',
            'scale': 'PiYG'
        }
        headers['Error Rate'] = {
            'title': 'Error Rate (%)',
            'description': 'The calculated error rate, as determined by the PhiX alignment',
            'min': 0,
            'max': 100,
            'suffix': '%',
            'scale': 'OrRd'
        }
        headers['Intensity C1'] = {
            'rid': 'summary_Intensity_C1',
            'title': 'Intensity Cycle 1',
            'description': 'The intensity statistic at cycle 1.',
        }
        headers['%>=Q30'] = {
            'rid': 'summary_Q30',
            'title': '% >= Q30',
            'description': 'Percentage of reads with quality phred score of 30 or above',
            'min': 0,
            'max': 100,
            'suffix': '%',
            'scale': 'RdYlGn'
        }
        table_config = {
            'namespace': 'interop',
            'id': 'interop-runmetrics-summary-table',
            'table_title': 'Read metrics summary',
            'col1_header': 'Run - Read',
        }

        tdata = {}
        for s_name in data:
            for key in data[s_name]['summary']:
                tdata["{} - {}".format(s_name,key)]=data[s_name]['summary'][key]

        return table.plot(tdata, headers, table_config)
Example #11
 def _bcbio_umi_table(self, parsed_data):
     keys = OrderedDict()
     keys['umi_consensus_mapped'] = {
         'title': 'Consensus mapped',
         'description': 'Count of UMI consensus reads mapped',
         'modify': lambda x: x * config.read_count_multiplier,
         'shared_key': 'read_count',
         'format': self.read_format,
     }
     keys['umi_consensus_pct'] = {
         'title': 'Consensus reduction',
         'description': 'Percent of original reads removed by consensus',
         'min': 0,
         'max': 100,
         'suffix': '%',
         'format': '{:,.1f}',
     }
     keys['umi_baseline_all'] = {
         'title': 'Orig. total',
         'description': 'Total reads in the original BAM',
         'modify': lambda x: x * config.read_count_multiplier,
         'shared_key': 'read_count',
         'format': self.read_format,
     }
     keys['umi_baseline_mapped'] = {
         'title': "Orig. mapped",
         'description': 'Count of original mapped reads',
         'modify': lambda x: x * config.read_count_multiplier,
         'shared_key': 'read_count',
         'format': self.read_format,
     }
     keys['umi_baseline_duplicate_pct'] = {
         'title': 'Orig. dup',
         'description': 'Percentage original duplicates',
         'min': 0,
         'max': 100,
         'suffix': '%',
         'format': '{:,.1f}',
     }
     keys['umi_reduction_median'] = {
         'title': 'Dup. reduction (median)',
         'description': 'Reduction in duplicates per position by UMIs (median)',
         'suffix': 'x',
         'format': '{:n}'
     }
     keys['umi_reduction_max'] = {
         'title': 'Dup. reduction (max)',
         'description': 'Reduction in duplicates per position by UMIs (maximum)',
         'suffix': 'x',
         'format': '{:n}'
     }
     return {'name': 'UMI barcode statistics',
             'anchor': 'umi-stats',
             'plot': table.plot(parsed_data, keys)}
Example #12
 def lane_stats_table(self):
     """ Return a table with overview stats for each bcl2fastq lane for a single flow cell """
     headers = OrderedDict()
     headers['total_yield'] = {
         'title': '{} Total Yield'.format(config.base_count_prefix),
         'description': 'Number of bases ({})'.format(config.base_count_desc),
         'min': 0,
         'scale': 'Greens',
         'modify': lambda x: x * config.base_count_multiplier,
         'shared_key': 'base_count'
     }
     headers['total'] = {
         'title': '{} Total Clusters'.format(config.read_count_prefix),
         'description': 'Total number of clusters for this lane ({})'.format(config.read_count_desc),
         'min': 0,
         'scale': 'Blues',
         'modify': lambda x: x * config.read_count_multiplier,
         'shared_key': 'read_count'
     }
     headers['percent_Q30'] = {
         'title': '% bases &ge; Q30',
         'description': 'Percentage of bases with greater than or equal to Q30 quality score',
         'suffix': '%',
         'max': 100,
         'min': 0,
         'scale': 'RdYlGn'
     }
     headers['mean_qscore'] = {
         'title': 'Mean Quality',
          'description': 'Average phred quality score',
         'min': 0,
         'scale': 'Spectral'
     }
     headers['percent_perfectIndex'] = {
         'title': '% Perfect Index',
         'description': 'Percent of reads with perfect index (0 mismatches)',
         'max': 100,
         'min': 0,
         'scale': 'RdYlGn',
         'suffix': '%'
     }
     table_config = {
         'namespace': 'bcl2fastq',
         'id': 'bcl2fastq-lane-stats-table',
         'table_title': 'bcl2fastq Lane Statistics',
         'col1_header': 'Run ID - Lane',
         'no_beeswarm': True
     }
     return table.plot(self.bcl2fastq_bylane, headers, table_config)
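The modify lambdas above rescale raw counts with MultiQC's configurable multipliers before the column format is applied. A sketch of the arithmetic, assuming the default read-count settings (multiplier 0.000001, prefix 'M'):

    read_count_multiplier = 0.000001  # assumed default; normally taken from the MultiQC config
    raw_clusters = 123456789
    print('{:,.2f} M'.format(raw_clusters * read_count_multiplier))  # -> 123.46 M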
Example #13
    def index_metrics_summary_table(self,data):
        headers = OrderedDict()
        headers['Total Reads'] = {
            'title': '{} Reads'.format(config.read_count_prefix),
            'description': 'The total number of reads for this lane ({})'.format(config.read_count_desc),
            'modify': lambda x: float(x) * config.read_count_multiplier,
            'format': '{:,.2f}',
            'shared_key': 'read_count'
        }
        headers['PF Reads'] = {
            'title': '{} PF Reads'.format(config.read_count_prefix),
            'description': 'The total number of passing filter reads for this lane ({})'.format(config.read_count_desc),
            'modify': lambda x: float(x) * config.read_count_multiplier,
            'format': '{:,.2f}',
            'shared_key': 'read_count'
        }
        headers['% Read Identified (PF)'] = {
            'rid': 'summary_reads_identified_pf',
            'title': '% Reads Identified (PF)',
            'description': 'The total fraction of passing filter reads assigned to an index.',
            'suffix': '%',
        }
        headers['CV'] = {
            'title': 'CV',
            'description': 'The coefficient of variation for the number of counts across all indexes.',
            'format': '{:,.2f}',
        }
        headers['Min'] = {
            'title': 'Min',
            'description': 'The lowest representation for any index.'
        }
        headers['Max'] = {
            'title': 'Max',
            'description': 'The highest representation for any index.'
        }
        table_config = {
            'namespace': 'interop',
            'id': 'interop-indexmetrics-summary-table',
            'table_title': 'Index Read Statistics Summary',
            'col1_header': 'Run - Lane',
        }

        tdata = {}
        for s_name in data:
            for key in data[s_name]['summary']:
                tdata["{} - {}".format(s_name,key)]=data[s_name]['summary'][key]

        return table.plot(tdata, headers, table_config)
Example #14
    def table_qALL(self):
        """ Table showing stats for all reads """

        self.add_section (
            name = 'Stats: All reads',
            anchor = 'minionqc-stats-qAll',
            description = 'MinIONQC statistics for all reads',
            plot = table.plot(
                self.minionqc_data,
                self.headers_to_use(),
                {
                    'namespace': 'MinIONQC',
                    'id': 'minionqc-stats-qAll-table',
                    'table_title': 'MinIONQC Stats: All reads'
                }
            )
        )
Example #15
def comp_overlap_table(data):
    """Build a table from the comp overlaps output."""
    headers = OrderedDict()
    headers['comp_rate'] = {
        'title': 'Compare rate',
        'description': 'Percentage of called variants found in the reference set.',
        'namespace': 'GATK',
        'min': 0,
        'max': 100,
        'suffix': '%',
        'format': '{:,.2f}',
        'scale': 'Blues',
    }
    headers['concordant_rate'] = {
        'title': 'Concordant rate',
        'description': 'Percentage of called variants matching alleles in the reference set.',
        'namespace': 'GATK',
        'min': 0,
        'max': 100,
        'suffix': '%',
        'format': '{:,.2f}',
        'scale': 'Blues',
    }
    headers['eval_variants'] = {
        'title': 'M Evaluated variants',
        'description': 'Number of called variants (millions)',
        'namespace': 'GATK',
        'min': 0,
        'modify': lambda x: float(x) / 1000000.0
    }
    headers['known_sites'] = {
        'title': 'M Known sites',
        'description': 'Number of known variants (millions)',
        'namespace': 'GATK',
        'min': 0,
        'modify': lambda x: float(x) / 1000000.0
    }
    headers['novel_sites'] = {
        'title': 'M Novel sites',
        'description': 'Number of novel variants (millions)',
        'namespace': 'GATK',
        'min': 0,
        'modify': lambda x: float(x) / 1000000.0
    }
    table_html = table.plot(data, headers, {'id': 'gatk_compare_overlap', 'table_title': 'GATK - Compare Overlap'})
    return table_html
Example #16
    def clusterflow_pipelines_section(self):
        """ Generate HTML for section about pipelines, generated from
        information parsed from run files. """
        data = dict()
        pids_guessed = ''
        for f,d in self.clusterflow_runfiles.items():
            pid = d.get('pipeline_id', 'unknown')
            if d.get('pipeline_id_guess', False) is True:
                pid += '*'
                pids_guessed = ' Pipeline IDs with an asterisk may be inaccurate.'
            # Count the number of files going into the first module
            num_starting_files = 0
            for step_name, files in d.get('files',{}).items():
                if step_name.startswith('start'):
                    num_starting_files += len(files)
            # Reformat the date so that column sorting works nicely
            if 'pipeline_start_dateparts' in d:
                dt = d['pipeline_start_dateparts']
                d['pipeline_start'] = '{}-{:02d}-{:02d} {:02d}:{:02d}'.format(dt['year'], dt['month'], dt['day'], dt['hour'], dt['minute'])
            if pid not in data:
                data[pid] = d
                data[pid]['num_starting_files'] = int(num_starting_files)
            else:
                data[pid]['num_starting_files'] += int(num_starting_files)

        headers = OrderedDict()
        headers['pipeline_name'] = {'title': 'Pipeline Name'}
        headers['pipeline_start'] = {'title': 'Date Started', 'description': 'Date and time that pipeline was started (YYYY-MM-DD HH:MM)'}
        headers['genome'] = {'title': 'Genome ID', 'description': 'ID of reference genome used'}
        headers['num_starting_files'] = {'title': '# Starting Files', 'format': '{:,.0f}', 'description': 'Number of input files at start of pipeline run.'}
        table_config = {
            'namespace': 'Cluster Flow',
            'id': 'clusterflow-pipelines-table',
            'table_title': 'Cluster Flow Pipelines',
            'col1_header': 'Pipeline ID',
            'no_beeswarm': True,
            'save_file': True
        }
        self.add_section (
            name = 'Pipelines',
            anchor = 'clusterflow-pipelines',
            description = 'Information about pipelines is parsed from <code>*.run</code> files. {}'.format(pids_guessed),
            plot = table.plot(data, headers, table_config),
            content = self.clusterflow_pipelines_printout()
        )
Example #17
 def get_damage_stats(self, fnames):
     """Summarize statistics on samples with DNA damage.
     """
     data = {}
     keys = set([])
     for f in self.find_log_files(fnames):
         with open(os.path.join(f['root'], f['fn'])) as in_handle:
             cur = yaml.safe_load(in_handle)
             keys = keys | set(cur["changes"].keys())
             data[cur["sample"]] = cur["changes"]
     if data:
         cols = OrderedDict()
         for k in sorted(list(keys), reverse=True):
             cols[k] = {
                 "title": k,
                 "format": "{:n}"
             }
         return {"name": "DNA damage and bias filtering",
                 "anchor": "damage-stats",
                 "plot": table.plot(data, cols)}
Example #18
    def chart_qc_cv(self): 

        # sequencing depth and uniformity
        pd = OrderedDict()
        for sid, dd in self.mdata['qc_cv'].items():
            pd[sid] = OrderedDict()
            for ctg in ['all','all_topgc','all_botgc']:
                if ctg in dd:
                    pd[sid]['cv_'+ctg] = dd[ctg]['cv']
                    pd[sid]['mu_'+ctg] = dd[ctg]['mu']

        for sid, dd in self.mdata['qc_cpg_cv'].items():
            if sid not in pd:
                pd[sid] = OrderedDict()
            for ctg in ['cpg','cpg_topgc','cpg_botgc']:
                if ctg in dd:
                    pd[sid]['cv_'+ctg] = dd[ctg]['cv']
                    pd[sid]['mu_'+ctg] = dd[ctg]['mu']

        pheader = OrderedDict()
        pheader['mu_all'] = {'title':'Mu.gnm','description':'Whole Genome'}
        pheader['mu_all_topgc'] = {'title':'Mu.high.gc','description':'Top Decile in GC Content'}
        pheader['mu_all_botgc'] = {'title':'Mu.low.gc','description':'Bottom Decile in GC Content'}
        pheader['cv_all'] = {'title':'CV.gnm','description':'Whole Genome'}
        pheader['cv_all_topgc'] = {'title':'CV.high.gc','description':'Top Decile in GC Content'}
        pheader['cv_all_botgc'] = {'title':'CV.low.gc','description':'Bottom Decile in GC Content'}
        pheader['mu_cpg'] = {'title':'CG.Mu.gnm','description':'All CpGs'}
        pheader['mu_cpg_topgc'] = {'title':'CG.Mu.high.gc','description':'Top Decile in GC Content'}
        pheader['mu_cpg_botgc'] = {'title':'CG.Mu.low.gc','description':'Bottom Decile in GC Content'}
        pheader['cv_cpg'] = {'title':'CG.CV.gnm','description':'All CpGs'}
        pheader['cv_cpg_topgc'] = {'title':'CG.CV.high.gc','description':'Top Decile in GC Content'}
        pheader['cv_cpg_botgc'] = {'title':'CG.CV.low.gc','description':'Bottom Decile in GC Content'}

        self.add_section(
            name = 'Sequencing Depth',
            anchor = 'biscuit-seq-depth',
            description = "This plot shows sequence depth mean and uniformity measured in Coefficient of Variation (mu/sigma), mapQ>40 only. CG.* shows the depth on CpG only. GC contents were measured on 100bp non-overlapping windows.",
            plot = table.plot(pd, pheader))
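A worked example of the uniformity metric used above (values are illustrative):

    # Coefficient of variation: sigma / mu -- lower means more uniform depth.
    mu, sigma = 30.0, 6.0
    cv = sigma / mu  # 0.2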
Example #19
 def get_viral_stats(self, fnames):
     """Provide counts of top viral hits for samples.
     """
     to_show = 5
     data = {}
     for f in self.find_log_files(fnames):
         with open(os.path.join(f['root'], f['fn'])) as in_handle:
             sample_name = in_handle.readline().strip().split()[-1]
             counts = []
             for line in in_handle:
                 contig, count = line.strip().split("\t")
                  counts.append((int(count), contig))  # cast so sorting is numeric, not lexical
             counts.sort(reverse=True)
             if counts:
                 data[sample_name] = {"counts": ", ".join(["%s (%s)" % (v, c) for (c, v) in counts[:to_show]])}
     keys = OrderedDict()
     keys["counts"] = {
         "title": "Virus (count)",
         "description": "Top %s viral sequences, with counts, found in unmapped reads" % to_show
     }
     if data:
         return {"name": "Viral mapping read counts",
                 "anchor": "viral-counts",
                 "plot": table.plot(data, keys)}
Example #20
    def __init__(self):
        """ MultiQC module for processing hap.py output logs """
        super(MultiqcModule, self).__init__(
            name='hap.py',
            anchor='happy',
            href='https://github.com/Illumina/hap.py',
            info=""" is a set of programs based on htslib to benchmark variant calls against gold standard truth datasets. """ +
                 """The default shown fields should give the best overview of quality, but there are many other hidden """ +
                 """fields available. No plots are generated, as hap.py is generally run on single control samples (NA12878, etc.)""" +
                 """<br/><br/>""" +
                 """Ideally, precision, recall and F1 Score should all be as close to 1 as possible."""
        )

        self.happy_seen = set()
        self.happy_data = dict()

        n_files = 0
        for f in self.find_log_files("happy", filehandles=True):
            f['s_name'] = self.clean_s_name(f['s_name'], f['root'])

            n_files += self.parse_log(f)
            self.add_data_source(f)

        if n_files == 0:
            raise UserWarning

        log.info("Found {} reports".format(n_files))

        if len(self.happy_data) > 0:
            self.write_data_file(self.happy_data, 'multiqc_happy_data', data_format="json")

            self.add_section(
                name = "hap.py",
                anchor = "happy-plot",
                plot = table.plot(self.happy_data, gen_headers())
            )
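gen_headers() is defined elsewhere in the hap.py module. A minimal sketch of the kind of mapping it could return; the metric keys below are assumptions based on hap.py's summary CSV columns, not confirmed by this snippet:

    def gen_headers():
        # assumes OrderedDict is imported from collections
        headers = OrderedDict()
        headers['METRIC.Precision'] = {
            'title': 'Precision',
            'description': 'TP / (TP + FP)',
            'min': 0, 'max': 1, 'format': '{:,.4f}', 'scale': 'RdYlGn',
        }
        headers['METRIC.Recall'] = {
            'title': 'Recall',
            'description': 'TP / (TP + FN)',
            'min': 0, 'max': 1, 'format': '{:,.4f}', 'scale': 'RdYlGn',
        }
        return headers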
Example #21
    def nanostat_stats_table(self, stat_type):
        """Take the parsed stats from the Kallisto report and add it to the
        basic stats table at the top of the report"""

        headers_base = OrderedDict()
        headers_base["Active channels"] = {
            "title": "Active channels",
            "description": "Active channels",
            "scale": "Greens",
            "format": "{:,.0f}",
        }
        headers_base["Median read length"] = {
            "title": f"Median length",
            "description": f"Median read length (bp)",
            "suffix": " bp",
            "format": "{:,.0f}",
            "shared_key": "nucleotides",
            "scale": "BuPu",
        }
        headers_base["Mean read length"] = {
            "title": f"Mean length",
            "description": f"Mean read length (bp)",
            "suffix": " bp",
            "scale": "Purples",
            "format": "{:,.0f}",
            "shared_key": "nucleotides",
            "hidden": True,
        }
        headers_base["Read length N50"] = {
            "title": "Read N50",
            "description": "Read length N50",
            "format": "{:,.0f}",
            "suffix": " bp",
            "scale": "RdPu",
        }
        headers_base["Median read quality"] = {
            "title": "Median Qual",
            "description": "Median read quality (Phred scale)",
            "shared_key": "phred_score",
            "scale": "RdYlGn",
        }
        headers_base["Mean read quality"] = {
            "title": "Mean Qual",
            "description": "Mean read quality (Phred scale)",
            "scale": "PiYG",
            "shared_key": "phred_score",
            "hidden": True,
        }
        headers_base["Median percent identity"] = {
            "title": "Median Identity",
            "description": "Median percent identity",
            "min": 0,
            "max": 100,
            "suffix": "%",
            "scale": "RdYlBu",
            "shared_key": "percent_identity",
        }
        headers_base["Average percent identity"] = {
            "title": "Mean Identity",
            "description": "Average percent identity",
            "max": 100,
            "suffix": "%",
            "scale": "Spectral",
            "shared_key": "percent_identity",
            "hidden": True,
        }
        headers_base["Number of reads"] = {
            "title": f"# Reads ({config.long_read_count_prefix})",
            "description": f"Number of reads ({config.long_read_count_desc})",
            "modify": lambda x: x * config.long_read_count_multiplier,
            "shared_key": "long_read_count",
            "scale": "YlGn",
        }
        headers_base["Total bases"] = {
            "title": f"Total Bases ({config.base_count_prefix})",
            "description": f"Total bases ({config.base_count_desc})",
            "modify": lambda x: x * config.base_count_multiplier,
            "shared_key": "base_count",
            "scale": "BrBG",
        }
        headers_base["Total bases aligned"] = {
            "title": f"Aligned Bases ({config.base_count_prefix})",
            "description": f"Total bases aligned ({config.base_count_desc})",
            "modify": lambda x: x * config.base_count_multiplier,
            "shared_key": "base_count",
            "scale": "PuOr",
        }

        # Add the stat_type suffix
        headers = OrderedDict()
        for k in headers_base:
            key = f"{k}_{stat_type}"
            headers[key] = headers_base.get(k, dict()).copy()

        # Table config
        table_config = {
            "namespace": "NanoStat",
            "id": "nanostat_{}_stats_table".format(stat_type.replace(" ",
                                                                     "_")),
            "table_title": f"NanoStat {stat_type}",
        }

        # Add the report section
        description = ""
        if stat_type == "fasta":
            description = "NanoStat statistics from FASTA files."
        if stat_type == "fastq":
            description = "NanoStat statistics from FastQ files."
        if stat_type == "aligned":
            description = "NanoStat statistics from BAM files."
        if stat_type == "seq summary":
            description = "NanoStat statistics from albacore or guppy summary files."

        self.add_section(
            name="{} stats".format(stat_type.replace("_", " ").capitalize()),
            anchor="nanostat_{}_stats".format(stat_type.replace(" ", "_")),
            description=description,
            plot=table.plot(self.nanostat_data, headers, table_config),
        )
Example #22
    def add_coverage_metrics(self):
        data_by_phenotype_by_sample = defaultdict(dict)

        for f in self.find_log_files("dragen/coverage_metrics"):
            data_by_phenotype = parse_coverage_metrics(f)
            if f["s_name"] in data_by_phenotype_by_sample:
                log.debug(
                    "Duplicate sample name found! Overwriting: {}".format(
                        f["s_name"]))
            self.add_data_source(f, section="stats")
            data_by_phenotype_by_sample[f["s_name"]].update(data_by_phenotype)

        # Filter to strip out ignored sample names:
        data_by_phenotype_by_sample = self.ignore_samples(
            data_by_phenotype_by_sample)
        # Merge tumor and normal data:
        data_by_sample = defaultdict(dict)
        for sn in data_by_phenotype_by_sample:
            for phenotype in data_by_phenotype_by_sample[sn]:
                new_sn = sn
                if phenotype == "normal":
                    new_sn = sn + "_normal"
                data_by_sample[new_sn] = data_by_phenotype_by_sample[sn][phenotype]
        if not data_by_sample:
            return set()
        all_metric_names = set()
        for sn, sdata in data_by_sample.items():
            for m in sdata.keys():
                all_metric_names.add(m)
        gen_stats_headers, own_table_headers = make_headers(
            all_metric_names, COV_METRICS)

        self.general_stats_addcols(data_by_sample,
                                   gen_stats_headers,
                                   namespace=NAMESPACE)

        self.add_section(
            name="Coverage metrics",
            anchor="dragen-cov-metrics",
            description="""
            Coverage metrics over a region (where the region can be a target region, 
            a QC coverage region, or the whole genome). Press the `Help` button for details.
            """,
            helptext="""
            The following criteria are used when calculating coverage:
            
            * Duplicate reads and clipped bases are ignored.
            * Only reads with `MAPQ` > `min MAPQ` and bases with `BQ` > `min BQ` are considered
            
            Considering only bases usable for variant calling, _i.e._ excluding:
            
            1. Clipped bases
            2. Bases in duplicate reads
            3. Reads with `MAPQ` < `min MAPQ` (default `20`)
            4. Bases with `BQ` < `min BQ` (default `10`)
            5. Reads with `MAPQ` = `0` (multimappers)

            Note that overlapping mates are double-counted.
            """,
            plot=table.plot(data_by_sample,
                            own_table_headers,
                            pconfig={"namespace": NAMESPACE}),
        )
        return data_by_sample.keys()
Example #23
def parse_reports(self):
    """ Find Picard HsMetrics reports and parse their data """

    # Set up vars
    self.picard_HsMetrics_data = dict()

    # Go through logs and find Metrics
    for f in self.find_log_files("picard/hsmetrics", filehandles=True):
        parsed_data = dict()
        s_name = None
        keys = None
        commadecimal = None
        for l in f["f"]:
            # New log starting
            if "CalculateHsMetrics" in l or "CollectHsMetrics" in l and "INPUT" in l:
                s_name = None
                keys = None

                # Pull sample name from input
                fn_search = re.search(r"INPUT(?:=|\s+)(\[?[^\s]+\]?)",
                                      l,
                                      flags=re.IGNORECASE)
                if fn_search:
                    s_name = os.path.basename(fn_search.group(1).strip("[]"))
                    s_name = self.clean_s_name(s_name, f["root"])
                    parsed_data[s_name] = dict()

            if s_name is not None:
                if "HsMetrics" in l and "## METRICS CLASS" in l:
                    keys = f["f"].readline().strip("\n").split("\t")
                elif keys:
                    vals = l.strip("\n").split("\t")
                    if len(vals) == len(keys):
                        j = "NA"
                        if keys[0] == "BAIT_SET":
                            j = vals[0]
                        parsed_data[s_name][j] = dict()
                        # Check that we're not using commas for decimal places
                        if commadecimal is None:
                            for i, k in enumerate(keys):
                                if k.startswith("PCT_"):
                                    if "," in vals[i]:
                                        commadecimal = True
                                    else:
                                        commadecimal = False
                        for i, k in enumerate(keys):
                            try:
                                if commadecimal:
                                    vals[i] = vals[i].replace(".", "")
                                    vals[i] = vals[i].replace(",", ".")
                                parsed_data[s_name][j][k] = float(vals[i])
                            except ValueError:
                                parsed_data[s_name][j][k] = vals[i]
                    else:
                        s_name = None
                        keys = None

        # Remove empty dictionaries
        for s_name in list(parsed_data.keys()):
            for j in list(parsed_data[s_name].keys()):
                if len(parsed_data[s_name][j]) == 0:
                    parsed_data[s_name].pop(j, None)
            if len(parsed_data[s_name]) == 0:
                parsed_data.pop(s_name, None)

        # Manipulate sample names if multiple baits found
        for s_name in parsed_data.keys():
            for j in parsed_data[s_name].keys():
                this_s_name = s_name
                if len(parsed_data[s_name]) > 1:
                    this_s_name = "{}: {}".format(s_name, j)
                if this_s_name in self.picard_HsMetrics_data:
                    log.debug(
                        "Duplicate sample name found in {}! Overwriting: {}".
                        format(f["fn"], this_s_name))
                self.add_data_source(f, this_s_name, section="HsMetrics")
                self.picard_HsMetrics_data[this_s_name] = parsed_data[s_name][j]

    # Filter to strip out ignored sample names
    self.picard_HsMetrics_data = self.ignore_samples(
        self.picard_HsMetrics_data)

    if len(self.picard_HsMetrics_data) > 0:

        # Write parsed data to a file
        self.write_data_file(self.picard_HsMetrics_data,
                             "multiqc_picard_HsMetrics")

        # Add to general stats table
        # Swap question marks with -1
        data = self.picard_HsMetrics_data
        for s_name in data:
            if data[s_name]["FOLD_ENRICHMENT"] == "?":
                data[s_name]["FOLD_ENRICHMENT"] = -1

        self.general_stats_headers["FOLD_ENRICHMENT"] = {
            "title": "Fold Enrichment",
            "min": 0,
            "format": "{:,.0f}",
            "scale": "Blues",
            "suffix": " X",
        }
        try:
            covs = config.picard_config["general_stats_target_coverage"]
            assert type(covs) == list
            assert len(covs) > 0
            covs = [str(i) for i in covs]
            log.debug("Custom Picard coverage thresholds: {}".format(", ".join(
                [i for i in covs])))
        except (AttributeError, TypeError, AssertionError):
            covs = ["30"]
        for c in covs:
            self.general_stats_headers["PCT_TARGET_BASES_{}X".format(c)] = {
                "id":
                "picard_target_bases_{}X".format(c),
                "title":
                "% Target Bases {}X".format(c),
                "description":
                "Percent of target bases with coverage &ge; {}X".format(c),
                "max":
                100,
                "min":
                0,
                "suffix":
                "%",
                "format":
                "{:,.0f}",
                "scale":
                "RdYlGn",
                "modify":
                lambda x: self.multiply_hundred(x),
            }
        for s_name in data:
            if s_name not in self.general_stats_data:
                self.general_stats_data[s_name] = dict()
            self.general_stats_data[s_name].update(data[s_name])
        self.add_section(name="HSMetrics",
                         anchor="picard_hsmetrics",
                         plot=table.plot(data, _get_table_headers(data)))
        tbases = _add_target_bases(data)
        self.add_section(name=tbases["name"],
                         anchor=tbases["anchor"],
                         description=tbases["description"],
                         plot=tbases["plot"])
        hs_pen_plot = hs_penalty_plot(data)
        if hs_pen_plot is not None:
            self.add_section(
                name="HS Penalty",
                anchor="picard_hsmetrics_hs_penalty",
                description=
                'The "hybrid selection penalty" incurred to get 80% of target bases to a given coverage.',
                helptext="""
                    Can be used with the following formula:

                    ```
                    required_aligned_bases = bait_size_bp * desired_coverage * hs_penalty
                    ```
                """,
                plot=hs_pen_plot,
            )

    # Return the number of detected samples to the parent module
    return len(self.picard_HsMetrics_data)
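The comma-decimal handling above normalises European-style numbers ('1.234,56') before calling float(). The same transformation in isolation, as a standalone sketch rather than the module's own code:

    def normalise_decimal(val, commadecimal):
        """Convert '1.234,56' style strings to 1234.56 when commas mark decimals."""
        if commadecimal:
            val = val.replace('.', '').replace(',', '.')
        return float(val)

    assert normalise_decimal('1.234,56', True) == 1234.56
    assert normalise_decimal('1234.56', False) == 1234.56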
Example #24
    def quast_table(self):
        """ Write some more statistics about the assemblies in a table. """

        headers = OrderedDict()
        headers['N50'] = {
            'title': 'N50 (Kbp)',
            'description': 'N50 is the contig length such that using longer or equal length contigs produces 50% of the bases of the assembly (kilo base pairs)',
            'min': 0,
            'suffix': ' kbp',
            'scale': 'RdYlGn',
            'modify': lambda x: x / 1000
        }

        headers['N75'] = {
            'title': 'N75 (Kbp)',
            'description': 'N75 is the contig length such that using longer or equal length contigs produces 75% of the bases of the assembly (kilo base pairs)',
            'min': 0,
            'suffix': ' kbp',
            'scale': 'RdYlGn',
            'modify': lambda x: x / 1000
        }
        headers['L50'] = {
            'title': 'L50 (k)',
            'description': 'L50 is the number of contigs larger than N50, i.e. the minimum number of contigs comprising 50% of the total assembly length.',
            'min': 0,
            'suffix': '',
            'scale': 'RdYlGn-rev',
            'modify': lambda x: x / 1000
        }
        headers['L75'] = {
            'title': 'L75 (k)',
            'description': 'L75 is the number of contigs larger than N75, i.e. the minimum number of contigs comprising 75% of the total assembly length.',
            'min': 0,
            'suffix': '',
            'scale': 'RdYlGn-rev',
            'modify': lambda x: x / 1000
        }
        headers['Largest contig'] = {
            'title': 'Largest contig (Kbp)',
            'description': 'The length of the largest contig in the assembly (kilo base pairs).',
            'min': 0,
            'suffix': ' kbp',
            'scale': 'YlGn',
            'modify': lambda x: x / 1000
        }
        headers['Total length'] = {
            'title': 'Length (Mbp)',
            'description': 'The total number of bases in the assembly (mega base pairs).',
            'min': 0,
            'suffix': ' Mbp',
            'scale': 'YlGn',
            'modify': lambda x: x / 1000000
        }
        headers['# misassemblies'] = {
            'title': 'Misassemblies',
            'description': 'The number of positions in the assembled contigs where the left flanking sequence aligns over 1 kbp away from the right flanking sequence on the reference, or they overlap by more than 1 kbp (relocation), or flanking sequences align on different strands (inversion), or on different chromosomes (translocation).',
            'scale': 'RdYlGn-rev',
            'format': '{:,.0f}'
        }
        headers['# mismatches per 100 kbp'] = {
            'title': 'Mismatches/100kbp',
            'description': 'The number of mismatches per 100 kbp',
            'scale': 'YlOrRd',
            'format': '{:,.2f}',
        }
        headers['# indels per 100 kbp'] = {
            'title': 'Indels/100kbp',
            'description': 'The number of indels per 100 kbp',
            'scale': 'YlOrRd',
            'format': '{:,.2f}',
        }
        headers['# genes'] = {
            'title': 'Genes',
            'description': '# Genes',
            'scale': 'YlGnBu',
            'format': '{:,.0f}',
            'shared_key': 'gene_count'
        }
        headers['# genes_partial'] = {
            'title': 'Genes (Partial)',
            'description': '# Genes (Partial)',
            'scale': 'YlGnBu',
            'format': '{:,.0f}',
            'shared_key': 'gene_count'
        }
        headers['# predicted genes (unique)'] = {
            'title': 'Genes',
            'description': '# Predicted Genes (Unique)',
            'scale': 'YlGnBu',
            'format': '{:,.0f}',
            'shared_key': 'gene_count'
        }
        headers['Genome fraction (%)'] = {
            'title': 'Genome Fraction',
            'description': 'The total number of aligned bases in the reference, divided by the genome size.',
            'max': 100,
            'suffix': '%',
            'scale': 'YlGn'
        }
        config = {
            'id': 'quast_table',
            'namespace': 'QUAST',
            'min': 0,
        }
        return table.plot(self.quast_data, headers, config)
Example #25
def parse_reports(self):
    """ Find Picard HsMetrics reports and parse their data """

    # Set up vars
    self.picard_HsMetrics_data = dict()

    # Go through logs and find Metrics
    for f in self.find_log_files('picard/hsmetrics', filehandles=True):
        parsed_data = dict()
        s_name = None
        keys = None
        commadecimal = None
        for l in f['f']:
            # New log starting
            if ('picard.analysis.directed.CalculateHsMetrics' in l or
                    'picard.analysis.directed.CollectHsMetrics' in l) and 'INPUT' in l:
                s_name = None
                keys = None

                # Pull sample name from input
                fn_search = re.search(r"INPUT=(\[?[^\s]+\]?)", l)
                if fn_search:
                    s_name = os.path.basename(fn_search.group(1).strip('[]'))
                    s_name = self.clean_s_name(s_name, f['root'])
                    parsed_data[s_name] = dict()

            if s_name is not None:
                if 'picard.analysis.directed.HsMetrics' in l and '## METRICS CLASS' in l:
                    keys = f['f'].readline().strip("\n").split("\t")
                elif keys:
                    vals = l.strip("\n").split("\t")
                    if len(vals) == len(keys):
                        j = 'NA'
                        if keys[0] == 'BAIT_SET':
                            j = vals[0]
                        parsed_data[s_name][j] = dict()
                        # Check that we're not using commas for decimal places
                        if commadecimal is None:
                            for i, k in enumerate(keys):
                                if k.startswith('PCT_'):
                                    if ',' in vals[i]:
                                        commadecimal = True
                                    else:
                                        commadecimal = False
                        for i, k in enumerate(keys):
                            try:
                                if commadecimal:
                                    vals[i] = vals[i].replace('.', '')
                                    vals[i] = vals[i].replace(',', '.')
                                parsed_data[s_name][j][k] = float(vals[i])
                            except ValueError:
                                parsed_data[s_name][j][k] = vals[i]
                    else:
                        s_name = None
                        keys = None

        # Remove empty dictionaries
        for s_name in list(parsed_data.keys()):
            for j in list(parsed_data[s_name].keys()):
                if len(parsed_data[s_name][j]) == 0:
                    parsed_data[s_name].pop(j, None)
            if len(parsed_data[s_name]) == 0:
                parsed_data.pop(s_name, None)

        # Manipulate sample names if multiple baits found
        for s_name in parsed_data.keys():
            for j in parsed_data[s_name].keys():
                this_s_name = s_name
                if len(parsed_data[s_name]) > 1:
                    this_s_name = "{}: {}".format(s_name, j)
                if this_s_name in self.picard_HsMetrics_data:
                    log.debug("Duplicate sample name found in {}! Overwriting: {}".format(f['fn'], this_s_name))
                self.add_data_source(f, this_s_name, section='HsMetrics')
                self.picard_HsMetrics_data[this_s_name] = parsed_data[s_name][j]


    # Filter to strip out ignored sample names
    self.picard_HsMetrics_data = self.ignore_samples(self.picard_HsMetrics_data)

    if len(self.picard_HsMetrics_data) > 0:

        # Write parsed data to a file
        self.write_data_file(self.picard_HsMetrics_data, 'multiqc_picard_HsMetrics')

        # Add to general stats table
        # Swap question marks with -1
        data = self.picard_HsMetrics_data
        for s_name in data:
            if data[s_name]['FOLD_ENRICHMENT'] == '?':
                data[s_name]['FOLD_ENRICHMENT'] = -1

        self.general_stats_headers['FOLD_ENRICHMENT'] = {
            'title': 'Fold Enrichment',
            'min': 0,
            'format': '{:,.0f}',
            'scale': 'Blues',
        }
        try:
            covs = config.picard_config['general_stats_target_coverage']
            assert type(covs) == list
            assert len(covs) > 0
            covs = [str(i) for i in covs]
            log.debug("Custom Picard coverage thresholds: {}".format(", ".join([i for i in covs])))
        except (AttributeError, TypeError, AssertionError):
            covs = ['30']
        for c in covs:
            self.general_stats_headers['PCT_TARGET_BASES_{}X'.format(c)] = {
                'id': 'picard_target_bases_{}X'.format(c),
                'title': 'Target Bases {}X'.format(c),
                'description': 'Percent of target bases with coverage &ge; {}X'.format(c),
                'max': 100,
                'min': 0,
                'suffix': '%',
                'format': '{:,.0f}',
                'scale': 'RdYlGn',
                'modify': lambda x: self.multiply_hundred(x)
            }
        for s_name in data:
            if s_name not in self.general_stats_data:
                self.general_stats_data[s_name] = dict()
            self.general_stats_data[s_name].update( data[s_name] )
        data_table = _clean_table(data)
        self.add_section (
                name = 'HSMetrics',
                anchor = 'picard_hsmetrics',
                plot = table.plot(data_table, _get_headers(data_table))
        )
        tbases = _add_target_bases(data)
        self.add_section (
            name = tbases['name'],
            anchor = tbases['anchor'],
            description = tbases['description'],
            plot = tbases['plot']
        )
        hs_pen = _add_hs_penalty(data)
        if hs_pen is not None:
            self.add_section (
                name = hs_pen['name'],
                anchor = hs_pen['anchor'],
                description = hs_pen['description'],
                plot = hs_pen['plot']
            )

    # Return the number of detected samples to the parent module
    return len(self.picard_HsMetrics_data)
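
The comma-vs-point decimal handling above is easy to get wrong, so here is the conversion isolated as a minimal sketch (the helper name is hypothetical, not part of the MultiQC module):

def _coerce_metric(raw, commadecimal=False):
    # Convert a Picard metrics field to float, normalising European-style
    # numbers ('1.234,56' -> 1234.56); non-numeric values pass through.
    if commadecimal:
        raw = raw.replace('.', '').replace(',', '.')
    try:
        return float(raw)
    except ValueError:
        return raw

assert _coerce_metric('1.234,56', commadecimal=True) == 1234.56
assert _coerce_metric('0.75') == 0.75
assert _coerce_metric('NA') == 'NA'
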
Example No. 26
    def parse_plotCoverage(self):
        """Find plotCoverage output. Both stdout and --outRawCounts"""
        self.deeptools_plotCoverageStdout = dict()
        for f in self.find_log_files("deeptools/plotCoverageStdout"):
            parsed_data = self.parsePlotCoverageStdout(f)
            for k, v in parsed_data.items():
                if k in self.deeptools_plotCoverageStdout:
                    log.warning("Replacing duplicate sample {}.".format(k))
                self.deeptools_plotCoverageStdout[k] = v

            if len(parsed_data) > 0:
                self.add_data_source(f, section="plotCoverage")

        self.deeptools_plotCoverageOutRawCounts = dict()
        for f in self.find_log_files("deeptools/plotCoverageOutRawCounts"):
            parsed_data = self.parsePlotCoverageOutRawCounts(f)
            for k, v in parsed_data.items():
                if k in self.deeptools_plotCoverageOutRawCounts:
                    log.warning("Replacing duplicate sample {}.".format(k))
                self.deeptools_plotCoverageOutRawCounts[k] = v

            if len(parsed_data) > 0:
                self.add_data_source(f, section="plotCoverage")

        self.deeptools_plotCoverageStdout = self.ignore_samples(
            self.deeptools_plotCoverageStdout)
        self.deeptools_plotCoverageOutRawCounts = self.ignore_samples(
            self.deeptools_plotCoverageOutRawCounts)

        if len(self.deeptools_plotCoverageStdout) > 0:
            header = OrderedDict()
            header["min"] = {
                "title": "Min",
                "description": "Minimum Coverage",
                "shared_key": "coverage"
            }
            header["25%"] = {
                "rid": "first_quartile",
                "title": "1st Quartile",
                "description": "First quartile coverage",
                "shared_key": "coverage",
            }
            header["50%"] = {
                "rid": "median",
                "title": "Median",
                "description": "Median coverage (second quartile)",
                "shared_key": "coverage",
            }
            header["mean"] = {
                "title": "Mean",
                "description": "Mean coverage",
                "shared_key": "coverage"
            }
            header["75%"] = {
                "rid": "third_quartile",
                "title": "3rd Quartile",
                "description": "Third quartile coverage",
                "shared_key": "coverage",
            }
            header["max"] = {
                "title": "Max",
                "description": "Maximum coverage",
                "shared_key": "coverage"
            }
            header["std"] = {
                "title": "Std. Dev.",
                "description": "Coverage standard deviation",
                "shared_key": "coverage",
            }
            config = {"namespace": "deepTools plotCoverage"}
            self.add_section(
                name="Coverage metrics",
                anchor="deeptools_coverage_metrics",
                plot=table.plot(self.deeptools_plotCoverageStdout, header,
                                config),
            )

        if len(self.deeptools_plotCoverageOutRawCounts) > 0:
            config = {
                "id": "deeptools_coverage_metrics_plot",
                "title": "deepTools: Coverage distribution",
                "xlab": "Coverage",
                "ylab": "Fraction of bases sampled",
            }
            self.add_section(
                name="Coverage distribution",
                anchor="deeptools_coverage_distribution",
                description="The fraction of bases with a given number of read/fragment coverage",
                plot=linegraph.plot(self.deeptools_plotCoverageOutRawCounts,
                                    config),
            )

        return len(self.deeptools_plotCoverageStdout), len(
            self.deeptools_plotCoverageOutRawCounts)
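
For orientation, the table.plot call above expects parsePlotCoverageStdout to produce one flat dict per sample whose keys match the header IDs defined above; a hypothetical example (sample name and values invented):

example_stdout_data = {
    "sample_1": {"min": 0.0, "25%": 12.0, "50%": 18.0, "mean": 19.4,
                 "75%": 25.0, "max": 310.0, "std": 9.7},
}
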
Example No. 27
def parse_reports(self):
    """Find Picard CrosscheckFingerprints reports and parse their data.

    Stores the data in "Sample/Group - Sample/Group" groups since CrosscheckFingerprints
    does pairwise comparisons between samples at the level selected by `--CROSSCHECK_BY`.
    """

    self.picard_CrosscheckFingerprints_data = dict()

    # Go through logs and find Metrics
    for f in self.find_log_files("picard/crosscheckfingerprints",
                                 filehandles=True):
        # Parse an individual CrosscheckFingerprints Report
        (metrics, comments) = _take_till(
            f["f"], lambda line: line.startswith("#") or line == "\n")
        header = next(metrics).rstrip("\n").split("\t")
        if not "LEFT_GROUP_VALUE" in header:
            # Not a CrosscheckFingerprints Report
            continue
        reader = DictReader(metrics, fieldnames=header, delimiter="\t")
        # Parse out the tumor awareness option and the lod threshold setting if possible
        (tumor_awareness, lod_threshold) = _parse_cli(comments[1])
        for i, row in enumerate(reader):
            # Check if this row contains samples that should be ignored
            if self.is_ignore_sample(
                    row["LEFT_SAMPLE"]) or self.is_ignore_sample(
                        row["RIGHT_SAMPLE"]):
                continue

            # Clean the sample names
            row["LEFT_SAMPLE"] = self.clean_s_name(row["LEFT_SAMPLE"],
                                                   f["root"])
            row["LEFT_GROUP_VALUE"] = self.clean_s_name(
                row["LEFT_GROUP_VALUE"], f["root"])
            row["RIGHT_SAMPLE"] = self.clean_s_name(row["RIGHT_SAMPLE"],
                                                    f["root"])
            row["RIGHT_GROUP_VALUE"] = self.clean_s_name(
                row["RIGHT_GROUP_VALUE"], f["root"])

            # Set the cli options of interest for this file
            row["LOD_THRESHOLD"] = lod_threshold
            row["TUMOR_AWARENESS"] = tumor_awareness
            self.picard_CrosscheckFingerprints_data[i] = row

    # Only add sections if we found data
    if len(self.picard_CrosscheckFingerprints_data) > 0:
        # For each sample, flag any comparisons that don't start with "Expected".
        # A sample whose results are not all "Expected" will show as `False` and be red.
        general_stats_data = _create_general_stats_data(
            self.picard_CrosscheckFingerprints_data)
        general_stats_headers = {
            "Crosschecks All Expected": {
                "title": "Crosschecks",
                "description": "All results for samples CrosscheckFingerprints were as expected.",
            }
        }
        self.general_stats_addcols(general_stats_data, general_stats_headers)

        # Add a table section to the report
        self.add_section(
            name="Crosscheck Fingerprints",
            anchor="picard-crosscheckfingerprints",
            description="Pairwise identity checking betwen samples and groups.",
            helptext="""
            Checks that all data in the set of input files comes from the same individual, based on the selected group granularity.
            """,
            plot=table.plot(
                self.picard_CrosscheckFingerprints_data,
                _get_table_headers(self.picard_CrosscheckFingerprints_data),
                {
                    "namespace": "Picard",
                    "id": "picard_crosscheckfingerprints_table",
                    "table_title": "Picard: Crosscheck Fingerprints",
                    "save_file": True,
                    "col1_header": "ID",
                    "no_beeswarm": True,
                },
            ),
        )

    return len(self.picard_CrosscheckFingerprints_data)
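
The _take_till helper used above is defined elsewhere in the module; a plausible sketch of what it does, judging from the call site (comment lines are consumed into a list, and the first non-matching line is pushed back so that next(metrics) returns the header):

from itertools import chain

def _take_till(iterator, predicate):
    # Consume items while `predicate` holds, returning
    # (rest_of_iterator, consumed_items).
    taken = []
    for item in iterator:
        if not predicate(item):
            # Prepend the item that broke the predicate so the caller can read it
            return chain([item], iterator), taken
        taken.append(item)
    return iter([]), taken
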
Example No. 28
    def chart_read_avg_retention_rate(self):
        '''
        Charts _totalReadConversionRate.txt
               _totalBaseConversionRate.txt
        Inputs:
            No inputs
        Returns:
            No returns, generates Retention vs. Base Position in Read chart
        '''

        mdata_byread = {}
        for s_name, dd in self.mdata['read_avg_retention_rate'].items():
            mdata_byread[s_name] = dd

        mdata_bybase = {}
        for s_name, dd in self.mdata['base_avg_retention_rate'].items():
            mdata_bybase[s_name] = dd

        pdata = {}
        for s_name, dd in mdata_byread.items():
            try:
                pdata[s_name] = dict(
                    list(dd.items()) + list(mdata_bybase[s_name].items()))
            except KeyError:
                log.warning(
                    "Couldn't find sample when making avg_retention_rate plot: '{}'"
                    .format(s_name))

        shared = {
            'format': '{:,.2f}',
            'min': 0,
            'max': 100,
            'suffix': '%',
            'scale': 'YlGnBu'
        }

        pheader = OrderedDict()
        pheader['rca'] = dict(
            shared, **{
                'title': 'RA CpA',
                'description': 'Read Averaged CpA Retention'
            })
        pheader['rcc'] = dict(
            shared, **{
                'title': 'RA CpC',
                'description': 'Read Averaged CpC Retention'
            })
        pheader['rcg'] = dict(
            shared, **{
                'title': 'RA CpG',
                'description': 'Read Averaged CpG Retention'
            })
        pheader['rct'] = dict(
            shared, **{
                'title': 'RA CpT',
                'description': 'Read Averaged CpT Retention'
            })
        pheader['bca'] = dict(
            shared, **{
                'title': 'BA CpA',
                'description': 'Base Averaged CpA Retention'
            })
        pheader['bcc'] = dict(
            shared, **{
                'title': 'BA CpC',
                'description': 'Base Averaged CpC Retention'
            })
        pheader['bcg'] = dict(
            shared, **{
                'title': 'BA CpG',
                'description': 'Base Averaged CpG Retention'
            })
        pheader['bct'] = dict(
            shared, **{
                'title': 'BA CpT',
                'description': 'Base Averaged CpT Retention'
            })

        pconfig = {
            'id': 'biscuit_retention',
            'table_title': 'BISCUIT: Cytosine Retention',
            'sortRows': False,
            'save_file': True
        }

        self.add_section(name='Cytosine Retention',
                         anchor='biscuit-retention',
                         description='''
                Shows the cytosine retention rate for different contexts.
                `RA`: Read-averaged rates.
                `BA`: Base-averaged rates.
            ''',
                         helptext='''
                The cytosine retention rate is calculated as `1 - (cytosine conversion rate)`.

                Assuming complete, but not over, bisulfite conversion, the cytosine retention rate
                is the average cytosine modification (including 5mC, 5hmC, etc) rate.
            ''',
                         plot=table.plot(pdata, pheader, pconfig))
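
The dict(shared, **{...}) idiom used for the column definitions above merges the shared defaults with per-column overrides without mutating shared; a quick illustration:

shared = {'min': 0, 'max': 100, 'suffix': '%'}
col = dict(shared, **{'title': 'RA CpG'})
assert col == {'min': 0, 'max': 100, 'suffix': '%', 'title': 'RA CpG'}
assert 'title' not in shared  # the shared defaults are left untouched
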
Example No. 29
    def table(self, json, overall_pe, overall_se, zeroes, index):

        # Table construction. Taken from MultiQC docs.

        # If no PE or SE basepairs were removed, return nothing
        if (overall_pe + overall_se) == 0:
            return ""

        headers = OrderedDict()

        # If values are large enough, report percentages rather than raw counts
        if not zeroes:
            headers["Nt_%_BP_Lost" + index] = {
                "title": "% Bp Lost",
                "namespace": "% Bp Lost",
                "description": "Percentage of Input bps (SE and PE) trimmed.",
                "suffix": "%",
                "format": "{:,.2f}",
                "scale": "Greens",
            }
        else:
            headers["Nt_BP_Lost" + index] = {
                "title": "Total Bp Lost",
                "namespace": "Total Bp Lost",
                "description": "Total input bps (SE and PE) trimmed.",
                "format": "{:,.0f}",
                "scale": "Greens",
            }

        # If PE data present, add columns
        if overall_pe != 0:
            headers["Nt_%_R1_BP_Lost" + index] = {
                "title": "% R1 of Bp Lost",
                "namespace": "% Bp Lost from R1",
                "description": "Percentage of total trimmed bps.",
                "suffix": "%",
                "format": "{:,.2f}",
                "scale": "RdPu",
            }
            headers["Nt_%_R2_BP_Lost" + index] = {
                "title": "% R2 of Bp Lost",
                "namespace": "% Bp Lost from R2",
                "description": "Percentage of total trimmed bps.",
                "suffix": "%",
                "format": "{:,.2f}",
                "scale": "Greens",
            }

        # If SE data, add columns
        if overall_se != 0:
            headers["Nt_%_SE_BP_Lost" + index] = {
                "title": "% SE of Bp Lost",
                "namespace": "% Bp Lost from SE",
                "description": "Percentage of total trimmed bps.",
                "suffix": "%",
                "format": "{:,.2f}",
                "scale": "RdPu",
            }

        # If data is large enough, include the average
        if not zeroes:
            headers["Nt_Avg_BP_Trimmed" + index] = {
                "title": "Avg. Bps Trimmed",
                "namespace": "Avg. Bps Trimmed",
                "description": "Average Number of Basepairs Trimmed per Read",
                "format": "{:,.2f}",
                "scale": "Blues",
            }

        headers["Nt_%_Discarded" + index] = {
            "title": "% Discarded",
            "namespace": "% Discarded",
            "description": "Percentage of Reads (SE and PE) Discarded",
            "suffix": "%",
            "max": 100,
            "format": "{:,.2f}",
            "scale": "Oranges",
        }

        headers["Nt_Notes" + index] = {"title": "Notes", "namespace": "Notes", "description": "Notes"}

        return table.plot(json, headers)
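
Note how every column key carries the index suffix; presumably this keeps column IDs unique when the same table is generated once per HTStream tool instance in a pipeline. A minimal illustration (index values hypothetical):

# Two instances of the tool yield distinct, non-clashing column keys.
for index in ("_1", "_2"):
    key = "Nt_%_BP_Lost" + index  # -> "Nt_%_BP_Lost_1", "Nt_%_BP_Lost_2"
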
Example No. 30
    def sample_stats_table(self):
        sample_stats_data = dict()
        total_reads = self._total_reads_all_runs()

        for sample_id, sample in self.bclconvert_bysample.items():
            # percent stats for bclconvert-bysample i.e. stats for sample across all lanes
            try:
                perfect_percent = "{0:.1f}".format(
                    float(100.0 * sample["perfect_index_reads"] /
                          sample["reads"]))
            except ZeroDivisionError:
                perfect_percent = "0.0"
            try:
                one_mismatch_percent = "{0:.1f}".format(
                    float(100.0 * sample["one_mismatch_index_reads"] /
                          sample["reads"]))
            except ZeroDivisionError:
                one_mismatch_percent = "0.0"

            try:
                yield_q30_percent = "{0:.1f}".format(
                    float(100.0 * (sample["basesQ30"] / sample["yield"])))
            except ZeroDivisionError:
                yield_q30_percent = "0.0"  #

            try:
                percent_yield = (float(sample["yield"]) / float(
                    (total_reads) * (self.cluster_length))) * 100.0
            except ZeroDivisionError:
                percent_yield = "NA"

            try:
                percent_reads = (float(sample["reads"]) /
                                 float(total_reads)) * 100.0
            except ZeroDivisionError:
                percent_reads = "NA"

            sample_stats_data[sample_id] = {
                "depth": sample["depth"],
                "basesQ30": sample["basesQ30"],
                "reads": sample["reads"],
                "percent_reads": percent_reads,
                "yield": sample["yield"],
                "percent_yield": percent_yield,
                "yield_q30_percent": yield_q30_percent,
                # "perfect_index": samle['perfect_index_reads'], # don't need these
                # "one_mismatch_index_reads": sample['one_mismatch_index_reads'],
                "perfect_pecent": perfect_percent,
                "one_mismatch_pecent": one_mismatch_pecent,
            }

        headers = OrderedDict()
        if sample["depth"] != "NA":
            headers["depth"] = {
                "title": "Coverage",
                "description": (
                    "Estimated sequencing depth based on the number of bases with quality score greater or equal to Q30, "
                    "assuming the genome size is {}, as provided in config".format(self._get_genome_size())
                ),
                "min": 0,
                "suffix": "X",
                "scale": "BuPu",
            }

        headers["reads"] = {
            "title":
            "{} Clusters".format(config.read_count_prefix),
            "description":
            "Total number of clusters (read pairs) for this sample as determined by bclconvert demultiplexing ({})"
            .format(config.read_count_desc),
            "scale":
            "Blues",
            "shared_key":
            "read_count",
        }
        headers["yield"] = {
            "title":
            "Yield ({})".format(config.base_count_prefix),
            "description":
            "Total number of bases for this sample as determined by bclconvert demultiplexing ({})"
            .format(config.base_count_desc),
            "scale":
            "Greens",
            "shared_key":
            "base_count",
        }
        headers["percent_reads"] = {
            "title": "% Clusters",
            "description":
            "Percentage of clusters (read pairs) for this sample in this run, as determined by bclconvert demultiplexing",
            "scale": "Blues",
            "max": 100,
            "min": 0,
            "suffix": "%",
        }
        headers["percent_yield"] = {
            "title": "% Yield",
            "description":
            "Percentage of sequenced bases for this sample in this run",
            "scale": "Greens",
            "max": 100,
            "min": 0,
            "suffix": "%",
        }
        headers["basesQ30"] = {
            "title":
            "Bases ({}) &ge; Q30 (PF)".format(config.base_count_prefix),
            "description":
            "Number of bases with a Phred score of 30 or higher, passing filter ({})"
            .format(config.base_count_desc),
            "scale":
            "Blues",
            "shared_key":
            "base_count",
        }
        headers["yield_q30_percent"] = {
            "title":
            "% Bases &ge; Q30 (PF)",
            "description":
            "Percent of bases with a Phred score of 30 or higher, passing filter ({})"
            .format(config.base_count_desc),
            "scale":
            "Greens",
            "max":
            100,
            "min":
            0,
            "suffix":
            "%",
        }
        headers["perfect_pecent"] = {
            "title": "% Perfect Index",
            "description":
            "Percent of reads with perfect index (0 mismatches)",
            "max": 100,
            "min": 0,
            "scale": "RdYlGn",
            "suffix": "%",
        }
        headers["one_mismatch_pecent"] = {
            "title": "% One Mismatch Index",
            "description": "Percent of reads with one mismatch index",
            "max": 100,
            "min": 0,
            "scale": "RdYlGn",
            "suffix": "%",
        }

        # Table config
        table_config = {
            "namespace": "bclconvert",
            "id": "bclconvert-sample-stats-table",
            "table_title": "bclconvert Sample Statistics",
            "no_beeswarm": True,
        }

        return table.plot(sample_stats_data, headers, table_config)
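
As a sanity check on the percent_yield arithmetic above, with hypothetical numbers:

# Hypothetical run: 1,000,000 clusters at a cluster length of 300 bp
# (e.g. 2 x 150 bp), of which this sample yielded 30,000,000 bases.
total_reads = 1_000_000
cluster_length = 300
sample_yield = 30_000_000
percent_yield = (float(sample_yield) / float(total_reads * cluster_length)) * 100.0
assert percent_yield == 10.0  # the sample accounts for 10% of the run's bases
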
Example No. 31
    def lane_stats_table(self):
        for lane_id, lane in self.bclconvert_bylane.items():
            try:
                yield_q30_percent = "{0:.1f}".format(
                    float(100.0 * (lane["basesQ30"] / lane["yield"])))
            except ZeroDivisionError:
                yield_q30_percent = "0.0"
            self.bclconvert_bylane[lane_id]["yield_q30_percent"] = yield_q30_percent

        headers = OrderedDict()
        if lane["depth"] != "NA":
            headers["depth-lane"] = {
                "title":
                "Coverage",
                "description":
                ("Estimated sequencing depth based on the number of bases with quality score greater or equal to Q30, "
                 "assuming the genome size is {}, as provided in config".
                 format(self._get_genome_size())),
                "suffix":
                "X",
                "scale":
                "BuPu",
            }

        headers["reads-lane"] = {
            "title":
            "{} Clusters".format(config.read_count_prefix),
            "description":
            "Total number of clusters (read pairs) for this sample as determined by bclconvert demultiplexing ({})"
            .format(config.read_count_desc),
            "scale":
            "Blues",
            "shared_key":
            "read_count",
        }
        headers["yield-lane"] = {
            "title":
            "Yield ({})".format(config.base_count_prefix),
            "description":
            "Total number of bases for this sample as determined by bclconvert demultiplexing ({})"
            .format(config.base_count_desc),
            "scale":
            "Greens",
            "shared_key":
            "base_count",
        }
        headers["basesQ30-lane"] = {
            "title":
            "Bases ({}) &ge; Q30 (PF)".format(config.base_count_prefix),
            "description":
            "Number of bases with a Phred score of 30 or higher, passing filter ({})"
            .format(config.base_count_desc),
            "scale":
            "Blues",
            "shared_key":
            "base_count",
        }
        headers["yield_q30_percent-lane"] = {
            "title": "% Bases &ge; Q30 (PF)",
            "description":
            "Percent of bases with a Phred score of 30 or higher, passing filter",
            "max": 100,
            "min": 0,
            "scale": "Greens",
        }
        headers["perfect_index_reads-lane"] = {
            "title":
            "{} Perfect Index".format(config.read_count_prefix),
            "description":
            "Reads with perfect index - 0 mismatches ({})".format(
                config.read_count_desc),
            "scale":
            "Blues",
            "shared_key":
            "read_count",
        }

        headers["one_mismatch_index_reads-lane"] = {
            "title":
            "{} One Mismatch".format(config.read_count_prefix),
            "description":
            "Reads with one mismatch index ({})".format(
                config.read_count_desc),
            "scale":
            "Spectral",
            "shared_key":
            "read_count",
        }
        headers["percent_perfectIndex-lane"] = {
            "title": "% Perfect Index",
            "description":
            "Percent of reads with perfect index - 0 mismatches",
            "max": 100,
            "min": 0,
            "scale": "RdYlGn",
            "suffix": "%",
        }
        headers["percent_oneMismatch-lane"] = {
            "title": "% One Mismatch",
            "description": "Percent of reads with one mismatch",
            "max": 100,
            "min": 0,
            "scale": "RdYlGn",
            "suffix": "%",
        }

        # Table config
        table_config = {
            "namespace": "bclconvert-lane",
            "id": "bclconvert-lane-stats-table",
            "table_title": "bclconvert Lane Statistics",
            "col1_header": "Run ID - Lane",
            "no_beeswarm": True,
        }

        # new dict with matching keys for plotting (this avoids duplicate html id linting errors)
        bclconvert_bylane_foroutput = dict()
        for laneid, lanestats in self.bclconvert_bylane.items():
            if laneid not in bclconvert_bylane_foroutput:
                bclconvert_bylane_foroutput[laneid] = dict()
            for key, value in lanestats.items():
                bclconvert_bylane_foroutput[laneid][key + "-lane"] = value

        return table.plot(bclconvert_bylane_foroutput, headers, table_config)
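
The key-suffixing loop at the end can be expressed as a nested comprehension; an equivalent sketch:

bclconvert_bylane_foroutput = {
    laneid: {key + "-lane": value for key, value in lanestats.items()}
    for laneid, lanestats in self.bclconvert_bylane.items()
}
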
Example No. 32
    def linegraph(self, json, index):

        # plot configurations, list of options in MultiQC docs
        config = {
            "id": "htstream_superdedup_" + index,
            "title": "HTStream: Duplicate Saturation",
            "xlab": "Total Reads",
            "ylab": "Unique Reads",
        }

        # initialize data structures and variables
        data = {}
        invariant_saturation_dict = {}
        html = "<h4> SuperDeduper: Duplicate Saturation </h4>\n"
        html += "<p>Plots the number of duplicates against the number of unique reads per sample.</p>"

        for key in json.keys():

            # if the duplicate saturation histogram has only one data point, it is added to
            # 'invariant_saturation_dict', where it will be represented as a table instead of a hideous graph.
            if len(json[key]["Sd_Saturation"]) == 1:
                invariant_saturation_dict[key] = {
                    "Sd_Total_Reads": json[key]["Sd_Saturation"][0][0],
                    "Sd_Duplicates": json[key]["Sd_Saturation"][0][1],
                }

            # if more than one data point is identified (low bar, I know), it will be added to the graph's data
            # 	dictionary. Data points represented as dictionary: {x: y}.
            else:
                data[key] = {}

                for item in json[key]["Sd_Saturation"]:

                    data[key][item[0]] = item[0] - item[1]

        # checks for any invariant samples and creates an alert div and table to hold the data.
        if len(invariant_saturation_dict.keys()) != 0:

            # table
            headers = OrderedDict()
            headers["Sd_Total_Reads"] = {
                "title": "Total Reads",
                "namespace": "Total Reads",
                "description": "Number of Total Reads",
                "format": "{:,.0f}",
                "scale": "Greens",
            }
            headers["Sd_Duplicates"] = {
                "title": "Total Reads - Duplicates",
                "namespace": "Duplicates",
                "description": "Number of Duplicates",
                "format": "{:,.0f}",
                "scale": "RdPu",
            }

            # add to output html
            notice = (
                "<strong>Notice:</strong> Samples with uniform duplication numbers identified (displayed below). <br />"
            )
            html += '<div class="alert alert-info">{n}</div>'.format(n=notice)
            html += table.plot(invariant_saturation_dict, headers)

        # creates line graph only if samples with more than one data point are present.
        if data != {}:
            html += linegraph.plot(data, config)

        return html
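
To make the plotted transform concrete: each Sd_Saturation entry is a [total_reads, duplicates] pair, and the y value is the unique-read count. With hypothetical numbers:

saturation = [[1000, 50], [2000, 180], [4000, 700]]
series = {total: total - dups for total, dups in saturation}
# -> {1000: 950, 2000: 1820, 4000: 3300}: unique reads flatten out
#    as duplication saturates.
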
Example No. 33
  def detail_1(self, id, data, title='Detailed numbers of performance assessment based on reference datasets', section_name='Detailed numbers of performance assessment based on reference datasets', description="", helptext=None):
    """ Create the HTML for detailed numbers of performance assessment based on reference datasets """
    headers = OrderedDict()
    headers['SNV number'] = {
      'title': 'SNV Num',
      'description': 'SNV Total Number',
      'scale': False
    }

    headers['SNV precision'] = {
      'title': 'SNV precision',
      'description': 'SNV Precision',
      'scale': False,
      'format': '{:.2f}'
    }

    headers['SNV recall'] = {
      'title': 'SNV recall',
      'description': 'SNV Recall',
      'scale': False,
      'format': '{:.2f}'
    }

    headers['SNV F1-score'] = {
      'title': 'SNV F1-score',
      'description': 'SNV F1-score',
      'scale': False,
      'format': '{:.2f}'
    }

    headers['INDEL number'] = {
      'title': 'INDEL Num',
      'description': 'INDEL Total Number',
      'scale': False
    }
    
    headers['INDEL precision'] = {
      'title': 'INDEL precision',
      'description': 'INDEL Precision',
      'scale': False,
      'format': '{:.2f}'
    }

    headers['INDEL recall'] = {
      'title': 'INDEL recall',
      'description': 'INDEL Recall',
      'scale': False,
      'format': '{:.2f}'
    }

    headers['INDEL F1-score'] = {
      'title': 'INDEL F1-score',
      'description': 'INDEL F1-score',
      'scale': False,
      'format': '{:.2f}'
    }

    table_config = {
      'namespace': 'variant_calling_qc_details',
      'id': id,
      'table_title': 'Detailed numbers of performance assessment based on reference datasets',
      'col1_header': 'Sample',
      'no_beeswarm': False,
      'sortRows': False,
      'format': '{:.0f}',
      'max_table_rows': 20,
      'decimalPoint_format': ',',
      'thousandsSep_format': ",",
      'save_file': True
    }

    # Add a report section with the table
    self.add_section(
      name = section_name if section_name else '',
      anchor = id + '_anchor',
      description = description if description else '',
      plot = table.plot(data, headers, table_config)
    )
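
For reference, the F1-score columns above are the harmonic mean of the matching precision and recall columns; a quick check with made-up numbers:

precision, recall = 0.98, 0.94
f1 = 2 * precision * recall / (precision + recall)
assert abs(f1 - 0.9596) < 1e-4
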
Example No. 34
    def parse_bcftools_stats(self):
        """
        Find bcftools stats logs and parse their data
          Bcftools stats reports contain 'sets' of data, which can
          have multiple vcf files each (but usually don't). Here,
          we treat each 'set' as a MultiQC sample, taking the first
          input filename for each set as the name.
        """
        collapse_complementary = getattr(config, "bcftools", {}).get(
            "collapse_complementary_changes", False)
        if collapse_complementary:
            types = ["A>C", "A>G", "A>T", "C>A", "C>G", "C>T"]
        else:
            types = [
                "A>C", "A>G", "A>T", "C>A", "C>G", "C>T", "G>A", "G>C", "G>T",
                "T>A", "T>C", "T>G"
            ]

        self.bcftools_stats = dict()
        self.bcftools_stats_indels = dict()
        self.bcftools_stats_vqc_snp = dict()
        self.bcftools_stats_vqc_transi = dict()
        self.bcftools_stats_vqc_transv = dict()
        self.bcftools_stats_vqc_indels = dict()
        depth_data = dict()
        for f in self.find_log_files("bcftools/stats"):
            s_names = list()
            for line in f["f"].splitlines():
                s = line.split("\t")
                # Get the sample names - one per 'set'
                if s[0] == "ID":
                    s_name = self.clean_s_name(s[2], f["root"])
                    s_names.append(s_name)
                    if s_name in self.bcftools_stats:
                        log.debug(
                            "Duplicate sample name found! Overwriting: {}".
                            format(s_name))
                    self.add_data_source(f, s_name, section="stats")
                    self.bcftools_stats[s_name] = dict()
                    self.bcftools_stats_indels[s_name] = dict()
                    self.bcftools_stats_vqc_snp[s_name] = dict()
                    self.bcftools_stats_vqc_transi[s_name] = dict()
                    self.bcftools_stats_vqc_transv[s_name] = dict()
                    self.bcftools_stats_vqc_indels[s_name] = dict()
                    depth_data[s_name] = OrderedDict()
                    self.bcftools_stats_indels[s_name][0] = None  # Avoid joining line across missing 0

                # Parse key stats
                if s[0] == "SN" and len(s_names) > 0:
                    s_name = s_names[int(s[1])]
                    field = s[2].strip()[:-1]
                    field = field.replace(" ", "_")
                    value = float(s[3].strip())
                    self.bcftools_stats[s_name][field] = value

                # Parse transitions/transversions stats
                if s[0] == "TSTV" and len(s_names) > 0:
                    s_name = s_names[int(s[1])]
                    fields = [
                        "ts", "tv", "tstv", "ts_1st_ALT", "tv_1st_ALT",
                        "tstv_1st_ALT"
                    ]
                    for i, field in enumerate(fields):
                        value = float(s[i + 2].strip())
                        self.bcftools_stats[s_name][field] = value

                # Parse substitution types
                if s[0] == "ST" and len(s_names) > 0:
                    s_name = s_names[int(s[1])]

                    rc = {"A": "T", "C": "G", "G": "C", "T": "A"}
                    change = s[2].strip()
                    if change not in types:
                        change = ">".join(rc[n] for n in change.split(">"))

                    field = "substitution_type_{}".format(change)
                    value = float(s[3].strip())
                    if field not in self.bcftools_stats[s_name]:
                        self.bcftools_stats[s_name][field] = 0
                    self.bcftools_stats[s_name][field] += value

                # Indel length distributions
                if s[0] == "IDD" and len(s_names) > 0:
                    s_name = s_names[int(s[1])]
                    length = float(s[2].strip())
                    count = float(s[3].strip())
                    self.bcftools_stats_indels[s_name][length] = count

                # Per-sample counts
                if s[0] == "PSC" and len(s_names) > 0:
                    s_name = s_names[int(s[1])]
                    fields = ["variations_hom", "variations_het"]
                    for i, field in enumerate(fields):
                        self.bcftools_stats[s_name][field] = int(s[i + 4].strip())

                # Depth plots
                if s[0] == "DP" and len(s_names) > 0:
                    s_name = s_names[int(s[1])]
                    bin_name = s[2].strip()
                    percent_sites = float(s[-1].strip())
                    depth_data[s_name][bin_name] = percent_sites

                # Variant Qualities
                if s[0] == "QUAL" and len(s_names) > 0:
                    s_name = s_names[int(s[1])]
                    quality = float(s[2].strip())
                    self.bcftools_stats_vqc_snp[s_name][quality] = float(
                        s[3].strip())
                    self.bcftools_stats_vqc_transi[s_name][quality] = float(
                        s[4].strip())
                    self.bcftools_stats_vqc_transv[s_name][quality] = float(
                        s[5].strip())
                    self.bcftools_stats_vqc_indels[s_name][quality] = float(
                        s[6].strip())

        # Filter to strip out ignored sample names
        self.bcftools_stats = self.ignore_samples(self.bcftools_stats)

        if len(self.bcftools_stats) > 0:

            # Write parsed report data to a file
            self.write_data_file(self.bcftools_stats, "multiqc_bcftools_stats")

            # Stats Table
            stats_headers = self.bcftools_stats_genstats_headers()
            if getattr(config, "bcftools", {}).get("write_general_stats",
                                                   True):
                self.general_stats_addcols(self.bcftools_stats, stats_headers,
                                           "Bcftools Stats")
            if getattr(config, "bcftools", {}).get("write_separate_table",
                                                   False):
                self.add_section(name="Bcftools Stats",
                                 anchor="bcftools-stats",
                                 plot=table.plot(self.bcftools_stats,
                                                 stats_headers))

            # Make bargraph plot of substitution types
            keys = OrderedDict()
            for t in types:
                keys["substitution_type_{}".format(t)] = {"name": t}
            pconfig = {
                "id": "bcftools-stats-subtypes",
                "title": "Bcftools Stats: Substitutions",
                "ylab": "# Substitutions",
                "cpswitch_counts_label": "Number of Substitutions",
            }
            self.add_section(
                name="Variant Substitution Types",
                anchor="bcftools-stats",
                plot=bargraph.plot(self.bcftools_stats, keys, pconfig),
            )

            # Make histograms of variant quality
            if len(self.bcftools_stats_vqc_snp) > 0:
                pconfig = {
                    "id": "bcftools_stats_vqc",
                    "title": "Bcftools Stats: Variant Quality Count",
                    "ylab": "Count",
                    "xlab": "Quality",
                    "xDecimals": False,
                    "ymin": 0,
                    "smooth_points": 600,
                    # 'tt_label': '<b>{point.x} bp trimmed</b>: {point.y:.0f}',
                    "data_labels": [
                        {"name": "Count SNP", "ylab": "Quality"},
                        {"name": "Count Transitions", "ylab": "Quality"},
                        {"name": "Count Transversions", "ylab": "Quality"},
                        {"name": "Count Indels", "ylab": "Quality"},
                    ],
                }
                self.add_section(
                    name="Variant Quality",
                    anchor="bcftools-stats_variant_quality_plot",
                    plot=linegraph.plot(
                        [
                            self.bcftools_stats_vqc_snp,
                            self.bcftools_stats_vqc_transi,
                            self.bcftools_stats_vqc_transv,
                            self.bcftools_stats_vqc_indels,
                        ],
                        pconfig,
                    ),
                )

            # Make line graph of indel lengths
            if len(self.bcftools_stats_indels) > 0:
                pconfig = {
                    "id": "bcftools_stats_indel-lengths",
                    "title": "Bcftools Stats: Indel Distribution",
                    "ylab": "Count",
                    "xlab": "InDel Length (bp)",
                    "xDecimals": False,
                    "ymin": 0,
                }
                self.add_section(
                    name="Indel Distribution",
                    anchor="bcftools-stats_indel_plot",
                    plot=linegraph.plot(self.bcftools_stats_indels, pconfig),
                )
            # Make line graph of variants per depth
            if len(depth_data) > 0:
                pconfig = {
                    "id": "bcftools_stats_depth",
                    "title": "Bcftools Stats: Variant depths",
                    "ylab": "Fraction of sites (%)",
                    "xlab": "Variant depth",
                    "ymin": 0,
                    "ymax": 100,
                    "categories": True,
                }
                self.add_section(
                    name="Variant depths",
                    anchor="bcftools-stats_depth_plot",
                    description="Read depth support distribution for called variants",
                    plot=linegraph.plot(depth_data, pconfig),
                )

        # Return the number of logs that were found
        return len(self.bcftools_stats)
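
A few synthetic bcftools stats lines of the kind this parser consumes (tab-separated; set ID 0, values invented), annotated with the branch that handles each:

example_lines = [
    "ID\t0\tsample_1.vcf.gz",            # registers the sample name for set 0
    "SN\t0\tnumber of SNPs:\t30245",     # -> bcftools_stats['number_of_SNPs'] = 30245.0
    "TSTV\t0\t20123\t10122\t1.99\t20000\t10000\t2.00",  # ts/tv fields
]
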
Example No. 35
    def heatmap(self):
        """
        Generates Heatmap for samples considering all features
        Computes the Jensen Shannon Divergence between all samples
        Value 0 corresponds to similar samples and 1 to dissimilar samples
        Output : Plots heatmap and table showing presence of missing features in samples
        """
        names = []
        gc_names = []
        seq_names = []
        missing_names = {}
        gc_exists = {}
        seq_exists = {}
        for f in self.find_log_files('salmon/fld'):
            if os.path.basename(f['root']) == 'libParams':
                s_name = os.path.abspath(f['root'])
                path = s_name[:-10]
                sample_name = self.get_sample_name(s_name)

                if 'no_bias' in s_name:
                    continue
                path_meta_info = os.path.join(path, 'aux_info', 'meta_info.json')
                with open(path_meta_info, 'r') as info:
                    meta_info = json.load(info)

                    gc_exists[sample_name] = meta_info['gc_bias_correct']
                    seq_exists[sample_name] = meta_info['seq_bias_correct']

                if gc_exists[sample_name]:
                    gc_names.append(sample_name)
                    if sample_name not in missing_names:
                        missing_names[sample_name] = {}
                    missing_names[sample_name]['Missing GC Feature'] = 'No'
                else:
                    if sample_name not in missing_names:
                        missing_names[sample_name] = {}
                    missing_names[sample_name]['Missing GC Feature'] = 'Yes'
                if seq_exists[sample_name]:
                    seq_names.append(sample_name)
                    if sample_name not in missing_names:
                        missing_names[sample_name] = {}
                    missing_names[sample_name]['Missing Seq Feature'] = 'No'
                else:
                    if sample_name not in missing_names:
                        missing_names[sample_name] = {}
                    missing_names[sample_name]['Missing Seq Feature'] = 'Yes'
                if gc_exists[sample_name] and seq_exists[sample_name]:
                    names.append(sample_name)

        sims_gc = [[0 for j in range(len(gc_names))] for i in range(len(gc_names))]
        sims_3 = [[0 for j in range(len(seq_names))] for i in range(len(seq_names))]
        sims_5 = [[0 for j in range(len(seq_names))] for i in range(len(seq_names))]
        sims = [[0 for j in range(len(names))] for i in range(len(names))]

        for i in range(len(names)):
            for j in range(len(names)):
                feature_count = 0
                if gc_exists[names[i]] and gc_exists[names[j]]:
                    sims[i][j] += self.jensen_shannon_divergence(self.matrix_gc[names[i]], self.matrix_gc[names[j]])
                    feature_count += 1.0
                for k in range(len(self.nucleotides)):
                    if seq_exists[names[i]] and seq_exists[names[j]]:
                        sims[i][j] += self.jensen_shannon_divergence(self.matrix_seq3[k][names[i]], self.matrix_seq3[k][names[j]])
                        sims[i][j] += self.jensen_shannon_divergence(self.matrix_seq5[k][names[i]], self.matrix_seq5[k][names[j]])
                        feature_count += 2.0

                sims[i][j] /= feature_count
        for i in range(len(gc_names)):
            for j in range(len(gc_names)):
                if gc_exists[gc_names[i]] and gc_exists[gc_names[j]]:
                    sims_gc[i][j] += self.jensen_shannon_divergence(self.matrix_gc[gc_names[i]], self.matrix_gc[gc_names[j]])

        for i in range(len(seq_names)):
            for j in range(len(seq_names)):
                for k in range(len(self.nucleotides)):
                    if seq_exists[seq_names[i]] and seq_exists[seq_names[j]]:
                        sims_3[i][j] += self.jensen_shannon_divergence(self.matrix_seq3[k][seq_names[i]], self.matrix_seq3[k][seq_names[j]])
                        sims_5[i][j] += self.jensen_shannon_divergence(self.matrix_seq5[k][seq_names[i]], self.matrix_seq5[k][seq_names[j]])
                sims_3[i][j] /= (1.0*len(self.nucleotides))
                sims_5[i][j] /= (1.0*len(self.nucleotides))
        pconfig_sim = {
            'title': 'Sample similarity (JSD)',
            'xTitle': 'Samples',
            'yTitle': 'Samples',
        }
        pconfig_sim_gc = {
            'title': 'Feature GC Sample similarity (JSD)',
            'xTitle': 'Samples',
            'yTitle': 'Samples',
        }
        pconfig_sim_3 = {
            'title': 'Feature Seq 3 Sample similarity (JSD)',
            'xTitle': 'Samples',
            'yTitle': 'Samples',
        }
        pconfig_sim_5 = {
            'title': 'Feature Seq 5 Sample similarity (JSD)',
            'xTitle': 'Samples',
            'yTitle': 'Samples',
        }

        if len(gc_names) > 0:
            self.add_section(plot = heatmap.plot(sims_gc, gc_names, pconfig=pconfig_sim_gc))
        if len(seq_names) > 0:
            self.add_section(plot = heatmap.plot(sims_3, seq_names, pconfig=pconfig_sim_3))
            self.add_section(plot = heatmap.plot(sims_5, seq_names, pconfig=pconfig_sim_5))
        if len(names) > 0:
            self.add_section(plot = heatmap.plot(sims, names, pconfig=pconfig_sim))

        self.add_section(plot = table.plot(missing_names))
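
The jensen_shannon_divergence helper is defined elsewhere in the module; a standard NumPy sketch of the quantity it computes (a symmetrised, smoothed relative entropy between two probability vectors, bounded by 1 with base-2 logarithms, matching the 0-to-1 scale in the docstring):

import numpy as np

def jensen_shannon_divergence(p, q):
    # 0 for identical distributions; log2 keeps the result in [0, 1].
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    p, q = p / p.sum(), q / q.sum()
    m = 0.5 * (p + q)

    def kl(a, b):
        mask = a > 0  # 0 * log(0) is taken as 0
        return np.sum(a[mask] * np.log2(a[mask] / b[mask]))

    return 0.5 * kl(p, m) + 0.5 * kl(q, m)
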
Example No. 36
	def verifybamid_table(self):
		"""
		Create a table with all the columns from verify BAM ID
		"""

		# create an ordered dictionary to preserve the order of columns
		headers = OrderedDict()
		# add each column and the title and description (taken from verifyBAMID website)
		headers['RG'] = {
			'title': 'Read Group',
			'description': 'ReadGroup ID of sequenced lane.',
			'hidden': all( [ s['RG'] == 'ALL' for s in self.verifybamid_data.values() ] )
		}
		if not self.hide_chip_columns:
			headers['CHIP_ID'] = {
				'title': 'Chip ID',
				'description': 'Sample ID of the matching external genotype (chip) data.'
			}
		headers['#SNPS'] = {
			'title': 'SNPS',
			'description': '# SNPs passing the criteria from the VCF file',
			'format': '{:,.0f}',
			'min': 0,
			'scale': 'BuPu'
		}
		headers['#READS'] = {
			'title': '{} Reads'.format(config.read_count_prefix),
			'description': 'Number of reads loaded from the BAM file ({})'.format(config.read_count_desc),
			'format': '{:,.1f}',
			'modify': lambda x: x * config.read_count_multiplier if x != "NA" else x,
			'shared_key': 'read_count',
			'min': 0,
			'scale': 'GnBu'
		}
		headers['AVG_DP'] = {
			'title': 'Average Depth',
			'description': 'Average sequencing depth at the sites in the VCF file',
			'suffix': ' X',
			'min': 0,
			'scale': 'YlGn'
		}
		# use default columns
		headers['FREEMIX'] = dict(self.col_config_defaults, **{
			'title': 'Contamination (Seq)',
			'description': 'VerifyBamID: FREEMIX - Sequence-only estimate of contamination.',
		})
		headers['FREELK1'] = {
			'title': 'FREELK1',
			'format': '{:,.0f}',
			'description': 'Maximum log-likelihood of the sequence reads given estimated contamination under sequence-only method',
			'min': 0,
			'scale': 'RdYlGn'
		}
		headers['FREELK0'] = {
			'title': 'FREELK0',
			'format': '{:,.0f}',
			'description': 'Log-likelihood of the sequence reads given no contamination under sequence-only method',
			'min': 0,
			'scale': 'RdYlGn'
		}
		headers['FREE_RH'] = {
			'title': 'FREE_RH',
			'description': 'Estimated reference bias parameter Pr(refBase|HET) (when --free-refBias or --free-full is used)',
			'hidden': all( [ s['FREE_RH'] == 'NA' for s in self.verifybamid_data.values() ] ),
		}
		headers['FREE_RA'] = {
			'title': 'FREE_RA',
			'description': 'Estimated reference bias parameter Pr(refBase|HOMALT) (when --free-refBias or --free-full is used)',
			'hidden': all( [ s['FREE_RA'] == 'NA' for s in self.verifybamid_data.values() ] ),
		}

		# Only print Chip columns to the report if we have data
		if not self.hide_chip_columns:
			headers['CHIPMIX'] = dict(self.col_config_defaults, **{
				'title': 'Contamination S+A',
				'description': 'VerifyBamID: CHIPMIX - Sequence+array estimate of contamination (NA if the external genotype is unavailable)'
			})
			headers['CHIPLK1'] = {
				'title': 'CHIPLK1',
				'description': 'Maximum log-likelihood of the sequence reads given estimated contamination under sequence+array method (NA if the external genotypes are unavailable)'
			}
			headers['CHIPLK0'] = {
				'title': 'CHIPLK0',
				'description': 'Log-likelihood of the sequence reads given no contamination under sequence+array method (NA if the external genotypes are unavailable)'
			}
			headers['CHIP_RH'] = {
				'title': 'CHIP_RH',
				'description': 'Estimated reference bias parameter Pr(refBase|HET) (when --chip-refBias or --chip-full is used)'
			}
			headers['CHIP_RA'] = {
				'title': 'CHIP_RA',
				'description': 'Estimated reference bias parameter Pr(refBase|HOMALT) (when --chip-refBias or --chip-full is used)'
			}

		headers['DPREF'] = {
			'title': 'DPREF',
			'description': 'Depth (Coverage) of HomRef site (based on the genotypes of (SELF_SM/BEST_SM), passing mapQ, baseQual, maxDepth thresholds.',
			'hidden': all( [ s['DPREF'] == 'NA' for s in self.verifybamid_data.values() ] ),
		}
		headers['RDPHET'] = {
			'title': 'RDPHET',
			'description': 'DPHET/DPREF, Relative depth to HomRef site at Heterozygous site.',
			'hidden': all( [ s['RDPHET'] == 'NA' for s in self.verifybamid_data.values() ] ),
		}
		headers['RDPALT'] = {
			'title': 'RDPALT',
			'description': 'DPALT/DPREF, Relative depth to HomRef site at HomAlt site.',
			'hidden': all( [ s['RDPALT'] == 'NA' for s in self.verifybamid_data.values() ] ),
		}

		tconfig = {
			'namespace': 'VerifyBAMID',
			'id': 'verifybamid-results',
		}

		# send the plot to add section function with data dict and headers
		self.add_section (
			anchor = 'verifybamid-table',
			description = 'The following values provide estimates of sample contamination. Click help for more information.',
			helptext = '''
			**Please note that `FREEMIX` is named _Contamination (Seq)_ and `CHIPMIX`
			is named _Contamination (S+A)_ in this MultiQC report.**

			VerifyBamID provides a series of information that is informative to determine
			whether the sample is possibly contaminated or swapped, but there is no single
			criterion that works for every circumstance. There are a few unmodeled factors
			in the estimation of `[SELF-IBD]/[BEST-IBD]` and `[%MIX]`, so please note that the
			MLE estimation may not always exactly match to the true amount of contamination.
			Here we provide a guideline to flag potentially contaminated/swapped samples:

			* Each sample or lane can be checked in this way.
			  When `[CHIPMIX] >> 0.02` and/or `[FREEMIX] >> 0.02`, meaning 2% or more of
			  non-reference bases are observed in reference sites, we recommend to examine
			  the data more carefully for the possibility of contamination.
			* We recommend to check each lane for the possibility of sample swaps.
			  When `[CHIPMIX] ~ 1` AND `[FREEMIX] ~ 0`, then it is possible that the sample
			  is swapped with another sample. When `[CHIPMIX] ~ 0` in `.bestSM` file,
			  `[CHIP_ID]` might be actually the swapped sample. Otherwise, the swapped
			  sample may not exist in the genotype data you have compared.
			* When genotype data is not available but allele-frequency-based estimates of
			  `[FREEMIX] >= 0.03` and `[FREELK1]-[FREELK0]` is large, then it is possible
			  that the sample is contaminated with other sample. We recommend to use
			  per-sample data rather than per-lane data for checking this for low coverage
			  data, because the inference will be more confident when there are large number
			  of bases with depth 2 or higher.

			_Copied from the [VerifyBAMID documentation](https://genome.sph.umich.edu/wiki/VerifyBamID) - see the link for more details._
			''',
			plot = table.plot(self.verifybamid_data, headers, tconfig)
		)
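
Following the guideline quoted in the helptext, a hypothetical post-processing check that flags samples worth a closer look (the helper name and threshold handling are illustrative, not part of the module):

def flag_possible_contamination(verifybamid_data, threshold=0.02):
    # Return sample names whose FREEMIX or CHIPMIX exceeds the 2%
    # guideline from the VerifyBamID documentation; 'NA' values are skipped.
    flagged = []
    for s_name, d in verifybamid_data.items():
        for metric in ('FREEMIX', 'CHIPMIX'):
            value = d.get(metric, 'NA')
            if value != 'NA' and float(value) > threshold:
                flagged.append(s_name)
                break
    return flagged
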
Example No. 37
    def somalier_stats_table(self):
        """Add data to somalier stats table

        Bigger table within the somalier module, showing more stats"""

        headers = OrderedDict()

        headers["phenotype"] = {
            "title": "Phenotype",
            "description": "Sample's phenotype from pedigree info",
            "hidden": True,
        }
        headers["original_pedigree_sex"] = {
            "title": "Sex",
            "description": "Sample's sex from pedigree info",
            "scale": False,
        }
        headers["paternal_id"] = {
            "title": "Father ID",
            "description": "ID of sample's father ",
            "scale": False,
            "hidden": True,
        }
        headers["maternal_id"] = {
            "title": "Mother ID",
            "description": "ID of sample's mother",
            "scale": False,
            "hidden": True,
        }
        headers["family_id"] = {
            "title": "Family ID",
            "description": "ID of sample's family",
            "scale": False,
            "hidden": True,
        }
        headers["sex"] = {
            "title": "Inferred sex",
            "description": "Sample's inferred sex",
            "scale": False,
            "hidden": True,
        }
        headers["ancestry"] = {
            "title": "Ancestry",
            "description": "Most probable ancestry background",
            "scale": False
        }
        headers["p_ancestry"] = {
            "title": "P(Ancestry)",
            "description": "Ancestry probablitty",
            "max": 1,
            "min": 0,
            "scale": "RdYlGn",
            "format": "{:,.2f}",
        }
        headers["n_het"] = {
            "title": "HetVar",
            "description": "Heterozygous variants",
            "shared_key": "variant_count",
            "format": "{:,.0f}",
        }
        headers["n_hom_ref"] = {
            "title": "HomRefVar",
            "description": "Homozygous reference variants",
            "shared_key": "variant_count",
            "format": "{:,.0f}",
            "hidden": True,
        }
        headers["n_hom_alt"] = {
            "title": "HomAltVar",
            "description": "Homozygous alternate variants",
            "shared_key": "variant_count",
            "format": "{:,.0f}",
            "hidden": True,
        }
        headers["n_unknown"] = {
            "title": "NA sites",
            "description": "Unknown sites",
            "format": "{:,.0f}"
        }
        headers["depth_mean"] = {
            "title": "Mean depth",
            "description": "Mean depth of all sites",
            "scale": "RdYlGn",
            "suffix": " X",
            "hidden": True,
        }
        headers["depth_sd"] = {
            "title": "Depth std",
            "description": "Depth's standard deviation of all sites",
            "scale": "RdYlGn",
            "hidden": True,
        }
        headers["gt_depth_mean"] = {
            "title": "Sites depth",
            "description": "Mean depth of genotyped sites",
            "scale": "RdYlGn",
            "suffix": " X",
        }
        headers["gt_depth_sd"] = {
            "title": "Genot depth std",
            "description": "Depth's standard deviation of genotype sites",
            "scale": "RdYlGn",
            "suffix": " X",
            "hidden": True,
        }
        headers["ab_mean"] = {
            "title": "Allele balance",
            "description": "Mean allele balance",
            "scale": "RdYlGn",
        }
        headers["ab_std"] = {
            "title": "Allele balance std",
            "description": "Standard deviation of allele balance",
            "scale": "RdYlGn",
            "hidden": True,
        }
        headers["p_middling_ab"] = {
            "title": "Allele balance < 0.2, > 0.8",
            "description":
            "Proportion of sites with allele balance < 0.2 or > 0.8",
            "max": 1,
            "min": 0,
            "scale": "RdYlGn",
            "format": "{:,.2f}",
        }
        headers["X_het"] = {
            "title": "HetVar X",
            "description": "Heterozygous variants on X chromosome",
            "shared_key": "variant_count_xy",
            "format": "{:,.0f}",
        }
        headers["X_hom_ref"] = {
            "title": "HomRefVar X",
            "description": "Homozygous reference variants on X chromosome",
            "shared_key": "variant_count_xy",
            "format": "{:,.0f}",
            "hidden": True,
        }
        headers["X_hom_alt"] = {
            "title": "HomAltVar X",
            "description": "Homozygous alternate variants on X chromosome",
            "shared_key": "variant_count_xy",
            "format": "{:,.0f}",
            "hidden": True,
        }
        headers["X_n"] = {
            "title": "Sites X",
            "description": "Total sites on X chromosome",
            "shared_key": "variant_count_xy",
            "format": "{:,.0f}",
            "hidden": True,
        }
        headers["X_depth_mean"] = {
            "title": "Mean depth X",
            "description": "Mean depth of sites on X chromosome",
            "scale": "RdYlGn",
            "suffix": " X",
        }
        headers["Y_n"] = {
            "title": "Sites Y",
            "description": "Total sites on Y chromosome",
            "shared_key": "variant_count_xy",
            "format": "{:,.0f}",
            "hidden": True,
        }
        headers["Y_depth_mean"] = {
            "title": "Mean depth Y",
            "description": "Mean depth of sites on Y chromosome",
            "scale": "RdYlGn",
            "suffix": " X",
        }

        t_config = {
            "id": "somalier_stats",
            "namespace": "Somalier",
            "title": "Somalier: Statistics",
            "no_beeswarm": True,
            "raw_data_fn": "multiqc_somalier_stats",
        }

        self.add_section(
            name="Statistics",
            anchor="somalier-stats",
            description="Various statistics from the somalier report.",
            plot=table.plot(self.somalier_data, headers, t_config),
        )
Example #38
    def run_metrics_details_table(self,data):
        headers = OrderedDict()
        headers['Surface'] = {
            'title': 'Surface',
            'description': ''
        }
        headers['Tiles'] = {
            'title': 'Tiles',
            'description': 'The number of tiles per lane.',
            'hidden': True
        }
        headers['Density'] = {
            'title': 'Density',
            'description': 'The density of clusters (in thousands per mm2) detected by image analysis, +/- 1 standard deviation.',
            'hidden': True
        }
        headers['Cluster PF'] = {
            'title': 'Cluster PF (%)',
            'description': 'The percentage of clusters passing filtering, +/- 1 standard deviation.',
            'suffix': '%',
        }
        headers['Phased'] = {
            'title': 'Phased (%)',
            'description': 'The value used by RTA for the percentage of molecules in a cluster for which sequencing falls behind (phasing) or jumps ahead of (prephasing) the current cycle within a read.',
            'min': 0,
            'max': 100,
            'suffix': '%',
            'scale': 'OrRd'
        }
        headers['Prephased'] = {
            'title': 'Prephased (%)',
            'description': 'The value used by RTA for the percentage of molecules in a cluster for which sequencing falls behind (phasing) or jumps ahead of (prephasing) the current cycle within a read.',
            'format': '{:,.2f}',
            'min': 0,
            'max': 100,
            'suffix': '%',
            'scale': 'OrRd'
        }
        headers['Reads'] = {
            'title': '{} Reads'.format(config.read_count_prefix),
            'description': 'The number of clusters ({})'.format(config.read_count_desc),
            'shared_key': 'read_count',
        }
        headers['Reads PF'] = {
            'title': '{} PF Reads'.format(config.read_count_prefix),
            'description': 'The number of passing filter clusters ({})'.format(config.read_count_desc),
            'shared_key': 'read_count',
        }
        headers['Cycles Error'] = {
            'title': 'Cycles Error',
            'description': 'The number of cycles that have been error-rated using PhiX, starting at cycle 1.',
            'format': '{:,.0f}',
        }
        headers['Yield'] = {
            'title': '{} Bp Yield'.format(config.base_count_prefix),
            'description': 'The number of bases sequenced which passed filter ({})'.format(config.base_count_desc),
            'scale': 'PuOr',
            'shared_key': 'base_count'
        }
        headers['Aligned'] = {
            'title': 'Aligned (%)',
            'description': 'The percentage that aligned to the PhiX genome.',
            'min': 0,
            'max': 100,
            'suffix': '%',
            'scale': 'PiYG'
        }
        headers['Error'] = {
            'title': 'Error Rate (%)',
            'description': 'The calculated error rate, as determined by the PhiX alignment.',
            'min': 0,
            'max': 100,
            'suffix': '%',
            'scale': 'OrRd'
        }
        headers['Error (35)'] = {
            'title': 'Error Rate 35 Cycles (%)',
            'description': 'The calculated error rate for cycles 1-35.',
            'min': 0,
            'max': 100,
            'suffix': '%',
            'scale': 'OrRd',
            'hidden': True
        }
        headers['Error (75)'] = {
            'title': 'Error Rate 75 Cycles (%)',
            'description': 'The calculated error rate for cycles 1-75.',
            'min': 0,
            'max': 100,
            'suffix': '%',
            'scale': 'OrRd',
            'hidden': True
        }
        headers['Error (100)'] = {
            'title': 'Error Rate 100 Cycles (%)',
            'description': 'The calculated error rate for cycles 1-100.',
            'min': 0,
            'max': 100,
            'suffix': '%',
            'scale': 'OrRd',
            'hidden': True
        }
        headers['Intensity C1'] = {
            'title': 'Intensity Cycle 1',
            'description': 'The intensity statistic at cycle 1.',
        }
        headers['%>=Q30'] = {
            'title': '%>=Q30',
            'description': 'The percentage of bases with a quality score of 30 or higher.',
            'min': 0,
            'max': 100,
            'suffix': '%',
            'scale': 'RdYlGn'
        }
        table_config = {
            'namespace': 'interop',
            'id': 'interop-runmetrics-detail-table',
            'table_title': 'Sequencing Lane Statistics',
            'col1_header': 'Run - Lane - Read',
        }

        tdata = {}
        for s_name in data:
            for key in data[s_name]['details']:
                tdata["{} - {}".format(s_name,key)]=data[s_name]['details'][key]

        return table.plot(tdata, headers, table_config)
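A minimal illustration of the flattening loop above; the run and lane names are made up, but the key construction matches the 'Run - Lane - Read' column header configured in table_config:

data = {'run1': {'details': {'L001 - R1': {'Yield': 1.2}}}}
tdata = {}
for s_name in data:
    for key in data[s_name]['details']:
        tdata['{} - {}'.format(s_name, key)] = data[s_name]['details'][key]
assert list(tdata) == ['run1 - L001 - R1']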
Example #39
    def __init__(self):
        super(MultiqcModule, self).__init__(
            name='Stacks',
            anchor='stacks',
            href="http://catchenlab.life.illinois.edu/stacks/",
            info="Software for analyzing restriction enzyme-based data (e.g. RAD-seq)."
        )

        self.gsheaders = OrderedDict()
        self.gsheaders['n_loci'] = {
            'title': '# loci',
            'description': 'Number of loci built',
            'format': '{:,.0f}',
            'scale': 'RdYlGn'
        }
        self.gsheaders['n_used_fw_reads'] = {
            'title': 'K reads used',
            'modify': lambda x: float(x) / 1000.0,
            'description': 'Number of thousand reads used',
            'scale': 'BuGn'
        }
        self.gsheaders['mean_cov'] = {
            'title': 'cov',
            'suffix': 'X',
            'description': 'Mean sequence coverage at locus',
            'scale': 'BuPu',
        }
        self.gsheaders['mean_cov_ns'] = {
            'title': 'weighted cov',
            'suffix': 'X',
            'description':
            'The coverage at each locus is weighted by the number of samples present at that locus (i.e. coverage at shared loci counts more)',
            'scale': 'YlGn',
        }

        self.sheaders = OrderedDict()
        self.sheaders['# Pop ID'] = {
            'title': 'PopID',
            'description':
            'Population ID as defined in the Population Map file.',
            'scale': False
        }
        self.sheaders['Private'] = {
            'title': 'Private',
            'description': 'Number of private alleles in this population.',
            'scale': 'PuBu',
            'hidden': True
        }
        self.sheaders['Num_Indv'] = {
            'title': '# Indv',
            'description':
            'Mean number of individuals per locus in this population.',
            'scale': 'YlGn'
        }
        self.sheaders['P'] = {
            'title': 'P',
            'description':
            'Mean frequency of the most frequent allele at each locus in this population.',
            'scale': 'PuBu',
            'min': 0,
            'max': 1
        }
        self.sheaders['Obs_Het'] = {
            'title': 'Obs Het',
            'description': 'Mean observed heterozygosity in this population.',
            'scale': 'YlGn',
            'min': 0,
            'max': 1,
        }
        self.sheaders['Obs_Hom'] = {
            'title': 'Obs Hom',
            'description': 'Mean observed homozygosity in this population.',
            'scale': 'PuBu',
            'min': 0,
            'max': 1,
            'hidden': True
        }
        self.sheaders['Exp_Hom'] = {
            'title': 'Exp Hom',
            'description': 'Mean expected homozygosity in this population.',
            'scale': 'YlGn',
            'min': 0,
            'max': 1,
            'hidden': True
        }
        self.sheaders['Exp_Het'] = {
            'title': 'Exp Het',
            'description': 'Mean expected heterozygosity in this population.',
            'scale': 'PuBu',
            'min': 0,
            'max': 1
        }
        self.sheaders['Pi'] = {
            'title': 'Pi',
            'description': 'Mean value of &#960; in this population.',
            'scale': 'YlGn',
            'min': 0,
            'max': 1
        }
        self.sheaders['Fis'] = {
            'title': 'Fis',
            'description': 'Mean measure of Fis in this population.',
            'scale': 'PuOr',
            'min': -1,
            'max': 1
        }

        num_files = 0
        # Parse gstacks data
        self.cov_data = OrderedDict()
        for f in self.find_log_files('stacks/gstacks'):
            run_name = os.path.dirname(f['root'])
            s_name = self.clean_s_name(os.path.basename(f['root']), run_name)
            try:
                self.cov_data.update(self.parse_gstacks(f['f'], s_name))
                num_files += 1
            except Exception:
                log.error('Could not parse gstacks.log.distribs file in {}'.format(
                    f['s_name']))

        # Parse populations data
        self.distribs_loci = OrderedDict()
        self.distribs_snps = OrderedDict()
        for f in self.find_log_files('stacks/populations'):
            run_name = os.path.dirname(f['root'])
            s_name = self.clean_s_name(os.path.basename(f['root']), run_name)
            try:
                i, j = self.parse_populations(f['f'], s_name)
                self.distribs_loci.update(i)
                self.distribs_snps.update(j)
                num_files += 1
            except Exception:
                log.error('Could not parse populations.log.distribs file in {}'.
                          format(f['s_name']))

        # Parse sumstats file
        self.sumstats_data = OrderedDict()
        for f in self.find_log_files('stacks/sumstats'):
            run_name = os.path.dirname(f['root'])
            s_name = self.clean_s_name(os.path.basename(f['root']), run_name)
            try:
                self.sumstats_data.update(self.parse_sumstats(f['f'], s_name))
                num_files += 1
            except Exception:
                log.error(
                    'Could not parse populations.sumstats_summary file in {}'.
                    format(f['s_name']))

        # Ignore samples
        self.cov_data = self.ignore_samples(self.cov_data)
        self.distribs_loci = self.ignore_samples(self.distribs_loci)
        self.distribs_snps = self.ignore_samples(self.distribs_snps)
        self.sumstats_data = self.ignore_samples(self.sumstats_data)

        if len(self.cov_data) == 0 and len(self.sumstats_data) == 0 and len(
                self.distribs_loci) == 0:
            raise UserWarning
        log.info("Found {} reports".format(num_files))

        # Write parsed report data to a file
        self.write_data_file(self.cov_data, 'multiqc_stacks_cov')
        self.write_data_file(self.sumstats_data, 'multiqc_stacks_sumstats')

        ### Write the sample table
        config_table = {'id': 'gstacks_table', 'namespace': 'stacks'}
        self.add_section(
            name='Sample statistics',
            anchor='stacks-gstacks',
            description='The sample-specific statistics for Stacks',
            helptext=
            '''**Note!** The sample names have the following scheme `<run folder name> | <input fastq file prefix>`.
                        This data is obtained from the gstacks program, which is run after building sample and
                        catalog loci to merge paired-end reads and call variants.
                        These numbers are obtained from the `gstacks.log.distribs` file.''',
            plot=table.plot(self.cov_data, self.gsheaders, config_table))
        # Write population sumstats table
        config_table = {'id': 'sumstats_table', 'namespace': 'stacks'}
        self.add_section(
            name='Population summary statistics',
            anchor='stacks-sumstats',
            description=
            'Population statistics as calculated from variant sites found in this run',
            helptext=
            '''**Note!** The sample names have the following scheme `<run folder name> | <population ID>`,
                        where the population ID is defined in the input population map file.
                        This information is obtained from the Stacks program `populations` and the file
                        `populations.sumstats_summary.tsv`.
                        ''',
            plot=table.plot(self.sumstats_data, self.sheaders, config_table))
        config_distribs = {
            'id': 'distribs_plot',
            'title': 'Stacks: Population plots',
            'namespace': 'stacks',
            'tt_label': '{point.y} loci, {point.x} samples/SNPs',
            'ylab': '# loci',
            'data_labels': [{
                'name': 'Samples per locus',
                'ylab': '# loci',
                'xlab': '# samples'
            }, {
                'name': 'SNPs per locus',
                'ylab': '# loci',
                'xlab': '# SNPs'
            }]
        }
        self.add_section(
            name='Population plots',
            anchor='stacks-distribs',
            description=
            'Plots showing (1) the number of loci shared by a given number of samples and (2) the distribution of SNPs per locus',
            helptext=
            '''The distributions are obtained from the Stacks program `populations` and its output file `populations.log.distribs`.
            These numbers reflect loci after Stacks' filtering.''',
            plot=linegraph.plot([self.distribs_loci, self.distribs_snps],
                                config_distribs))
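The parse_gstacks, parse_populations and parse_sumstats helpers are referenced above but not shown. A hedged sketch of a gstacks-style parser, assuming the 'effective_coverages_per_sample' section of gstacks.log.distribs is a tab-separated block with commented header lines; the section name and the column order are assumptions about the log layout:

def parse_gstacks_sketch(contents, run_name):
    data = {}
    in_section = False
    for line in contents.splitlines():
        if line.startswith('BEGIN effective_coverages_per_sample'):
            in_section = True
        elif line.startswith('END'):
            in_section = False
        elif in_section and line and not line.startswith('#'):
            # Assumed columns: sample, n_loci, n_used_fw_reads, mean_cov, mean_cov_ns
            cols = line.split('\t')
            data['{} | {}'.format(run_name, cols[0])] = {
                'n_loci': int(cols[1]),
                'n_used_fw_reads': int(cols[2]),
                'mean_cov': float(cols[3]),
                'mean_cov_ns': float(cols[4]),
            }
    return data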
Example #40
    def __init__(self):
        # Initialise the parent object
        super(MultiqcModule, self).__init__(
            name="K-mer Analysis Toolkit",
            anchor="kat",
            href="https://github.com/TGAC/KAT",
            info="is an toolkit for analysing sequencing data via its k-mer spectra.",
        )

        # Find and load any KAT dist analysis reports
        self.kat_data = dict()
        for f in self.find_log_files("kat"):
            s_name = self.clean_s_name(f["s_name"].replace(".dist_analysis", ""), f)
            content = json.loads(f["f"])
            self.kat_data[s_name] = self.parse_kat_report(content)

        # Filter to strip out ignored sample names
        self.kat_data = self.ignore_samples(self.kat_data)

        if len(self.kat_data) == 0:
            raise UserWarning

        log.info("Found {} reports".format(len(self.kat_data)))

        # Write parsed report data to a file
        self.write_data_file(self.kat_data, "multiqc_kat")

        headers = OrderedDict()
        headers["kmer_peaks"] = {
            "title": "# of Kmer Peaks",
            "description": "Number of peaks identified in the K-mer spectra",
            "scale": False,
            "format": "{:,.0f}",
        }
        headers["gc_peaks"] = {
            "title": "# of GC Peaks",
            "description": "Number of peaks identified in the GC distribution",
            "scale": False,
            "format": "{:,.0f}",
        }
        headers["est_genome_size"] = {
            "title": "Est. genome Size",
            "description": "Estimated Genome Size based on K-mer spectra",
            "scale": "BuPu",
            "format": "{:,.0f}",
        }
        headers["mean_kmer_freq"] = {
            "title": "Mean K-mer Freq.",
            "description": "Mean K-mer Frequency, provides an estimate of sequencing coverage",
            "scale": "Greens",
            "format": "{:,.0f}",
            "suffix": "x",
        }

        kat_config = {
            "namespace": "KAT",
        }

        # Basic Stats Table
        self.add_section(
            name="KAT Distribution Analysis",
            anchor="kat-first",
            description="Table showing k-mer coverage distributions and if available GC distributions",
            helptext="This table can give a quick idea of potential contaminants that can be identified via unexpected numbers of k-mer or gc peaks in the data",
            plot=table.plot(self.kat_data, headers, kat_config),
        )
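parse_kat_report is not shown above; the following sketch only illustrates the kind of mapping it performs from the dist_analysis JSON to the four table columns. The JSON paths used here are assumptions, not a documented KAT schema:

def parse_kat_report_sketch(content):
    data = {}
    cov = content.get('coverage', {})   # assumed location of k-mer spectra stats
    gc = content.get('gc', {})          # assumed location of GC stats
    if 'nb_peaks' in cov:
        data['kmer_peaks'] = cov['nb_peaks']
    if 'est_genome_size' in cov:
        data['est_genome_size'] = cov['est_genome_size']
    if 'mean_freq' in cov:
        data['mean_kmer_freq'] = cov['mean_freq']
    if 'nb_peaks' in gc:
        data['gc_peaks'] = gc['nb_peaks']
    return data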
Example #41
    def parse_bamPEFragmentSize(self):
        """Find bamPEFragmentSize output. Supports the --table option"""
        self.deeptools_bamPEFragmentSize = dict()
        for f in self.find_log_files("deeptools/bamPEFragmentSizeTable"):
            parsed_data = self.parseBamPEFile(f)
            for k, v in parsed_data.items():
                if k in self.deeptools_bamPEFragmentSize:
                    log.warning("Replacing duplicate sample {}.".format(k))
                self.deeptools_bamPEFragmentSize[k] = v

            if len(parsed_data) > 0:
                self.add_data_source(f, section="bamPEFragmentSize")

        self.deeptools_bamPEFragmentSize = self.ignore_samples(self.deeptools_bamPEFragmentSize)

        if len(self.deeptools_bamPEFragmentSize) > 0:
            headersSE = OrderedDict()
            headersSE["Reads Sampled"] = {
                "title": "# Sampled",
                "description": "Number of reads sampled",
                "format": "{:,.0f}",
            }
            headersSE["Read Len. Min."] = {
                "title": "Min",
                "description": "Minimum read length",
                "format": "{:,.0f}",
                "shared_key": "read_length",
            }
            headersSE["Read Len. 1st. Qu."] = {
                "title": "1st Quartile",
                "description": "1st quartile read length",
                "format": "{:,.0f}",
                "shared_key": "read_length",
            }
            headersSE["Read Len. Mean"] = {
                "title": "Mean",
                "description": "Mean read length",
                "shared_key": "read_length",
            }
            headersSE["Read Len. Median"] = {
                "title": "Median",
                "description": "Median read length",
                "format": "{:,.0f}",
                "shared_key": "read_length",
            }
            headersSE["Read Len. 3rd Qu."] = {
                "title": "3rd Quartile",
                "description": "3rd quartile read length",
                "format": "{:,.0f}",
                "shared_key": "read_length",
            }
            headersSE["Read Len. Max"] = {
                "title": "Max",
                "description": "Maximum read length",
                "format": "{:,.0f}",
                "shared_key": "read_length",
            }
            headersSE["Read Len. Std."] = {
                "title": "Std. Dev.",
                "description": "read length standard deviation",
                "shared_key": "read_length",
            }
            headersSE["Read Med. Abs. Dev."] = {
                "title": "MAD",
                "description": "read length median absolute deviation",
                "shared_key": "read_length",
            }
            config = {"namespace": "deepTools bamPEFragmentSize"}
            self.add_section(
                name="Read length metrics",
                anchor="deeptools_readlengths",
                plot=table.plot(self.deeptools_bamPEFragmentSize, headersSE, config),
            )

            headersPE = OrderedDict()
            headersPE["Frag. Sampled"] = {
                "title": "# Sampled",
                "description": "Number of fragments sampled",
                "format": "{:,.0f}",
            }
            headersPE["Frag. Len. Min."] = {
                "title": "Min",
                "description": "Minimum fragment length",
                "format": "{:,.0f}",
                "shared_key": "frag_length",
            }
            headersPE["Frag. Len. 1st. Qu."] = {
                "title": "1st Quartile",
                "description": "1st quartile fragment length",
                "format": "{:,.0f}",
                "shared_key": "frag_length",
            }
            headersPE["Frag. Len. Mean"] = {
                "title": "Mean",
                "description": "Mean fragment length",
                "format": "{:,.0f}",
                "shared_key": "frag_length",
            }
            headersPE["Frag. Len. Median"] = {
                "title": "Median",
                "description": "Median fragment length",
                "format": "{:,.0f}",
                "shared_key": "frag_length",
            }
            headersPE["Frag. Len. 3rd Qu."] = {
                "title": "3rd Quartile",
                "description": "3rd quartile fragment length",
                "format": "{:,.0f}",
                "shared_key": "frag_length",
            }
            headersPE["Frag. Len. Max"] = {
                "title": "Max",
                "description": "Maximum fragment length",
                "format": "{:,.0f}",
                "shared_key": "frag_length",
            }
            headersPE["Frag. Len. Std."] = {
                "title": "Std. Dev.",
                "description": "Fragment length standard deviation",
                "shared_key": "frag_length",
            }
            headersPE["Frag. Med. Abs. Dev."] = {
                "title": "MAD",
                "description": "Fragment length median absolute deviation",
                "shared_key": "frag_length",
            }

            # Are there any PE datasets?
            PE = False
            for k, v in self.deeptools_bamPEFragmentSize.items():
                if "Frag. Len. Min." in v:
                    PE = True
                    break
            if PE:
                self.add_section(
                    name="Fragment length metrics",
                    anchor="deeptools_fragmentlengths",
                    plot=table.plot(self.deeptools_bamPEFragmentSize, headersPE, config),
                )

            # Read length plot
            config = {
                "data_labels": [
                    {
                        "name": "Read length distribution",
                        "title": "Read length distribution",
                        "ylab": "Read length (bases)",
                    },
                    {
                        "name": "Fragment length distribution",
                        "title": "Fragment length distribution",
                        "ylab": "Fragment length (bases)",
                    },
                ],
                "id": "deeptools_readlengthsPlot",
                "title": "deepTools: Read/Fragment length distribution",
                "namespace": "deepTools bamPEFragmentSize",
                "ylab": "Read length (bases)",
                "xlab": "Percentile",
            }
            SE = dict()
            PE = dict()
            for k, v in self.deeptools_bamPEFragmentSize.items():
                SE[k] = {
                    0: v["Read Len. Min."],
                    10: v["Read Len. 10%"],
                    20: v["Read Len. 20%"],
                    25: v["Read Len. 1st. Qu."],
                    30: v["Read Len. 30%"],
                    40: v["Read Len. 40%"],
                    50: v["Read Len. Median"],
                    60: v["Read Len. 60%"],
                    70: v["Read Len. 70%"],
                    75: v["Read Len. 3rd Qu."],
                    80: v["Read Len. 80%"],
                    90: v["Read Len. 90%"],
                    99: v["Read Len. 99%"],
                    100: v["Read Len. Max"],
                }
                if "Frag. Len. Min." not in v:
                    continue
                PE[k] = {
                    0: v["Frag. Len. Min."],
                    10: v["Frag. Len. 10%"],
                    20: v["Frag. Len. 20%"],
                    25: v["Frag. Len. 1st. Qu."],
                    30: v["Frag. Len. 30%"],
                    40: v["Frag. Len. 40%"],
                    50: v["Frag. Len. Median"],
                    60: v["Frag. Len. 60%"],
                    70: v["Frag. Len. 70%"],
                    75: v["Frag. Len. 3rd Qu."],
                    80: v["Frag. Len. 80%"],
                    90: v["Frag. Len. 90%"],
                    99: v["Frag. Len. 99%"],
                    100: v["Frag. Len. Max"],
                }
            self.add_section(
                name="Read/fragment length distribution",
                anchor="deeptools_fragmentlengths_dist",
                plot=linegraph.plot([SE, PE], config),
            )

        return len(self.deeptools_bamPEFragmentSize)
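The parseBamPEFile helper is referenced above but not shown. A hedged sketch, assuming bamPEFragmentSize --table writes a TSV whose header row carries the metric names used as table keys above and whose first column is the sample name:

def parse_bampe_table_sketch(fh):
    data = {}
    header = fh.readline().rstrip('\n').split('\t')[1:]
    for line in fh:
        cols = line.rstrip('\n').split('\t')
        # Keep values numeric so the table formatting and percentile plot above work
        data[cols[0]] = {k: float(v) for k, v in zip(header, cols[1:])}
    return data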
Example #42
    def __init__(self):

        # Initialise the parent object
        super(MultiqcModule, self).__init__(
            name='Long Ranger',
            anchor='longranger',
            href="https://www.10xgenomics.com/",
            info=
            "A set of analysis pipelines that perform sample demultiplexing, "
            "barcode processing, alignment, quality control, variant calling, phasing, "
            "and structural variant calling.")

        def try_float_lambda(x, func, base):
            # Cast x to float and scale by base ('*' or '/'); return x unchanged on failure.
            try:
                if func == '*':
                    return float(x) * base
                elif func == '/':
                    return float(x) / base
                else:
                    return x
            except (TypeError, ValueError):
                return x

        self.headers = OrderedDict()
        self.headers['large_sv_calls'] = {
            'title': 'Large SVs',
            'description':
            'Large structural variants called by Longranger. Not including blacklisted regions.',
            'format': '{:,.0f}',
            'scale': 'PuRd'
        }
        self.headers['short_deletion_calls'] = {
            'title': 'Short dels',
            'description': 'Short deletions called by Longranger.',
            'format': '{:,.0f}',
            'scale': 'PuRd',
            'hidden': True
        }
        self.headers['genes_phased_lt_100kb'] = {
            'title': 'Genes phased < 100kb',
            'description':
            'Percentage of genes shorter than 100kb with >1 heterozygous SNP that are phased into a single phase block.',
            'modify': lambda x: try_float_lambda(x, '*', 100.0),
            'suffix': '%',
            'scale': 'YlOrRd',
            'hidden': True
        }
        self.headers['longest_phase_block'] = {
            'title': 'Longest phased',
            'description': 'Size of the longest phase block, in base pairs',
            'scale': 'YlOrRd',
            'modify': lambda x: try_float_lambda(x, '/', 1000000.0),
            'suffix': 'Mbp',
            'hidden': True
        }
        self.headers['n50_phase_block'] = {
            'title': 'N50 phased',
            'description':
            'N50 length of the called phase blocks, in base pairs.',
            'modify': lambda x: try_float_lambda(x, '/', 1000000.0),
            'suffix': 'Mbp',
            'scale': 'YlOrRd',
            'hidden': True
        }
        self.headers['snps_phased'] = {
            'title': 'SNPs phased',
            'description': 'Percentage of called SNPs that were phased.',
            'modify': lambda x: try_float_lambda(x, '*', 100.0),
            'suffix': '%',
            'scale': 'PuRd',
            'hidden': True
        }
        self.headers['median_insert_size'] = {
            'title': 'Insert size',
            'description': 'Median insert size of aligned read pairs.',
            'format': '{:,.0f}',
            'suffix': 'bp',
            'scale': 'PuBu',
            'hidden': True
        }
        self.headers['on_target_bases'] = {
            'title': 'On target',
            'description':
            'Percentage of aligned bases mapped within the target regions in targeted mode. Only bases inside the intervals of the target BED file are counted.',
            'suffix': '%',
            'modify': lambda x: try_float_lambda(x, '*', 100.0),
            'scale': 'Greens'
        }
        self.headers['zero_coverage'] = {
            'title': 'Zero cov',
            'description':
            'Percentage of non-N bases in the genome with zero coverage.',
            'modify': lambda x: try_float_lambda(x, '*', 100.0),
            'suffix': '%',
            'max': 100.0,
            'min': 0.0,
            'scale': 'RdGy-rev'
        }
        self.headers['mean_depth'] = {
            'title': 'Depth',
            'description':
            'Mean read depth, including PCR duplicate reads. In WGS mode, this is measured across the genome; in targeted mode, this is the measure inside targeted regions.',
            'suffix': 'X',
            'scale': 'PuBu'
        }
        self.headers['pcr_duplication'] = {
            'title': 'PCR Dup',
            'description':
            'Percentage of reads marked as PCR duplicates. To be marked as PCR duplicates, reads must have the same mapping extents on the genome and the same 10x barcode.',
            'suffix': '%',
            'min': 15.0,
            'modify': lambda x: try_float_lambda(x, '*', 100.0),
            'scale': 'RdGy-rev',
            'hidden': True
        }
        self.headers['mapped_reads'] = {
            'title': 'Mapped',
            'modify': lambda x: try_float_lambda(x, '*', 100.0),
            'suffix': '%',
            'description':
            'Percentage of input reads that were mapped to the reference genome.',
            'scale': 'PuBu',
            'hidden': True
        }
        self.headers['number_reads'] = {
            'title': 'M Reads',
            'modify': lambda x: try_float_lambda(x, '/', 1000000.0),
            'description':
            'Total number of reads supplied to Long Ranger. (millions)',
            'scale': 'PuBu',
            'hidden': True
        }
        self.headers['molecule_length_mean'] = {
            'title': 'Mol size',
            'description':
            'The length-weighted mean input DNA length in base pairs.',
            'modify': lambda x: try_float_lambda(x, '/', 1000.0),
            'suffix': 'Kbp',
            'scale': 'YlGn'
        }
        self.headers['molecule_length_stddev'] = {
            'title': 'Mol stddev',
            'description':
            'The length-weighted standard deviation of the input DNA length distribution in base pairs.',
            'modify': lambda x: try_float_lambda(x, '/', 1000.0),
            'suffix': 'Kbp',
            'scale': 'YlGn',
            'hidden': True
        }
        self.headers['n50_linked_reads_per_molecule'] = {
            'title': 'N50 read per mol.',
            'description':
            'The N50 number of read-pairs per input DNA molecule. Half of read-pairs came from molecules with this many or greater read-pairs.',
            'scale': 'BuGn',
            'hidden': True
        }
        self.headers['r1_q30_bases_fract'] = {
            'title': '% R1 >= Q30',
            'description':
            'Percentage of bases in R1 with base quality >= 30.',
            'hidden': True,
            'suffix': '%',
            'modify': lambda x: try_float_lambda(x, '*', 100.0),
            'scale': 'Purples'
        }
        self.headers['r2_q30_bases_fract'] = {
            'title': '% R2 >= Q30',
            'description':
            'Percentage of bases in R2 with base quality >= 30.',
            'suffix': '%',
            'modify': lambda x: try_float_lambda(x, '*', 100.0),
            'scale': 'Purples',
            'hidden': True
        }
        self.headers['bc_on_whitelist'] = {
            'title': 'Valid BCs',
            'description':
            'The percentage of reads that carried a valid 10x barcode sequence.',
            'modify': lambda x: try_float_lambda(x, '*', 100.0),
            'suffix': '%',
            'scale': 'BuPu',
            'hidden': True,
        }
        self.headers['bc_q30_bases_fract'] = {
            'title': 'BC Q30',
            'description':
            'Percentage of bases in the barcode with base quality >= 30.',
            'suffix': '%',
            'modify': lambda x: try_float_lambda(x, '*', 100.0),
            'scale': 'Purples',
            'hidden': True
        }
        self.headers['bc_mean_qscore'] = {
            'title': 'BC Qscore',
            'description': 'The mean base quality value on the barcode bases.',
            'scale': 'BuPu',
            'hidden': True
        }
        self.headers['mean_dna_per_gem'] = {
            'title': 'DNA per gem',
            'description':
            'The average number of base pairs of genomic DNA loaded into each GEM. This metric is based on the observed extents of read-pairs on each molecule.',
            'modify': lambda x: try_float_lambda(x, '/', 1000000.0),
            'suffix': 'Mbp',
            'scale': 'OrRd',
            'hidden': True
        }
        self.headers['gems_detected'] = {
            'title': 'M Gems',
            'description':
            'The number of Chromium GEMs that were collected and which generated a non-trivial number of read-pairs. (millions)',
            'modify': lambda x: try_float_lambda(x, '/', 1000000.0),
            'scale': 'OrRd',
        }
        self.headers['corrected_loaded_mass_ng'] = {
            'title': 'Loaded (corrected)',
            'description':
            'The estimated number of nanograms of DNA loaded into the input well of the Chromium chip. This metric is calculated by measuring the mean amount of DNA covered by input molecules in each GEM, then multiplying by the ratio of the chip input to the sample volume in each GEM.',
            'suffix': 'ng',
            'scale': 'RdYlGn'
        }
        self.headers['loaded_mass_ng'] = {
            'title': 'Loaded',
            'description':
            'This metric was found to overestimate the true loading by a factor of 1.6, due primarily to denaturation of the input DNA.',
            'suffix': 'ng',
            'scale': 'RdYlGn'
        }
        self.headers['instrument_ids'] = {
            'title': 'Instrument ID',
            'description':
            'The list of instrument IDs used to generate the input reads.',
            'scale': False,
            'hidden': True
        }
        self.headers['longranger_version'] = {
            'title': 'Long Ranger Version',
            'description':
            'The version of the Longranger software used to generate the results.',
            'scale': False
        }

        ### Parse the data
        self.longranger_data = dict()
        self.paths_dict = dict()
        for f in self.find_log_files('longranger/invocation'):
            sid = self.parse_invocation(f['f'])
            self.paths_dict[os.path.basename(f['root'])] = sid

        running_name = 1
        for f in self.find_log_files('longranger/summary'):
            data = self.parse_summary(f['f'])
            updir, _ = os.path.split(f['root'])
            base_updir = os.path.basename(updir)
            sid = 'longranger#{}'.format(running_name)
            if base_updir in self.paths_dict.keys():
                sid = self.paths_dict[base_updir]
            else:
                log.debug('Did not find _invocation file: {}'.format(f['fn']))
                running_name += 1

            self.longranger_data[sid] = data

        # Filter to strip out ignored sample names
        self.longranger_data = self.ignore_samples(self.longranger_data)

        if len(self.longranger_data) == 0:
            raise UserWarning
        log.info("Found {} reports".format(len(self.longranger_data.keys())))

        # Write parsed report data to a file
        self.write_data_file(self.longranger_data, 'multiqc_longranger')

        # Add a longranger versions column if not all the same
        longranger_versions = set(
            [d['longranger_version'] for d in self.longranger_data.values()])
        version_str = ''
        if len(longranger_versions) == 1:
            version_str = " All samples were processed using Longranger version {}".format(
                list(longranger_versions)[0])
            del self.headers['longranger_version']

        ### Write the table
        config_table = {'id': 'longranger_table', 'namespace': 'longranger'}
        self.add_section (
            name = 'Run stats',
            anchor = 'longranger-run-stats',
            description = 'Statistics gathered from Longranger reports. ' \
                    'There are more columns available but they are hidden by default.' + version_str,
            helptext = '''Parses the files `summary.csv` and `_invocation` found in the
                    output directory of Longranger. If `_invocation` is not found
                    the sample IDs will be missing and they will be given a running
                    number. E.g., `longranger#1` and `longranger#2`.''',
            plot = table.plot(self.longranger_data, self.headers, config_table)
        )

        ### Bar plot of phasing stats
        phase_pdata = {}
        snps_phased_pct = {}
        genes_phased_pct = {}
        for s_name in self.longranger_data:
            try:
                phase_pdata[s_name] = {
                    'longest_phase_block':
                    float(self.longranger_data[s_name]['longest_phase_block']),
                    'n50_phase_block':
                    float(self.longranger_data[s_name]['n50_phase_block'])
                }
            except (KeyError, ValueError):
                pass
            try:
                snps_phased_pct[s_name] = {
                    'snps_phased_pct':
                    float(self.longranger_data[s_name]['snps_phased']) * 100.0
                }
            except (KeyError, ValueError):
                pass
            try:
                genes_phased_pct[s_name] = {
                    'genes_phased_pct':
                    float(
                        self.longranger_data[s_name]['genes_phased_lt_100kb'])
                    * 100.0
                }
            except (KeyError, ValueError):
                pass
        phase_plot_cats = [OrderedDict(), OrderedDict(), OrderedDict()]
        phase_plot_cats[0]['longest_phase_block'] = {
            'name': 'Longest Phase Block'
        }
        phase_plot_cats[0]['n50_phase_block'] = {'name': 'N50 of Phase Blocks'}
        phase_plot_cats[1]['snps_phased_pct'] = {'name': '% SNPs Phased'}
        phase_plot_cats[2]['genes_phased_pct'] = {
            'name': '% Genes < 100kbp in a single phase block'
        }
        if len(phase_pdata) > 0:
            self.add_section(
                name='Phasing',
                anchor='longranger-phasing',
                description=
                'Phasing performance from Long Ranger. Genes are only considered if &le; 100kbp in length and with at least one heterozygous SNP.',
                helptext='''
                        * Longest phased
                            * Size of the longest phase block, in base pairs
                        * N50 phased
                            * N50 length of the called phase blocks, in base pairs.
                        * % SNPs phased
                            * Percentage of called SNPs that were phased.
                        * % Genes Phased
                            * Percentage of genes shorter than 100kb with >1 heterozygous SNP that are phased into a single phase block.
                        ''',
                plot=bargraph.plot(
                    [phase_pdata, snps_phased_pct, genes_phased_pct],
                    phase_plot_cats, {
                        'id': 'longranger-phasing-plot',
                        'title': 'Long Ranger: Phasing Statistics',
                        'data_labels': [{
                            'name': 'N50 Phased',
                            'ylab': 'N50 of called phase blocks (bp)'
                        }, {
                            'name': '% SNPs Phased',
                            'ylab': '% SNPs Phased',
                            'ymax': 100
                        }, {
                            'name': '% Genes Phased',
                            'ylab': '% Genes Phased',
                            'ymax': 100
                        }],
                        'cpswitch': False,
                        'stacking': None,
                        'ylab': 'N50 of called phase blocks (bp)'
                    }))

        ### Bar plot of mapping statistics
        mapping_counts_data = {}
        for s_name in self.longranger_data:
            mapped_reads = float(
                self.longranger_data[s_name]['number_reads']) * float(
                    self.longranger_data[s_name]['mapped_reads'])
            unmapped_reads = float(
                self.longranger_data[s_name]['number_reads']) - mapped_reads
            dup_reads = mapped_reads * float(
                self.longranger_data[s_name]['pcr_duplication'])
            unique_reads = mapped_reads - dup_reads
            mapping_counts_data[s_name] = {
                'unique_reads': unique_reads,
                'dup_reads': dup_reads,
                'unmapped_reads': unmapped_reads
            }
        mapping_counts_cats = OrderedDict()
        mapping_counts_cats['unique_reads'] = {
            'name': 'Uniquely Aligned Reads',
            'color': '#437bb1'
        }
        mapping_counts_cats['dup_reads'] = {
            'name': 'PCR Duplicate Aligned Reads',
            'color': '#7cb5ec'
        }
        mapping_counts_cats['unmapped_reads'] = {
            'name': 'Unaligned Reads',
            'color': '#7f0000'
        }
        self.add_section(
            name='Alignment',
            anchor='longranger-alignment',
            description=
            'Long Ranger alignment against the reference genome. To be marked as PCR duplicates, reads must have the same mapping extents on the genome and the same 10x barcode.',
            plot=bargraph.plot(
                mapping_counts_data, mapping_counts_cats, {
                    'id': 'longranger-alignment-plot',
                    'title': 'Long Ranger: Alignment Statistics',
                    'ylab': 'Read Counts',
                    'cpswitch_counts_label': 'Read Counts',
                }))
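The alignment bar plot above reconstructs read counts from fractions in summary.csv (mapped_reads and pcr_duplication are fractions of total and mapped reads respectively). A worked example of that arithmetic, with made-up numbers:

number_reads = 100000000          # total input reads
mapped = number_reads * 0.95      # mapped_reads fraction -> 95,000,000
unmapped = number_reads - mapped  # -> 5,000,000
dup = mapped * 0.10               # pcr_duplication fraction -> 9,500,000
unique = mapped - dup             # -> 85,500,000 uniquely aligned reads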
Example #43
    def parse_bcftools_stats(self):
        """
        Find bcftools stats logs and parse their data
          Bcftools stats reports contain 'sets' of data, which can
          have multiple vcf files each (but usually don't). Here,
          we treat each 'set' as a MultiQC sample, taking the first
          input filename for each set as the name.
        """
        collapse_complementary = getattr(config, 'bcftools', {}).get('collapse_complementary_changes', False)
        if collapse_complementary:
            types = ['A>C', 'A>G', 'A>T', 'C>A', 'C>G', 'C>T']
        else:
            types = ['A>C', 'A>G', 'A>T', 'C>A', 'C>G', 'C>T',
                     'G>A', 'G>C', 'G>T', 'T>A', 'T>C', 'T>G']

        self.bcftools_stats = dict()
        self.bcftools_stats_indels = dict()
        self.bcftools_stats_vqc_snp = dict()
        self.bcftools_stats_vqc_transi = dict()
        self.bcftools_stats_vqc_transv = dict()
        self.bcftools_stats_vqc_indels = dict()
        depth_data = dict()
        for f in self.find_log_files('bcftools/stats'):
            s_names = list()
            for line in f['f'].splitlines():
                s = line.split("\t")
                # Get the sample names - one per 'set'
                if s[0] == "ID":
                    s_name = self.clean_s_name(s[2], f['root'])
                    s_names.append(s_name)
                    if s_name in self.bcftools_stats:
                        log.debug("Duplicate sample name found! Overwriting: {}".format(s_name))
                    self.add_data_source(f, s_name, section='stats')
                    self.bcftools_stats[s_name] = dict()
                    self.bcftools_stats_indels[s_name] = dict()
                    self.bcftools_stats_vqc_snp[s_name] = dict()
                    self.bcftools_stats_vqc_transi[s_name] = dict()
                    self.bcftools_stats_vqc_transv[s_name] = dict()
                    self.bcftools_stats_vqc_indels[s_name] = dict()
                    depth_data[s_name] = OrderedDict()
                    self.bcftools_stats_indels[s_name][0] = None # Avoid joining line across missing 0

                # Parse key stats
                if s[0] == "SN" and len(s_names) > 0:
                    s_name = s_names[int(s[1])]
                    field = s[2].strip()[:-1]
                    field = field.replace(' ', '_')
                    value = float(s[3].strip())
                    self.bcftools_stats[s_name][field] = value

                # Parse transitions/transversions stats
                if s[0] == "TSTV" and len(s_names) > 0:
                    s_name = s_names[int(s[1])]
                    fields = ['ts', 'tv', 'tstv', 'ts_1st_ALT', 'tv_1st_ALT', 'tstv_1st_ALT']
                    # Use 'field', not 'f': rebinding 'f' would clobber the log file dict
                    # still needed by clean_s_name() and add_data_source() on later lines.
                    for i, field in enumerate(fields):
                        value = float(s[i+2].strip())
                        self.bcftools_stats[s_name][field] = value

                # Parse substitution types
                if s[0] == "ST" and len(s_names) > 0:
                    s_name = s_names[int(s[1])]

                    rc = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
                    change = s[2].strip()
                    if change not in types:
                        change = '>'.join(rc[n] for n in change.split('>'))

                    field = 'substitution_type_{}'.format(change)
                    value = float(s[3].strip())
                    if field not in self.bcftools_stats[s_name]:
                        self.bcftools_stats[s_name][field] = 0
                    self.bcftools_stats[s_name][field] += value

                # Indel length distributions
                if s[0] == "IDD" and len(s_names) > 0:
                    s_name = s_names[int(s[1])]
                    length = float(s[2].strip())
                    count = float(s[3].strip())
                    self.bcftools_stats_indels[s_name][length] = count

                # Per-sample counts
                if s[0] == "PSC" and len(s_names) > 0:
                    s_name = s_names[int(s[1])]
                    fields = ['variations_hom', 'variations_het']
                    for i, field in enumerate(fields):
                        self.bcftools_stats[s_name][field] = int(s[i + 4].strip())

                # Depth plots
                if s[0] == "DP" and len(s_names) > 0:
                    s_name = s_names[int(s[1])]
                    bin_name = s[2].strip()
                    percent_sites = float(s[-1].strip())
                    depth_data[s_name][bin_name] = percent_sites

                # Variant Qualities
                if s[0] == "QUAL" and len(s_names) > 0:
                    s_name = s_names[int(s[1])]
                    quality = float(s[2].strip())
                    self.bcftools_stats_vqc_snp[s_name][quality] = float(s[3].strip())
                    self.bcftools_stats_vqc_transi[s_name][quality] = float(s[4].strip())
                    self.bcftools_stats_vqc_transv[s_name][quality] = float(s[5].strip())
                    self.bcftools_stats_vqc_indels[s_name][quality] = float(s[6].strip())

        # Filter to strip out ignored sample names
        self.bcftools_stats = self.ignore_samples(self.bcftools_stats)

        if len(self.bcftools_stats) > 0:

            # Write parsed report data to a file
            self.write_data_file(self.bcftools_stats, 'multiqc_bcftools_stats')

            # Stats Table
            stats_headers = self.bcftools_stats_genstats_headers()
            if getattr(config, 'bcftools', {}).get('write_general_stats', True):
                self.general_stats_addcols(self.bcftools_stats, stats_headers, 'Bcftools Stats')
            if getattr(config, 'bcftools', {}).get('write_separate_table', False):
                self.add_section(
                    name='Bcftools Stats',
                    anchor='bcftools-stats',
                    plot=table.plot(self.bcftools_stats, stats_headers))

            # Make bargraph plot of substitution types
            keys = OrderedDict()
            for t in types:
                keys['substitution_type_{}'.format(t)] = {'name': t}
            pconfig = {
                'id': 'bcftools-stats-subtypes',
                'title': 'Bcftools Stats: Substitutions',
                'ylab': '# Substitutions',
                'cpswitch_counts_label': 'Number of Substitutions'
            }
            self.add_section (
                name = 'Variant Substitution Types',
                anchor = 'bcftools-stats_variant_sub_types',
                plot = bargraph.plot(self.bcftools_stats, keys, pconfig)
            )

            # Make histograms of variant quality
            if len(self.bcftools_stats_vqc_snp) > 0:
                pconfig = {
                    'id': 'bcftools_stats_vqc',
                    'title': 'Bcftools Stats: Variant Quality Count',
                    'ylab': 'Count',
                    'xlab': 'Quality',
                    'xDecimals': False,
                    'ymin': 0,
                    'smooth_points': 600,
                    # 'tt_label': '<b>{point.x} bp trimmed</b>: {point.y:.0f}',
                    'data_labels': [
                        {'name': 'Count SNP', 'ylab': 'Count'},
                        {'name': 'Count Transitions', 'ylab': 'Count'},
                        {'name': 'Count Transversions', 'ylab': 'Count'},
                        {'name': 'Count Indels', 'ylab': 'Count'}
                    ]
                }
                self.add_section (
                    name = 'Variant Quality',
                    anchor = 'bcftools-stats_variant_quality_plot',
                    plot = linegraph.plot (
                        [self.bcftools_stats_vqc_snp,
                        self.bcftools_stats_vqc_transi,
                        self.bcftools_stats_vqc_transv,
                        self.bcftools_stats_vqc_indels], pconfig)
                )

            # Make line graph of indel lengths
            if len(self.bcftools_stats_indels) > 0:
                pconfig = {
                    'id': 'bcftools_stats_indel-lengths',
                    'title': 'Bcftools Stats: Indel Distribution',
                    'ylab': 'Count',
                    'xlab': 'InDel Length (bp)',
                    'xDecimals': False,
                    'ymin': 0,
                }
                self.add_section (
                    name = 'Indel Distribution',
                    anchor = 'bcftools-stats_indel_plot',
                    plot = linegraph.plot(self.bcftools_stats_indels, pconfig)
                )
            # Make line graph of variants per depth
            if len(depth_data) > 0:
                pconfig = {
                    'id': 'bcftools_stats_depth',
                    'title': 'Bcftools Stats: Variant depths',
                    'ylab': 'Fraction of sites (%)',
                    'xlab': 'Variant depth',
                    'ymin': 0,
                    'ymax': 100,
                    'categories': True
                }
                self.add_section (
                    name = 'Variant depths',
                    anchor = 'bcftools-stats_depth_plot',
                    description = 'Read depth support distribution for called variants',
                    plot = linegraph.plot(depth_data, pconfig)
                )

        # Return the number of logs that were found
        return len(self.bcftools_stats)
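For orientation, an abridged bcftools stats fragment showing the line shapes the parser above consumes: the first column is the section key, the second the 'set' index that ties each line back to the sample registered by its ID line (the values here are invented):

example_lines = [
    'ID\t0\tsample1.vcf.gz',                # registers set 0 under "sample1"
    'SN\t0\tnumber of SNPs:\t100',          # -> field 'number_of_SNPs' = 100.0
    'TSTV\t0\t70\t30\t2.33\t70\t30\t2.33',  # -> ts, tv, tstv, ... fields
    'ST\t0\tA>C\t10',                       # -> substitution_type_A>C += 10
]
for line in example_lines:
    s = line.split('\t')
    print(s[0], '-> set', s[1])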
Example #44
    def gather_phasing_stats(self):
        # Create headers
        headers = OrderedDict()
        headers['switch rate'] = {
            'title': 'Switch rate',
            'description': 'switch errors as a fraction of possible positions for switch errors',
            'format': '{:,.7f}',
            'placement': 1
        }

        headers['mismatch rate'] = {
            'title': 'Mismatch rate',
            'description': 'mismatch errors as a fraction of possible positions for mismatch errors',
            'format': '{:,.7f}',
            'placement': 2
        }

        headers['flat rate'] = {
            'title': 'Flat rate',
            'description': 'flat errors as a fraction of possible positions for flat errors',
            'format': '{:,.7f}',
            'hidden': True,
        }

        headers['phased count'] = {
            'title': 'Phased count',
            'description': 'count of total SNVs phased in the test haplotype',
            'format': '{:,.0f}',
            'placement': 3
        }

        headers['AN50'] = {
            'title': 'AN50 (Mbp)',
            'description': 'the AN50 metric of haplotype completeness',
            'format': '{:,.3f}',
            'hidden': True
        }

        headers['N50'] = {
            'title': 'N50 (Mbp)',
            'description': 'the N50 metric of haplotype completeness',
            'format': '{:,.3f}',
            'placement': 4
        }

        headers['num snps max blk'] = {
            'title': 'SNPs in max blk',
            'description': 'the fraction of SNVs in the largest (most variants phased) block',
            'format': '{:,.0f}',
            'placement': 5
        }

        # Find and load any input files for this module
        phasing_data = dict()
        for f in self.find_log_files('hapcut2/phasing_stats', filehandles=True):
            sample_name = update_sample_name(f["s_name"])
            phasing_data[sample_name] = dict()

            for parameter, value in self.parse_phasing_stats(f["f"]):
                phasing_data[sample_name][parameter] = value

        if len(phasing_data) > 0:
            # Write parsed report data to a file
            self.write_data_file(phasing_data, "hapcut2_phasing_stats")

            pconfig = {
                'id': 'hapcut2_phasing_stats_table',
                'title': "HapCUT2 phasing stats",
                'scale': False,
                'share_key': False
            }
            table_html = table.plot(phasing_data, headers, pconfig)

            # Add a report section with table
            self.add_section(
                name="HapCUT2 phasing stats",
                description="Statistics table",
                helptext='''
                Description of statistics (taken from https://github.com/vibansal/HapCUT2/tree/master/utilities):
                ''',
                plot=table_html
            )

        return len(phasing_data)
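
The helper `parse_phasing_stats()` is not shown in this example. Below is a hypothetical sketch of what it could look like, assuming the `key: value` lines printed by HapCUT2's `calculate_haplotype_statistics.py` (the exact output format may vary between versions):

def parse_phasing_stats(self, fh):
    # Yield (parameter, value) pairs from "key: value" lines, skipping
    # anything whose value does not parse as a number.
    for line in fh:
        parts = line.rsplit(':', 1)
        if len(parts) != 2:
            continue
        try:
            yield parts[0].strip(), float(parts[1])
        except ValueError:
            continue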
Example No. 45
0
    def parse_plotCoverage(self):
        """Find plotCoverage output. Both stdout and --outRawCounts"""
        self.deeptools_plotCoverageStdout = dict()
        for f in self.find_log_files('deeptools/plotCoverageStdout'):
            parsed_data = self.parsePlotCoverageStdout(f)
            for k, v in parsed_data.items():
                if k in self.deeptools_plotCoverageStdout:
                    log.warning("Replacing duplicate sample {}.".format(k))
                self.deeptools_plotCoverageStdout[k] = v

            if len(parsed_data) > 0:
                self.add_data_source(f, section='plotCoverage')

        self.deeptools_plotCoverageOutRawCounts = dict()
        for f in self.find_log_files('deeptools/plotCoverageOutRawCounts'):
            parsed_data = self.parsePlotCoverageOutRawCounts(f)
            for k, v in parsed_data.items():
                if k in self.deeptools_plotCoverageOutRawCounts:
                    log.warning("Replacing duplicate sample {}.".format(k))
                self.deeptools_plotCoverageOutRawCounts[k] = v

            if len(parsed_data) > 0:
                self.add_data_source(f, section='plotCoverage')

        if len(self.deeptools_plotCoverageStdout) > 0:
            header = OrderedDict()
            header["min"] = {
                'title': 'Min',
                'description': 'Minimum Coverage',
                'shared_key': 'coverage'
            }
            header["25%"] = {
                'rid': 'first_quartile',
                'title': '1st Quartile',
                'description': 'First quartile coverage',
                'shared_key': 'coverage'
            }
            header["50%"] = {
                'rid': 'median',
                'title': 'Median',
                'description': 'Median coverage (second quartile)',
                'shared_key': 'coverage'
            }
            header["mean"] = {
                'title': 'Mean',
                'description': 'Mean coverage',
                'shared_key': 'coverage'
            }
            header["75%"] = {
                'rid': 'third_quartile',
                'title': '3rd Quartile',
                'description': 'Third quartile coverage',
                'shared_key': 'coverage'
            }
            header["max"] = {
                'title': 'Max',
                'description': 'Maximum coverage',
                'shared_key': 'coverage'
            }
            header["std"] = {
                'title': 'Std. Dev.',
                'description': 'Coverage standard deviation',
                'shared_key': 'coverage'
            }
            config = {'namespace': 'deepTools plotCoverage'}
            self.add_section(
                name = "Coverage metrics",
                anchor = "deeptools_coverage_metrics",
                plot = table.plot(self.deeptools_plotCoverageStdout, header, config)
            )

        if len(self.deeptools_plotCoverageOutRawCounts) > 0:
            config = {
                'id': 'deeptools_coverage_metrics_plot',
                'title': 'Coverage distribution',
                'xlab': 'Coverage',
                'ylab': 'Fraction of bases sampled'
            }
            self.add_section(
                name = "Coverage distribution",
                anchor = "deeptools_coverage_distribution",
                description = "The fraction of bases with a given number of read/fragment coverage",
                plot = linegraph.plot(self.deeptools_plotCoverageOutRawCounts, config)
            )

        return len(self.deeptools_plotCoverageStdout), len(self.deeptools_plotCoverageOutRawCounts)
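
The parsing helpers are not shown above. As a sketch, `parsePlotCoverageStdout()` could be written as below, assuming plotCoverage's stdout summary table begins with a header row starting with `sample` (column names are read from that row rather than hard-coded):

def parsePlotCoverageStdout(self, f):
    # f['f'] holds the file contents as a string (filehandles=False).
    parsed = dict()
    columns = None
    for line in f['f'].splitlines():
        fields = line.strip().split('\t')
        if fields and fields[0] == 'sample':
            columns = fields[1:]
            continue
        if columns is None or len(fields) != len(columns) + 1:
            continue
        s_name = self.clean_s_name(fields[0], f['root'])
        parsed[s_name] = {c: float(v) for c, v in zip(columns, fields[1:])}
    return parsed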
Example No. 46
0
    def bustools_section(self):
        """Add bargraphs showing the mean UMIs per barcode and percentages in whitelist"""
        # add the summary table
        tconfig = {
            "namespace": "Bustools",
            "id": "bustools_summary",
            "table_title": "Bustools Summary Table"
        }
        self.add_section(
            name="Summary table",
            anchor="bustools-inspect",
            description=
            "This is a table of the complete output of bustools inspect. Note that some columns are hidden by default (click <em>Configure Columns</em> to show).",
            plot=table.plot(self.bustools_data, self.headers, tconfig),
        )

        # also make some nice barplots
        # barplot for mean umis per sample
        mean_umis = {
            sample: {
                "UMIs per barcode": values["meanUMIsPerBarcode"]
            }
            for sample, values in self.bustools_data.items()
        }

        self.add_section(
            name="Mean number of UMIs per barcode",
            anchor="bustools-umis",
            description=
            "Average number of UMIs (unique molecular identifiers) per barcode",
            helptext=
            "Each unique barcode represents a cell and each Unique Molecular Identifier (UMI) represents "
            "a unique transcript molecule. By counting the mean number of UMIs per barcode, you "
            "effectively calculate the average number of unique transcripts per cell.",
            plot=bargraph.plot(
                mean_umis,
                pconfig={
                    "id": "bus_umis",
                    "title":
                    "Bustools: Mean number of UMIs per barcode per sample",
                    "cpswitch": False,
                    "tt_percentages": False,
                    "ylab": "Mean UMIs per barcode",
                },
            ),
        )

        # barplot for the percentage of reads and barcodes on the whitelist
        percentage_whitelist = {
            sample: {
                "Reads on whitelist": values["percentageReadsOnWhitelist"],
                "Barcodes on whitelist":
                values["percentageBarcodesOnWhitelist"],
            }
            for sample, values in self.bustools_data.items()
        }
        self.add_section(
            name="Percentage in whitelist",
            anchor="bustools-reads",
            description=
            "The whitelist is a list of unique barcodes used in your protocol, either provided or inferred from the data.",
            helptext=
            "Each unique barcode from the whitelist represents a cell. The percentage of "
            "reads with barcode / barcodes in the whitelist is a measure of percentage of reads that could "
            "be asigned to a cell.",
            plot=bargraph.plot(
                percentage_whitelist,
                pconfig={
                    "id": "bus_reads",
                    "title":
                    "Bustools: Barcodes / reads with barcodes in the whitelist",
                    "ymax": 100,
                    "ymix": 0,
                    "cpswitch": False,
                    "tt_percentages": False,
                    "ylab":
                    "Percentage of barcodes / reads with barcodes in the whitelist",
                    "stacking": None,
                    "ylab_format": "{value}%",
                },
            ),
        )
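
`self.bustools_data` and `self.headers` are populated elsewhere in the module. A hypothetical sketch of the data-loading step, assuming a `bustools/inspect` search pattern and that `bustools inspect` was run with JSON output:

import json

def parse_bustools_inspect(self):
    # Load each inspect JSON verbatim; its keys (e.g. meanUMIsPerBarcode,
    # percentageReadsOnWhitelist) are used directly by the sections above.
    self.bustools_data = dict()
    for f in self.find_log_files('bustools/inspect'):
        s_name = self.clean_s_name(f['s_name'], f['root'])
        self.bustools_data[s_name] = json.loads(f['f'])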
Example No. 47
0
    def __init__(self):
        super(MultiqcModule, self).__init__(name='Stacks', anchor='stacks',
        href="http://catchenlab.life.illinois.edu/stacks/",
        info="A software for analyzing restriction enzyme-based data (e.g. RAD-seq).")

        self.gsheaders = OrderedDict()
        self.gsheaders['n_loci'] = {
                'title': '# loci',
                'description': 'Number of loci built',
                'format': '{:,.0f}',
                'scale': 'RdYlGn'
        }
        self.gsheaders['n_used_fw_reads'] = {
                'title': 'K reads used',
                'modify': lambda x: float(x) / 1000.0,
                'description': 'Number of thousand reads used',
                'scale': 'BuGn'
        }
        self.gsheaders['mean_cov'] = {
                'title': 'cov',
                'suffix': 'X',
                'description': 'Mean sequence coverage at locus',
                'scale': 'BuPu',
        }
        self.gsheaders['mean_cov_ns'] = {
                'title': 'weighted cov',
                'suffix': 'X',
                'description': 'The coverage at each locus is weighted by the number of samples present at that locus (i.e. coverage at shared loci counts more)',
                'scale': 'YlGn',
        }

        self.sheaders = OrderedDict()
        self.sheaders['# Pop ID'] = {
                'title': 'PopID',
                'description': 'Population ID as defined in the Population Map file.',
                'scale': False,
                'format': '{:s}'
        }
        self.sheaders['Private'] = {
                'title': 'Private',
                'description': 'Number of private alleles in this population.',
                'scale': 'PuBu',
                'hidden': True
        }
        self.sheaders['Num_Indv'] = {
                'title': '# Indv',
                'description': 'Mean number of individuals per locus in this population.',
                'scale': 'YlGn'
        }
        self.sheaders['P'] = {
                'title': 'P',
                'description': 'Mean frequency of the most frequent allele at each locus in this population.',
                'scale': 'PuBu',
                'min': 0,
                'max': 1
        }
        self.sheaders['Obs_Het'] = {
                'title': 'Obs Het',
                'description': 'Mean observed heterozygosity in this population.',
                'scale': 'YlGn',
                'min': 0,
                'max': 1,
        }
        self.sheaders['Obs_Hom'] = {
                'title': 'Obs Hom',
                'description': 'Mean observed homozygosity in this population.',
                'scale': 'PuBu',
                'min': 0,
                'max': 1,
                'hidden': True
        }
        self.sheaders['Exp_Hom'] = {
                'title': 'Exp Hom',
                'description': 'Mean expected homozygosity in this population.',
                'scale': 'YlGn',
                'min': 0,
                'max': 1,
                'hidden': True
        }
        self.sheaders['Exp_Het'] = {
                'title': 'Exp Het',
                'description': 'Mean expected heterozygosity in this population.',
                'scale': 'PuBu',
                'min': 0,
                'max': 1
        }
        self.sheaders['Pi'] = {
                'title': 'Pi',
                'description': 'Mean value of &#960; in this population.',
                'scale': 'YlGn',
                'min': 0,
                'max': 1
        }
        self.sheaders['Fis'] = {
                'title': 'Fis',
                'description': 'Mean measure of Fis in this population.',
                'scale': 'PuOr',
                'min': -1,
                'max': 1
        }

        num_files = 0
        # Parse gstacks data
        self.cov_data = OrderedDict()
        for f in self.find_log_files('stacks/gstacks'):
            run_name = os.path.dirname(f['root'])
            s_name = self.clean_s_name(os.path.basename(f['root']), run_name)
            try:
                self.cov_data.update(self.parse_gstacks(f['f'], s_name))
                num_files += 1
            except Exception:
                log.error('Could not parse gstacks.distribs file in {}'.format(f['s_name']))

        # Parse populations data
        self.distribs_loci = OrderedDict()
        self.distribs_snps = OrderedDict()
        for f in self.find_log_files('stacks/populations'):
            run_name = os.path.dirname(f['root'])
            s_name = self.clean_s_name(os.path.basename(f['root']), run_name)
            try:
                i, j = self.parse_populations(f['f'], s_name)
                self.distribs_loci.update(i)
                self.distribs_snps.update(j)
                num_files += 1
            except Exception:
                log.error('Could not parse population.log.distribs file in {}'.format(f['s_name']))

        # Parse sumstats file
        self.sumstats_data = OrderedDict()
        for f in self.find_log_files('stacks/sumstats'):
            run_name = os.path.dirname(f['root'])
            s_name = self.clean_s_name(os.path.basename(f['root']), run_name)
            try:
                self.sumstats_data.update(self.parse_sumstats(f['f'], s_name))
                num_files += 1
            except Exception:
                log.error('Could not parse populations.sumstats_summary file in {}'.format(f['s_name']))


        # Ignore samples
        self.cov_data = self.ignore_samples(self.cov_data)
        self.distribs_loci = self.ignore_samples(self.distribs_loci)
        self.distribs_snps = self.ignore_samples(self.distribs_snps)
        self.sumstats_data = self.ignore_samples(self.sumstats_data)

        if len(self.cov_data) == 0 and len(self.sumstats_data) == 0 and len(self.distribs_loci) == 0:
            raise UserWarning
        log.info("Found {} reports".format(num_files))

        # Write parsed report data to a file
        self.write_data_file(self.cov_data, 'multiqc_stacks_cov')
        self.write_data_file(self.sumstats_data, 'multiqc_stacks_sumstats')

        ### Write the sample table
        config_table = {
            'id': 'gstacks_table',
            'namespace': 'stacks'
        }
        self.add_section (
            name = 'Sample statistics',
            anchor = 'stacks-gstacks',
            description = 'The sample specific statistics for Stacks',
            helptext = '''**Note!** The sample names have the following scheme `<run folder name> | <input fastq file prefix>`.
                        This data is obtained from the gstacks program, run after building the sample and catalog loci to merge
                        paired-ends and call variants.
                        These numbers are obtained from the `gstacks.log.distribs` file.''',
            plot = table.plot(self.cov_data, self.gsheaders, config_table)
        )
        # Write population sumstats table
        config_table = {
            'id': 'sumstats_table',
            'namespace': 'stacks'
        }
        self.add_section (
            name = 'Population summary statistics',
            anchor = 'stacks-sumstats',
            description = 'Population statistics as calculated from variant sites found in this run',
            helptext = '''**Note!** The sample names have the following scheme `<run folder name> | <population ID>`,
                        where the population ID is defined in the input population map file.
                        This information is obtained from the Stacks program `population` and the file populations.sumstats_summary.tsv
                        ''',
            plot = table.plot(self.sumstats_data, self.sheaders, config_table)
        )
        config_distribs = {
            'id': 'distribs_plot',
            'namespace': 'stacks',
            'tt_label': '{point.y} loci, {point.x} samples/SNPs',
            'data_labels': [
                {'name': 'Samples per locus', 'ylab': '# loci', 'xlab': '# samples'},
                {'name': 'SNPs per locus', 'ylab': '# loci', 'xlab': '# SNPs'}
            ]
        }
        self.add_section (
            name = 'Population plots',
            anchor = 'stacks-distribs',
            description = 'Plots showing 1) the number of loci shared by a given number of samples and 2) the number of loci with a given number of SNPs',
            helptext = '''The distributions are obtained from the Stacks program `populations` and its output file `populations.log.distribs`.
            These numbers are computed after Stacks' filtering steps.''',
            plot = linegraph.plot([self.distribs_loci, self.distribs_snps], config_distribs)
        )
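
The `parse_gstacks()` helper is not shown. Below is a hypothetical sketch, assuming `gstacks.log.distribs` stores a tab-separated `effective_coverages_per_sample` section between `BEGIN`/`END` markers with the columns used by the table above:

def parse_gstacks(self, contents, s_name):
    # Collect per-sample coverage rows from the BEGIN/END-delimited section.
    data = {}
    in_section = False
    for line in contents.splitlines():
        if line.startswith('BEGIN effective_coverages_per_sample'):
            in_section = True
        elif line.startswith('END'):
            in_section = False
        elif in_section and not line.startswith(('#', 'sample')):
            fields = line.split('\t')
            if len(fields) >= 5:
                key = '{} | {}'.format(s_name, fields[0])
                data[key] = {
                    'n_loci': int(fields[1]),
                    'n_used_fw_reads': int(fields[2]),
                    'mean_cov': float(fields[3]),
                    'mean_cov_ns': float(fields[4]),
                }
    return data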
Example No. 48
0
    def parse_estimateReadFiltering(self):
        """Find estimateReadFiltering output. Only the output from --table is supported."""
        self.deeptools_estimateReadFiltering = dict()
        for f in self.find_log_files('deeptools/estimateReadFiltering'):
            parsed_data = self.parseEstimateReadFilteringFile(f)
            for k, v in parsed_data.items():
                if k in self.deeptools_estimateReadFiltering:
                    log.warning("Replacing duplicate sample {}.".format(k))
                self.deeptools_estimateReadFiltering[k] = v

            if len(parsed_data) > 0:
                self.add_data_source(f, section='estimateReadFiltering')

        if len(self.deeptools_estimateReadFiltering) > 0:
            header = OrderedDict()
            header["M Entries"] = {
                'title': 'M entries',
                'description': 'Number of entries in the file (millions)'
            }
            header["pct_Aligned"] = {
                'title': '% Aligned',
                'description': 'Percent of aligned entries',
                'scale': 'YlGn',
                'min': 0,
                'max': 100
            }
            header["pct_Filtered"] = {
                'title': '% Tot. Filtered',
                'description':
                'Percent of alignment that would be filtered for any reason.',
                'scale': 'OrRd',
                'min': 0,
                'max': 100
            }
            header["pct_Blacklisted"] = {
                'title': '% Blacklisted',
                'description':
                'Percent of alignments falling (at least partially) inside a blacklisted region',
                'scale': 'YlOrRd',
                'min': 0,
                'max': 100
            }
            header["pct_MAPQ"] = {
                'title': '% MAPQ',
                'description':
                'Percent of alignments having MAPQ scores below the specified threshold',
                'scale': 'YlOrBr',
                'min': 0,
                'max': 100
            }
            header["pct_Missing_Flags"] = {
                'title': '% Missing Flags',
                'description':
                'Percent of alignments lacking at least one flag specified by --samFlagInclude',
                'scale': 'PuRd',
                'min': 0,
                'max': 100
            }
            header["pct_Forbidden_Flags"] = {
                'title': '% Forbidden Flags',
                'description':
                'Percent of alignments having at least one flag specified by --samFlagExclude',
                'scale': 'OrRd',
                'min': 0,
                'max': 100
            }
            header["pct_deepTools_Dupes"] = {
                'title': '% deepTools Dupes',
                'description':
                'Percent of alignments marked by deepTools as being duplicates',
                'scale': 'PuRd',
                'min': 0,
                'max': 100
            }
            header["pct_Duplication"] = {
                'title': '% Duplication',
                'description':
                'Percent of alignments originally marked as being duplicates',
                'scale': 'OrRd',
                'min': 0,
                'max': 100
            }
            header["pct_Singletons"] = {
                'title': '% Singletons',
                'description':
                'Percent of alignments that are singletons (i.e., paired-end reads where the mates don\'t align as a pair)',
                'scale': 'OrRd',
                'min': 0,
                'max': 100
            }
            header["pct_Strand_Filtered"] = {
                'title': '% Strand Filtered',
                'description':
                'Percent of alignments arising from the wrong strand',
                'scale': 'OrRd',
                'min': 0,
                'max': 100
            }

            tdata = dict()
            for k, v in self.deeptools_estimateReadFiltering.items():
                tdata[k] = {
                    'M Entries':
                    v['total'] / 1000000.0,
                    'pct_Aligned':
                    100. * v['mapped'] / float(v['total']),
                    'pct_Filtered':
                    100. * v['filtered'] / float(v['total']),
                    'pct_Blacklisted':
                    100. * v['blacklisted'] / float(v['total']),
                    'pct_MAPQ':
                    100. * v['mapq'] / float(v['total']),
                    'pct_Missing_Flags':
                    100. * v['required flags'] / float(v['total']),
                    'pct_Forbidden_Flags':
                    100. * v['excluded flags'] / float(v['total']),
                    'pct_deepTools_Dupes':
                    100. * v['internal dupes'] / float(v['total']),
                    'pct_Duplication':
                    100. * v['dupes'] / float(v['total']),
                    'pct_Singletons':
                    100. * v['singletons'] / float(v['total']),
                    'pct_Strand_Filtered':
                    100. * v['strand'] / float(v['total'])
                }

            config = {'namespace': 'deepTools estimateReadFiltering'}
            self.add_section(
                name="Filtering metrics",
                anchor="estimateReadFiltering",
                description=
                "Estimated percentages of alignments filtered independently for each setting in `estimateReadFiltering`",
                plot=table.plot(tdata, header, config))

        return len(self.deeptools_estimateReadFiltering)
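
One fragile spot in this example is that the header keys and the `tdata` keys must match exactly for a column to render (the original had a stray `pct_Below_MAPQ`, corrected to `pct_MAPQ` above). A sketch of a less repetitive way to build `tdata` that keeps the two in sync by construction (field names are those parsed by the module above):

# Map table column keys to the parsed field each percentage derives from.
PCT_FIELDS = {
    'pct_Aligned': 'mapped',
    'pct_Filtered': 'filtered',
    'pct_Blacklisted': 'blacklisted',
    'pct_MAPQ': 'mapq',
    'pct_Missing_Flags': 'required flags',
    'pct_Forbidden_Flags': 'excluded flags',
    'pct_deepTools_Dupes': 'internal dupes',
    'pct_Duplication': 'dupes',
    'pct_Singletons': 'singletons',
    'pct_Strand_Filtered': 'strand',
}

tdata = dict()
for s_name, v in self.deeptools_estimateReadFiltering.items():
    total = float(v['total'])
    tdata[s_name] = {'M Entries': v['total'] / 1000000.0}
    for col, field in PCT_FIELDS.items():
        tdata[s_name][col] = 100.0 * v[field] / total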
Example No. 49
0
    def slamdunkFilterStatsTable(self):
        """ Take the parsed filter stats from Slamdunk and add it to a separate table """

        headers = OrderedDict()
        headers['mapped'] = {
            'namespace': 'Slamdunk',
            'title': '{} Mapped'.format(config.read_count_prefix),
            'description': '# mapped reads ({})'.format(config.read_count_desc),
            'shared_key': 'read_count',
            'min': 0,
            'format': '{:,.2f}',
            'suffix': config.read_count_prefix,
            'scale': 'YlGn',
            'modify': lambda x: float(x) * config.read_count_multiplier,
        }
        headers['multimapper'] = {
            'namespace': 'Slamdunk',
            'title': '{} Multimap-Filtered'.format(config.read_count_prefix),
            'description': '# multimap-filtered reads ({})'.format(config.read_count_desc),
            'shared_key': 'read_count',
            'min': 0,
            'format': '{:,.2f}',
            'suffix': config.read_count_prefix,
            'scale': 'OrRd',
            'modify': lambda x: float(x) * config.read_count_multiplier,
        }
        headers['nmfiltered'] = {
            'namespace': 'Slamdunk',
            'title': '{} NM-Filtered'.format(config.read_count_prefix),
            'description': '# NM-filtered reads ({})'.format(config.read_count_desc),
            'shared_key': 'read_count',
            'min': 0,
            'format': '{:,.2f}',
            'suffix': config.read_count_prefix,
            'scale': 'OrRd',
            'modify': lambda x: float(x) * config.read_count_multiplier,
        }
        headers['idfiltered'] = {
            'namespace': 'Slamdunk',
            'title': '{} Identity-Filtered'.format(config.read_count_prefix),
            'description': '# identity-filtered reads ({})'.format(config.read_count_desc),
            'shared_key': 'read_count',
            'min': 0,
            'format': '{:,.2f}',
            'suffix': config.read_count_prefix,
            'scale': 'OrRd',
            'modify': lambda x: float(x) * config.read_count_multiplier,
        }
        headers['mqfiltered'] = {
            'namespace': 'Slamdunk',
            'title': '{} MQ-Filtered'.format(config.read_count_prefix),
            'description': '# MQ-filtered reads ({})'.format(config.read_count_desc),
            'shared_key': 'read_count',
            'min': 0,
            'format': '{:,.2f}',
            'suffix': config.read_count_prefix,
            'scale': 'OrRd',
            'modify': lambda x: float(x) * config.read_count_multiplier,
        }
        pconfig = {
            'id': 'slamdunk_filtering_table',
            'min': 0,
        }

        self.add_section (
            name = 'Filter statistics',
            anchor = 'slamdunk_filtering',
            description = 'This table shows the number of reads filtered with each filter criterion during the filtering phase of slamdunk.',
            plot = table.plot(self.slamdunk_data, headers, pconfig)
        )
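
The five filter columns above repeat the same read-count configuration. A sketch of a small factory that stamps out the shared config, varying only the title, description and colour scale (purely illustrative; behaviour is unchanged):

from collections import OrderedDict
from multiqc import config

def _read_count_col(title, description, scale):
    # Shared column config for read-count columns; only the varying
    # parts are passed in.
    return {
        'namespace': 'Slamdunk',
        'title': '{} {}'.format(config.read_count_prefix, title),
        'description': '# {} ({})'.format(description, config.read_count_desc),
        'shared_key': 'read_count',
        'min': 0,
        'format': '{:,.2f}',
        'suffix': config.read_count_prefix,
        'scale': scale,
        'modify': lambda x: float(x) * config.read_count_multiplier,
    }

headers = OrderedDict()
headers['mapped'] = _read_count_col('Mapped', 'mapped reads', 'YlGn')
headers['multimapper'] = _read_count_col('Multimap-Filtered', 'multimap-filtered reads', 'OrRd')
headers['nmfiltered'] = _read_count_col('NM-Filtered', 'NM-filtered reads', 'OrRd')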
Example No. 50
0
    def parse_estimateReadFiltering(self):
        """Find estimateReadFiltering output. Only the output from --table is supported."""
        self.deeptools_estimateReadFiltering = dict()
        for f in self.find_log_files("deeptools/estimateReadFiltering"):
            parsed_data = self.parseEstimateReadFilteringFile(f)
            for k, v in parsed_data.items():
                if k in self.deeptools_estimateReadFiltering:
                    log.warning("Replacing duplicate sample {}.".format(k))
                self.deeptools_estimateReadFiltering[k] = v

            if len(parsed_data) > 0:
                self.add_data_source(f, section="estimateReadFiltering")

        self.deeptools_estimateReadFiltering = self.ignore_samples(
            self.deeptools_estimateReadFiltering)

        if len(self.deeptools_estimateReadFiltering) > 0:
            # Write data to file
            self.write_data_file(self.deeptools_estimateReadFiltering,
                                 "deeptools_read_filtering")

            header = OrderedDict()
            header["M Entries"] = {
                "title": "M entries",
                "description": "Number of entries in the file (millions)"
            }
            header["pct_Aligned"] = {
                "title": "% Aligned",
                "description": "Percent of aligned entries",
                "scale": "YlGn",
                "min": 0,
                "max": 100,
            }
            header["pct_Filtered"] = {
                "title": "% Tot. Filtered",
                "description":
                "Percent of alignment that would be filtered for any reason.",
                "scale": "OrRd",
                "min": 0,
                "max": 100,
            }
            header["pct_Blacklisted"] = {
                "title": "% Blacklisted",
                "description":
                "Percent of alignments falling (at least partially) inside a blacklisted region",
                "scale": "YlOrRd",
                "min": 0,
                "max": 100,
            }
            header["pct_MAPQ"] = {
                "title": "% MAPQ",
                "description":
                "Percent of alignments having MAPQ scores below the specified threshold",
                "scale": "YlOrBn",
                "min": 0,
                "max": 100,
            }
            header["pct_Missing_Flags"] = {
                "title": "% Missing Flags",
                "description":
                "Percent of alignments lacking at least on flag specified by --samFlagInclude",
                "scale": "PuRd",
                "min": 0,
                "max": 100,
            }
            header["pct_Forbidden_Flags"] = {
                "title": "% Forbidden Flags",
                "description":
                "Percent of alignments having at least one flag specified by --samFlagExclude",
                "scale": "OrRd",
                "min": 0,
                "max": 100,
            }
            header["pct_deepTools_Dupes"] = {
                "title": "% deepTools Dupes",
                "description":
                "Percent of alignments marked by deepTools as being duplicates",
                "scale": "PuRd",
                "min": 0,
                "max": 100,
            }
            header["pct_Duplication"] = {
                "title": "% Duplication",
                "description":
                "Percent of alignments originally marked as being duplicates",
                "scale": "OrRd",
                "min": 0,
                "max": 100,
            }
            header["pct_Singletons"] = {
                "title": "% Singletons",
                "description":
                "Percent of alignments that are singletons (i.e., paired-end reads where the mates don't align as a pair",
                "scale": "OrRd",
                "min": 0,
                "max": 100,
            }
            header["pct_Strand_Filtered"] = {
                "title": "% Strand Filtered",
                "description":
                "Percent of alignments arising from the wrong strand",
                "scale": "OrRd",
                "min": 0,
                "max": 100,
            }

            tdata = dict()
            for k, v in self.deeptools_estimateReadFiltering.items():
                tdata[k] = {
                    "M Entries":
                    v["total"] / 1000000.0,
                    "pct_Aligned":
                    100.0 * v["mapped"] / float(v["total"]),
                    "pct_Filtered":
                    100.0 * v["filtered"] / float(v["total"]),
                    "pct_Blacklisted":
                    100.0 * v["blacklisted"] / float(v["total"]),
                    "pct_Below_MAPQ":
                    100.0 * v["mapq"] / float(v["total"]),
                    "pct_Missing_Flags":
                    100.0 * v["required flags"] / float(v["total"]),
                    "pct_Forbidden_Flags":
                    100.0 * v["excluded flags"] / float(v["total"]),
                    "pct_deepTools_Dupes":
                    100.0 * v["internal dupes"] / float(v["total"]),
                    "pct_Duplication":
                    100.0 * v["dupes"] / float(v["total"]),
                    "pct_Singletons":
                    100.0 * v["singletons"] / float(v["total"]),
                    "pct_Strand_Filtered":
                    100.0 * v["strand"] / float(v["total"]),
                }

            config = {"namespace": "deepTools bamPEFragmentSize"}
            self.add_section(
                name="Filtering metrics",
                anchor="estimateReadFiltering",
                description=
                "Estimated percentages of alignments filtered independently for each setting in `estimateReadFiltering`",
                plot=table.plot(tdata, header, config),
            )

        return len(self.deeptools_estimateReadFiltering)
Example No. 51
0
    def quast_table(self):
        """ Write some more statistics about the assemblies in a table. """
        headers = OrderedDict()

        headers['N50'] = {
            'title': 'N50 ({})'.format(self.contig_length_suffix),
            'description':
            'N50 is the contig length such that using longer or equal length contigs produces half (50%) of the bases of the assembly.',
            'min': 0,
            'suffix': self.contig_length_suffix,
            'scale': 'RdYlGn',
            'modify': lambda x: x * self.contig_length_multiplier
        }

        headers['N75'] = {
            'title': 'N75 ({})'.format(self.contig_length_suffix),
            'description':
            'N75 is the contig length such that using longer or equal length contigs produces 75% of the bases of the assembly',
            'min': 0,
            'suffix': self.contig_length_suffix,
            'scale': 'RdYlGn',
            'modify': lambda x: x * self.contig_length_multiplier
        }

        headers['L50'] = {
            'title': 'L50 ({})'.format(self.total_number_contigs_suffix)
                     if self.total_number_contigs_suffix else 'L50',
            'description':
            'L50 is the number of contigs larger than N50, i.e. the minimum number of contigs comprising 50% of the total assembly length.',
            'min': 0,
            'suffix': self.total_number_contigs_suffix,
            'scale': 'RdYlGn-rev',
            'modify': lambda x: x * self.total_number_contigs_multiplier
        }

        headers['L75'] = {
            'title': 'L75 ({})'.format(self.total_number_contigs_suffix)
                     if self.total_number_contigs_suffix else 'L75',
            'description':
            'L75 is the number of contigs larger than N75, i.e. the minimum number of contigs comprising 75% of the total assembly length.',
            'min': 0,
            'suffix': self.total_number_contigs_suffix,
            'scale': 'RdYlGn-rev',
            'modify': lambda x: x * self.total_number_contigs_multiplier
        }
        headers['Largest contig'] = {
            'title': 'Largest contig ({})'.format(self.contig_length_suffix),
            'description': 'The size of the largest contig of the assembly',
            'min': 0,
            'suffix': self.contig_length_suffix,
            'scale': 'YlGn',
            'modify': lambda x: x * self.contig_length_multiplier
        }

        headers['Total length'] = {
            'title': 'Length ({})'.format(self.total_length_suffix),
            'description': 'The total number of bases in the assembly.',
            'min': 0,
            'suffix': self.total_length_suffix,
            'scale': 'YlGn',
            'modify': lambda x: x * self.total_length_multiplier
        }

        headers['# misassemblies'] = {
            'title': 'Misassemblies',
            'description':
            'The number of positions in the assembled contigs where the left flanking sequence aligns over 1 kbp away from the right flanking sequence on the reference (relocation), or they overlap by more than 1 kbp (relocation), or the flanking sequences align on different strands (inversion) or different chromosomes (translocation).',
            'scale': 'RdYlGn-rev',
            'format': '{:,.0f}'
        }
        headers['# mismatches per 100 kbp'] = {
            'title': 'Mismatches/100kbp',
            'description': 'The number of mismatches per 100 kbp',
            'scale': 'YlOrRd',
            'format': '{:,.2f}',
        }
        headers['# indels per 100 kbp'] = {
            'title': 'Indels/100kbp',
            'description': 'The number of indels per 100 kbp',
            'scale': 'YlOrRd',
            'format': '{:,.2f}',
        }
        headers['# genes'] = {
            'title': 'Genes',
            'description': '# Genes',
            'scale': 'YlGnBu',
            'format': '{:,.0f}',
            'shared_key': 'gene_count'
        }
        headers['# genes_partial'] = {
            'title': 'Genes (Partial)',
            'description': '# Genes (Partial)',
            'scale': 'YlGnBu',
            'format': '{:,.0f}',
            'shared_key': 'gene_count'
        }
        headers['# predicted genes (unique)'] = {
            'title': 'Predicted Genes',
            'description': '# Predicted Genes (Unique)',
            'scale': 'YlGnBu',
            'format': '{:,.0f}',
            'shared_key': 'gene_count'
        }
        headers['Genome fraction (%)'] = {
            'title': 'Genome Fraction',
            'description':
            'The total number of aligned bases in the reference, divided by the genome size.',
            'max': 100,
            'suffix': '%',
            'scale': 'YlGn'
        }
        config = {
            'id': 'quast_table',
            'namespace': 'QUAST',
            'min': 0,
        }
        return table.plot(self.quast_data, headers, config)
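
`quast_table()` returns the rendered table HTML rather than adding a section itself, so the caller decides where it goes. A minimal usage sketch (the section name, anchor and description here are illustrative):

self.add_section(
    name='Assembly Statistics',
    anchor='quast-stats',
    description='Statistics about assemblies, as calculated by QUAST.',
    plot=self.quast_table(),
)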
Example No. 52
0
    def verifybamid_table(self):
        """
        Create a table with all the columns from verify BAM ID
        """

        # create an ordered dictionary to preserve the order of columns
        headers = OrderedDict()
        # add each column and the title and description (taken from verifyBAMID website)
        headers["RG"] = {
            "title": "Read Group",
            "description": "ReadGroup ID of sequenced lane.",
            "hidden": all([s["RG"] == "ALL" for s in self.verifybamid_data.values()]),
        }
        if not self.hide_chip_columns:
            headers["CHIP_ID"] = {"title": "Chip ID", "description": "ReadGroup ID of sequenced lane."}
        headers["#SNPS"] = {
            "title": "SNPS",
            "description": "# SNPs passing the criteria from the VCF file",
            "format": "{:,.0f}",
            "min": 0,
            "scale": "BuPu",
        }
        headers["#READS"] = {
            "title": "{} Reads".format(config.read_count_prefix),
            "description": "Number of reads loaded from the BAM file ({})".format(config.read_count_desc),
            "format": "{:,.1f}",
            "modify": lambda x: x * config.read_count_multiplier if x != "NA" else x,
            "shared_key": "read_count",
            "min": 0,
            "scale": "GnBu",
        }
        headers["AVG_DP"] = {
            "title": "Average Depth",
            "description": "Average sequencing depth at the sites in the VCF file",
            "suffix": " X",
            "min": 0,
            "scale": "YlGn",
        }
        # use default columns
        headers["FREEMIX"] = dict(
            self.col_config_defaults,
            **{
                "title": "Contamination (Seq)",
                "description": "VerifyBamID: FREEMIX -   Sequence-only estimate of contamination.",
            },
        )
        headers["FREELK1"] = {
            "title": "FREEELK1",
            "format": "{:,.0f}",
            "description": "Maximum log-likelihood of the sequence reads given estimated contamination under sequence-only method",
            "min": 0,
            "scale": "RdYlGn",
        }
        headers["FREELK0"] = {
            "title": "FREELK0",
            "format": "{:,.0f}",
            "description": "Log-likelihood of the sequence reads given no contamination under sequence-only method",
            "min": 0,
            "scale": "RdYlGn",
        }
        headers["FREE_RH"] = {
            "title": "FREE_RH",
            "description": "Estimated reference bias parameter Pr(refBase|HET) (when --free-refBias or --free-full is used)",
            "hidden": all([s["FREE_RH"] == "NA" for s in self.verifybamid_data.values()]),
        }
        headers["FREE_RA"] = {
            "title": "FREE_RA",
            "description": "Estimated reference bias parameter Pr(refBase|HOMALT) (when --free-refBias or --free-full is used)",
            "hidden": all([s["FREE_RA"] == "NA" for s in self.verifybamid_data.values()]),
        }

        # Only print Chip columns to the report if we have data
        if not self.hide_chip_columns:
            headers["CHIPMIX"] = dict(
                self.col_config_defaults,
                **{
                    "title": "Contamination S+A",
                    "description": "VerifyBamID: CHIPMIX -   Sequence+array estimate of contamination (NA if the external genotype is unavailable)",
                },
            )
            headers["CHIPLK1"] = {
                "title": "CHIPLK1",
                "description": "Maximum log-likelihood of the sequence reads given estimated contamination under sequence+array method (NA if the external genotypes are unavailable)",
            }
            headers["CHIPLK0"] = {
                "title": "CHIPLK0",
                "description": " Log-likelihood of the sequence reads given no contamination under sequence+array method (NA if the external genotypes are unavailable)",
            }
            headers["CHIP_RH"] = {
                "title": "CHIP_RH",
                "description": "Estimated reference bias parameter Pr(refBase|HET) (when --chip-refBias or --chip-full is used)",
            }
            headers["CHIP_RA"] = {
                "title": "CHIP_RA",
                "description": "Estimated reference bias parameter Pr(refBase|HOMALT) (when --chip-refBias or --chip-full is used)",
            }

        headers["DPREF"] = {
            "title": "DPREF",
            "description": "Depth (Coverage) of HomRef site (based on the genotypes of (SELF_SM/BEST_SM), passing mapQ, baseQual, maxDepth thresholds.",
            "hidden": all([s["DPREF"] == "NA" for s in self.verifybamid_data.values()]),
        }
        headers["RDPHET"] = {
            "title": "RDPHET",
            "description": "DPHET/DPREF, Relative depth to HomRef site at Heterozygous site.",
            "hidden": all([s["RDPHET"] == "NA" for s in self.verifybamid_data.values()]),
        }
        headers["RDPALT"] = {
            "title": "RDPALT",
            "description": "DPHET/DPREF, Relative depth to HomRef site at HomAlt site.",
            "hidden": all([s["RDPALT"] == "NA" for s in self.verifybamid_data.values()]),
        }

        tconfig = {
            "namespace": "VerifyBAMID",
            "id": "verifybamid-results",
        }

        # send the plot to add section function with data dict and headers
        self.add_section(
            anchor="verifybamid-table",
            description="The following values provide estimates of sample contamination. Click help for more information.",
            helptext="""
            **Please note that `FREEMIX` is named _Contamination (Seq)_ and `CHIPMIX`
            is named _Contamination (S+A)_ in this MultiQC report.**

            VerifyBamID provides a series of metrics that are informative for determining
            whether a sample is possibly contaminated or swapped, but there is no single
            criterion that works in every circumstance. There are a few unmodeled factors
            in the estimation of `[SELF-IBD]/[BEST-IBD]` and `[%MIX]`, so please note that the
            MLE estimate may not always exactly match the true amount of contamination.
            Here we provide a guideline to flag potentially contaminated/swapped samples:

            * Each sample or lane can be checked in this way.
              When `[CHIPMIX] >> 0.02` and/or `[FREEMIX] >> 0.02`, meaning 2% or more of
              non-reference bases are observed in reference sites, we recommend examining
              the data more carefully for the possibility of contamination.
            * We recommend checking each lane for the possibility of sample swaps.
              When `[CHIPMIX] ~ 1` AND `[FREEMIX] ~ 0`, it is possible that the sample
              has been swapped with another sample. When `[CHIPMIX] ~ 0` in the `.bestSM` file,
              `[CHIP_ID]` might actually be the swapped sample. Otherwise, the swapped
              sample may not exist in the genotype data you have compared against.
            * When genotype data is not available but the allele-frequency-based estimate of
              `[FREEMIX] >= 0.03` and `[FREELK1]-[FREELK0]` is large, it is possible
              that the sample is contaminated with another sample. We recommend using
              per-sample data rather than per-lane data for checking this in low-coverage
              data, because the inference is more confident when there is a large number
              of bases with depth 2 or higher.

            _Copied from the [VerifyBAMID documentation](https://genome.sph.umich.edu/wiki/VerifyBamID) - see the link for more details._
            """,
            plot=table.plot(self.verifybamid_data, headers, tconfig),
        )
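
The repeated `all([s[key] == "NA" for s in ...])` checks that auto-hide empty columns could be factored into a tiny helper; a sketch (illustrative only):

def _all_na(self, key):
    # Hide a column when every sample reports "NA" for it.
    return all(s.get(key) == 'NA' for s in self.verifybamid_data.values())

# Usage, e.g.:
# headers['FREE_RH']['hidden'] = self._all_na('FREE_RH')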
Example No. 53
0
    def __init__(self):
        super(MultiqcModule, self).__init__(name='Supernova', anchor='supernova',
        href="https://www.10xgenomics.com/",
        info="is a de novo genome assembler 10X Genomics linked-reads.")

        # Headers for the supernova Table
        self.headers = OrderedDict()
        self.headers['Asm size'] = {
                'description': 'assembly size (in megabases); only scaffolds >= 10 kb',
                'modify': lambda x: x / 1000000.0,
                'suffix': 'Mb',
                'scale': 'YlGn'
        }
        self.headers['# Long scaffs'] = {
                'description': 'number of scaffolds >= 10 kb',
                'scale': 'YlGn',
                'format': '{:,.0f}',
        }
        self.headers['Scaff N50'] = {
                'description': 'N50 scaffold size (in kilobases)',
                'modify': lambda x: x / 1000.0,
                'suffix': 'Kb',
                'scale': 'RdYlGn'
        }
        self.headers['Phase N50'] = {
                'description': 'N50 phase block size (in kilobases)',
                'modify': lambda x: x / 1000.0,
                'suffix': 'Kb',
                'scale': 'RdYlGn',
                'hidden': True
        }
        self.headers['Contig N50'] = {
                'description': 'N50 contig size (in kilobases)',
                'modify': lambda x: x / 1000.0,
                'suffix': 'Kb',
                'scale': 'RdYlGn',
                'hidden': True
        }
        self.headers['Edge N50'] = {
                'description': 'N50 edge size (in kilobases)',
                'modify': lambda x: x / 1000.0,
                'suffix': 'Kb',
                'scale': 'RdYlGn',
                'hidden': True
        }
        self.headers['Mol size'] = {
                'description': 'weighted mean molecule size (in kilobases); ideal 50-100',
                'modify': lambda x: x / 1000.0,
                'suffix': 'Kb',
                'scale': 'BuGn'
        }
        self.headers['Read len'] = {
                'description': 'mean read length (in bases) after trimming; ideal 140',
                'suffix': 'b',
                'scale': 'PuBu',
                'format': '{:,.0f}',
                'hidden': True
        }
        self.headers['# Reads'] = {
                'description': 'number of reads (in millions); ideal 800M-1200M for human',
                'modify': lambda x: x / 1000000.0,
                'suffix': 'M',
                'scale': 'PuBu',
        }
        self.headers['Coverage'] = {
                'description': 'effective read coverage; ideal ~42 for nominal 56x cov',
                'suffix': 'x',
                'scale': 'PuBu'
        }
        self.headers['% Dup'] = {
                'description': 'fraction of reads that are duplicates',
                'suffix': '%',
                'scale': 'OrRd',
        }
        self.headers['% R2 Q30'] = {
                'description': 'fraction of Q30 bases in read 2; ideal 75-85%',
                'suffix': '%',
                'scale': 'OrRd',
        }
        self.headers['Insert size'] = {
                'description': 'median insert size (in bases); ideal 0.35-0.40 Kb',
                'suffix': 'b',
                'scale': 'OrRd',
                'format': '{:,.0f}',
                'hidden': True
        }
        self.headers['% proper'] = {
                'description': 'fraction of proper read pairs; ideal >= 75%',
                'suffix': '%',
                'scale': 'OrRd',
                'hidden': True
        }
        self.headers['Het dist'] = {
                'description': 'mean distance between heterozygous SNPs (in kilobases)',
                'modify': lambda x: x / 1000.0,
                'suffix': 'Kb',
                'scale': 'BuGn',
        }
        self.headers['% missing BC'] = {
                'description': 'fraction of reads that are not barcoded',
                'suffix': '%',
                'scale': 'BuGn',
        }
        self.headers['Barcode N50'] = {
                'description': 'N50 reads per barcode (in bases)',
                'suffix': 'b',
                'scale': 'BuGn',
                'format': '{:,.0f}',
        }
        self.headers['% Phased'] = {
                'description': 'nonduplicate and phased reads; ideal 45-50%',
                'suffix': '%',
                'scale': 'BuGn',
                'hidden': True
        }

        reports = OrderedDict()
        summaries = OrderedDict()
        molecules = OrderedDict()
        kmers = OrderedDict()
        root_summary = {}

        ### Parse the input log files
        # report.txt files
        for f in self.find_log_files('supernova/report'):
            log.debug("Found report in: {}".format(f['root']))
            sid, data = self.parse_report(f['f'])
            s_name = self.clean_s_name(sid, f['root'])
            if s_name in reports.keys():
                log.debug("Duplicate sample name found! Overwriting: {}".format(s_name))
            reports[s_name] = data
            self.add_data_source(f, s_name=s_name, section='supernova-table')

        # summary.json files
        for f in self.find_log_files('supernova/summary'):
            log.debug("Found summary.json in: {}".format(f['root']))
            try:
                sid, data = self.parse_summary(f['f'])
            except ValueError:
                log.debug("Error parsing JSON file in {}".format(f['root']))
                continue
            except RuntimeError:
                log.debug("Could not find sample_id in JSON file in {}".format(f['root']))
                continue

            s_name = self.clean_s_name(sid, f['root'])
            if s_name in summaries.keys():
                log.debug("Duplicate sample name found! Overwriting: {}".format(s_name))
            summaries[s_name] = data
            self.add_data_source(f, s_name=s_name, section='supernova-table')
            # The plot json files do not contain sample IDs, sadly. So we need to store it somewhere.
            root_summary[f['root']] = sid

        # histogram_molecules.json files
        for f in self.find_log_files('supernova/molecules'):
            log.debug("Found histogram_molecules.json in: {}".format(f['root']))
            try:
                if f['root'] in root_summary.keys():
                    data = self.parse_histogram(f['f'])
                    sid = root_summary[f['root']]
                    s_name = self.clean_s_name(sid, f['root'])
                    molecules[s_name] = data
                    self.add_data_source(f, s_name=s_name, section='supernova-molecules')
            except RuntimeError:
                log.debug("Could not parse JSON file in {}".format(f['root']))
                continue

        # histogram_kmer_count.json files
        for f in self.find_log_files('supernova/kmers'):
            log.debug("Found histogram_kmer_count.json in: {}".format(f['root']))
            try:
                if f['root'] in root_summary.keys():
                    data = self.parse_histogram(f['f'], 400)
                    sid = root_summary[f['root']]
                    s_name = self.clean_s_name(sid, f['root'])
                    kmers[s_name] = data
                    self.add_data_source(f, s_name=s_name, section='supernova-kmers')
            except RuntimeError:
                log.debug("Could not parse JSON file in {}".format(f['root']))
                continue

        # Data from summary.json supersedes data from report.txt
        for sample_id, sum_data in summaries.items():
            if sample_id in reports.keys():
                log.debug("Found summary data for sample {} which supersedes report data".format(sample_id))
                reports[sample_id] = sum_data
        # Ignore cmd-line specified samples
        reports = self.ignore_samples(reports)
        molecules = self.ignore_samples(molecules)
        kmers = self.ignore_samples(kmers)

        if len(reports) == 0:
            raise UserWarning
        else:
            log.info("Found {} reports".format(len(reports.keys())))

        ### Write the report
        self.write_data_file(reports, 'multiqc_supernova')

        config_table = {
            'id': 'supernova_table',
            'namespace': 'supernova'
        }
        self.add_section (
            name = 'Assembly statistics',
            anchor = 'supernova-table',
            description = 'Statistics gathered from the summary report(s) of Supernova. Note! ' \
                    'There are more columns available but they are hidden by default.',
            helptext = 'At a minimum, these numbers are generated from the file `report.txt`, ' \
                    'found in the folder `sampleID/outs/`. If available, the stats in the report ' \
                    'file are superseded by the higher-precision numbers found in the file ' \
                    '`sampleID/outs/assembly/stats/summary.json`.',
            plot = table.plot(reports, self.headers, config_table)
        )

        # N50 barcharts
        n50_cats = [{'Scaff N50': {'name': 'Scaffold N50', 'color': '#66c2a5'}},
                {'Contig N50': {'name': 'Contig N50', 'color': '#fc8d62'}},
                {'Edge N50': {'name': 'Edge N50', 'color': '#8da0cb'}},
                {'Phase N50': {'name': 'Phase block N50', 'color': '#e78ac3'}}
        ]
        config_n50 = {
                'id': 'supernova_n50',
                'title': 'Supernova N50 statistics',
                'cpswitch': False,
                'data_labels': ['Scaffold N50', 'Contig N50', 'Edge N50', 'Phase block N50']
        }
        self.add_section (
            name = 'N50 statistics',
            anchor = 'supernova-n50',
            description = 'Assembly N50 values - the shortest sequence length at 50% of the genome when sorted by size (see [wikipedia](https://en.wikipedia.org/wiki/N50,_L50,_and_related_statistics#N50)).',
            helptext = "Note that assembly size and N50 values are computed after removing scaffolds &le; 10 kb and do not count `N`s: \n\n" \
                    "* **Scaffold N50** - N50 size of scaffolds in bases, \n" \
                    "* **Contig N50** - N50 size of contigs in bases, \n" \
                    "* **Edge N50** - N50 size of raw graph assembly edges in bases, \n" \
                    "* **Phase block N50** - N50 size of phase blocks in bases. \n\n" \
                    '[(source)](https://support.10xgenomics.com/de-novo-assembly/software/pipelines/latest/output/asm-stats)',
            plot = bargraph.plot([reports,reports,reports,reports], n50_cats, config_n50)
        )

        # Conditional sections
        if len(molecules) > 0:
            # Remove the long tail
            max_x = self.trim_tail(molecules, 100000)
            # Add molecules plot
            config_molecules = {
                'id': 'supernova_molecules',
                'title': 'Supernova Molecule Lengths',
                'xlab': 'Inferred molecule length (bp)',
                'ylab': '# molecules',
                'smooth_points': 300,
                'smooth_points_sumcounts': True,
                'xmax': max_x
            }
            self.add_section (
                name = 'Molecule Lengths',
                anchor = 'supernova-molecules',
                description = 'Shows the inferred molecule lengths of the input 10X library.',
                helptext = 'Inferred in the `patch` step of the Supernova pipeline. It is worth ' \
                        'keeping in mind that the mean molecule length from the report is a length-weighted mean. ' \
                        'See the [source code](https://github.com/10XGenomics/supernova/search?q=lw_mean_mol_len&type=) ' \
                        'for how this value is calculated.',
                plot = linegraph.plot(molecules, config_molecules)
            )
        if len(kmers) > 0:
            # Remove the long tail
            max_x = self.trim_tail(kmers, 50)

            # Add kmers plot
            config_kmers = {
                'id': 'supernova_kmers',
                'title': 'Supernova Kmer Counts',
                'xlab': 'Filtered kmer multiplicity',
                'ylab': 'Counts',
                'smooth_points_sumcounts': False,
                'xmax': max_x
            }
            self.add_section (
                name = 'K-mer counts',
                anchor = 'supernova-kmers',
                description = 'Shows the k-mer frequencies of the input data to Supernova (after filtering).',
                helptext = 'This data is generated by k-merizing the input read data: the sequences are ' \
                        'transformed into the set of all possible sub-sequences of a fixed length `K` (Supernova uses `K=48`). ' \
                        'The x-axis shows the multiplicity of these k-mers (i.e. how many times each is repeated) ' \
                        'and the y-axis the number of distinct k-mers at each multiplicity level. ' \
                        'A careful reading of this plot can give some insight into the levels of heterozygosity and repeats ' \
                        'in the genome that was sequenced, and an indication of whether the sequencing experiment was successful.',
                plot = linegraph.plot(kmers, config_kmers)
            )
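
As a side note to the k-mer helptext above, the multiplicity histogram it describes can be sketched in a few lines of plain Python (a toy illustration, not Supernova's implementation; Supernova works on filtered reads with K=48):

from collections import Counter

def kmer_multiplicity_histogram(reads, k):
    """Count every k-mer, then count how many distinct k-mers share each multiplicity."""
    kmers = Counter()
    for read in reads:
        for i in range(len(read) - k + 1):
            kmers[read[i:i + k]] += 1
    # x: multiplicity, y: number of distinct k-mers with that multiplicity
    return Counter(kmers.values())

print(kmer_multiplicity_histogram(["ACGTACGT", "ACGTAAAA"], 4))
# Counter({1: 5, 2: 1, 3: 1})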
Example no. 54
0
def parse_reports(self):
    """ Find Picard HsMetrics reports and parse their data """

    # Set up vars
    self.picard_HsMetrics_data = dict()

    # Go through logs and find Metrics
    for f in self.find_log_files('picard/hsmetrics', filehandles=True):
        parsed_data = dict()
        s_name = None
        keys = None
        commadecimal = None
        for l in f['f']:
            # New log starting
            if ('CalculateHsMetrics' in l or 'CollectHsMetrics' in l) and 'INPUT' in l:
                s_name = None
                keys = None

                # Pull sample name from input
                fn_search = re.search(r"INPUT=(\[?[^\s]+\]?)", l)
                if fn_search:
                    s_name = os.path.basename(fn_search.group(1).strip('[]'))
                    s_name = self.clean_s_name(s_name, f['root'])
                    parsed_data[s_name] = dict()

            if s_name is not None:
                if 'picard.analysis.directed.HsMetrics' in l and '## METRICS CLASS' in l:
                    keys = f['f'].readline().strip("\n").split("\t")
                elif keys:
                    vals = l.strip("\n").split("\t")
                    if len(vals) == len(keys):
                        j = 'NA'
                        if keys[0] == 'BAIT_SET':
                            j = vals[0]
                        parsed_data[s_name][j] = dict()
                        # Check that we're not using commas for decimal places
                        if commadecimal is None:
                            for i, k in enumerate(keys):
                                if k.startswith('PCT_'):
                                    if ',' in vals[i]:
                                        commadecimal = True
                                    else:
                                        commadecimal = False
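                        # e.g. locale-formatted Picard output writes PCT_ values
                        # as '0,95'; in that case '.' (a thousands separator) is
                        # stripped and ',' becomes the decimal point below,
                        # before the float() conversion.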
                        for i, k in enumerate(keys):
                            try:
                                if commadecimal:
                                    vals[i] = vals[i].replace('.', '')
                                    vals[i] = vals[i].replace(',', '.')
                                parsed_data[s_name][j][k] = float(vals[i])
                            except ValueError:
                                parsed_data[s_name][j][k] = vals[i]
                    else:
                        s_name = None
                        keys = None

        # Remove empty dictionaries
        for s_name in list(parsed_data.keys()):
            for j in list(parsed_data[s_name].keys()):
                if len(parsed_data[s_name][j]) == 0:
                    parsed_data[s_name].pop(j, None)
            if len(parsed_data[s_name]) == 0:
                parsed_data.pop(s_name, None)

        # Manipulate sample names if multiple baits found
        for s_name in parsed_data.keys():
            for j in parsed_data[s_name].keys():
                this_s_name = s_name
                if len(parsed_data[s_name]) > 1:
                    this_s_name = "{}: {}".format(s_name, j)
                if this_s_name in self.picard_HsMetrics_data:
                    log.debug("Duplicate sample name found in {}! Overwriting: {}".format(f['fn'], this_s_name))
                self.add_data_source(f, this_s_name, section='HsMetrics')
                self.picard_HsMetrics_data[this_s_name] = parsed_data[s_name][j]

    # Filter to strip out ignored sample names
    self.picard_HsMetrics_data = self.ignore_samples(
        self.picard_HsMetrics_data)

    if len(self.picard_HsMetrics_data) > 0:

        # Write parsed data to a file
        self.write_data_file(self.picard_HsMetrics_data,
                             'multiqc_picard_HsMetrics')

        # Add to general stats table
        # Swap question marks with -1
        data = self.picard_HsMetrics_data
        for s_name in data:
            if data[s_name].get('FOLD_ENRICHMENT') == '?':
                data[s_name]['FOLD_ENRICHMENT'] = -1

        self.general_stats_headers['FOLD_ENRICHMENT'] = {
            'title': 'Fold Enrichment',
            'min': 0,
            'format': '{:,.0f}',
            'scale': 'Blues',
        }
        try:
            covs = config.picard_config['general_stats_target_coverage']
            assert isinstance(covs, list)
            assert len(covs) > 0
            covs = [str(i) for i in covs]
            log.debug("Custom Picard coverage thresholds: {}".format(", ".join(covs)))
        except (AttributeError, TypeError, KeyError, AssertionError):
            covs = ['30']
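        # e.g. in the MultiQC config file (YAML):
        #     picard_config:
        #         general_stats_target_coverage: [10, 30, 100]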
        for c in covs:
            self.general_stats_headers['PCT_TARGET_BASES_{}X'.format(c)] = {
                'id': 'picard_target_bases_{}X'.format(c),
                'title': 'Target Bases {}X'.format(c),
                'description': 'Percent of target bases with coverage &ge; {}X'.format(c),
                'max': 100,
                'min': 0,
                'suffix': '%',
                'format': '{:,.0f}',
                'scale': 'RdYlGn',
                'modify': self.multiply_hundred,
            }
        for s_name in data:
            if s_name not in self.general_stats_data:
                self.general_stats_data[s_name] = dict()
            self.general_stats_data[s_name].update(data[s_name])
        data_table = _clean_table(data)
        self.add_section(name='HSMetrics',
                         anchor='picard_hsmetrics',
                         plot=table.plot(data_table, _get_headers(data_table)))
        tbases = _add_target_bases(data)
        self.add_section(name=tbases['name'],
                         anchor=tbases['anchor'],
                         description=tbases['description'],
                         plot=tbases['plot'])
        hs_pen = _add_hs_penalty(data)
        if hs_pen is not None:
            self.add_section(name=hs_pen['name'],
                             anchor=hs_pen['anchor'],
                             description=hs_pen['description'],
                             plot=hs_pen['plot'])

    # Return the number of detected samples to the parent module
    return len(self.picard_HsMetrics_data)
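
For reference, the `## METRICS CLASS` block that the parser above walks through follows Picard's generic metrics layout: a marker line, one tab-separated header row, then one or more data rows. A minimal stand-alone sketch of the same idea (toy input, not the module code):

import io

picard_output = io.StringIO(
    "## METRICS CLASS\tpicard.analysis.directed.HsMetrics\n"
    "BAIT_SET\tPCT_TARGET_BASES_30X\n"
    "exome_v1\t0.95\n"
)

metrics = {}
for line in picard_output:
    if line.startswith("## METRICS CLASS"):
        keys = next(picard_output).rstrip("\n").split("\t")
        vals = next(picard_output).rstrip("\n").split("\t")
        metrics = dict(zip(keys, vals))

print(metrics)  # {'BAIT_SET': 'exome_v1', 'PCT_TARGET_BASES_30X': '0.95'}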
Example no. 55
0
    def __init__(self):
        super(MultiqcModule, self).__init__(
            name="Stacks",
            anchor="stacks",
            href="http://catchenlab.life.illinois.edu/stacks/",
            info="Software for analyzing restriction enzyme-based data (e.g. RAD-seq).",
            doi="10.1111/mec.12354",
        )

        self.gsheaders = OrderedDict()
        self.gsheaders["n_loci"] = {
            "title": "# loci",
            "description": "Number of loci built",
            "format": "{:,.i}",
            "scale": "RdYlGn",
        }
        self.gsheaders["n_used_fw_reads"] = {
            "title": "K reads used",
            "modify": lambda x: float(x) / 1000.0,
            "description": "Number of thousand reads used",
            "scale": "BuGn",
        }
        self.gsheaders["mean_cov"] = {
            "title": "cov",
            "suffix": "X",
            "description": "Mean sequence coverage at locus",
            "scale": "BuPu",
        }
        self.gsheaders["mean_cov_ns"] = {
            "title": "weighted cov",
            "suffix": "X",
            "description":
            "The coverage at each locus is weighted by the number of samples present at that locus (i.e. coverage at shared loci counts more)",
            "scale": "YlGn",
        }

        self.sheaders = OrderedDict()
        self.sheaders["# Pop ID"] = {
            "title": "PopID",
            "description":
            "Population ID as defined in the Population Map file.",
            "scale": False,
            "format": "{:,.s}",
        }
        self.sheaders["Private"] = {
            "title": "Private",
            "description": "Number of private alleles in this population.",
            "scale": "PuBu",
            "hidden": True,
        }
        self.sheaders["Num_Indv"] = {
            "title": "# Indv",
            "description":
            "Mean number of individuals per locus in this population.",
            "scale": "YlGn",
        }
        self.sheaders["P"] = {
            "title": "P",
            "description":
            "Mean frequency of the most frequent allele at each locus in this population.",
            "scale": "PuBu",
            "min": 0,
            "max": 1,
        }
        self.sheaders["Obs_Het"] = {
            "title": "Obs Het",
            "description": "Mean observed heterozygosity in this population.",
            "scale": "YlGn",
            "min": 0,
            "max": 1,
        }
        self.sheaders["Obs_Hom"] = {
            "title": "Obs Hom",
            "description": "Mean observed homozygosity in this population.",
            "scale": "PuBu",
            "min": 0,
            "max": 1,
            "hidden": True,
        }
        self.sheaders["Exp_Hom"] = {
            "title": "Exp_Hom",
            "description": "Mean expected homozygosity in this population.",
            "scale": "YlGn",
            "min": 0,
            "max": 1,
            "hidden": True,
        }
        self.sheaders["Exp_Het"] = {
            "title": "Exp Het",
            "description": "Mean expected heterozygosity in this population.",
            "scale": "PuBu",
            "min": 0,
            "max": 1,
        }
        self.sheaders["Pi"] = {
            "title": "Pi",
            "description": "Mean value of &#960; in this population.",
            "scale": "YlGn",
            "min": 0,
            "max": 1,
        }
        self.sheaders["Fis"] = {
            "title": "Fis",
            "description": "Mean measure of Fis in this population.",
            "scale": "PuOr",
            "min": -1,
            "max": 1,
        }

        num_files = 0
        # Parse gstacks data
        self.cov_data = OrderedDict()
        for f in self.find_log_files("stacks/gstacks"):
            run_name = os.path.dirname(f["root"])
            s_name = self.clean_s_name(os.path.basename(f["root"]),
                                       f,
                                       root=run_name)
            try:
                self.cov_data.update(self.parse_gstacks(f["f"], s_name))
                self.add_data_source(f, section="gstacks")
                num_files += 1
            except Exception:
                log.error("Could not parse gstacks.distribs file in {}".format(f["s_name"]))

        # Parse populations data
        self.distribs_loci = OrderedDict()
        self.distribs_snps = OrderedDict()
        for f in self.find_log_files("stacks/populations"):
            run_name = os.path.dirname(f["root"])
            s_name = self.clean_s_name(os.path.basename(f["root"]),
                                       f,
                                       root=run_name)
            try:
                i, j = self.parse_populations(f["f"], s_name)
                self.distribs_loci.update(i)
                self.distribs_snps.update(j)
                self.add_data_source(f, section="populations")
                num_files += 1
            except Exception:
                log.error("Could not parse population.log.distribs file in {}".format(f["s_name"]))

        # Parse sumstats file
        self.sumstats_data = OrderedDict()
        for f in self.find_log_files("stacks/sumstats"):
            run_name = os.path.dirname(f["root"])
            s_name = self.clean_s_name(os.path.basename(f["root"]),
                                       f,
                                       root=run_name)
            try:
                self.sumstats_data.update(self.parse_sumstats(f["f"], s_name))
                self.add_data_source(f, section="sumstats")
                num_files += 1
            except Exception:
                log.error("Could not parse populations.sumstats_summary file in {}".format(f["s_name"]))

        # Ignore samples
        self.cov_data = self.ignore_samples(self.cov_data)
        self.distribs_loci = self.ignore_samples(self.distribs_loci)
        self.distribs_snps = self.ignore_samples(self.distribs_snps)
        self.sumstats_data = self.ignore_samples(self.sumstats_data)

        if len(self.cov_data) == 0 and len(self.sumstats_data) == 0 and len(
                self.distribs_loci) == 0:
            raise UserWarning
        log.info("Found {} reports".format(num_files))

        # Write parsed report data to a file
        self.write_data_file(self.cov_data, "multiqc_stacks_cov")
        self.write_data_file(self.sumstats_data, "multiqc_stacks_sumstats")

        ### Write the sample table
        config_table = {"id": "gstacks_table", "namespace": "stacks"}
        self.add_section(
            name="Sample statistics",
            anchor="stacks-gstacks",
            description="The sample specific statistics for Stacks",
            helptext=
            """**Note!** The sample names have the following scheme: `<run folder name> | <input fastq file prefix>`.
                        This data is obtained from the gstacks program, which is run after building sample and catalog loci
                        to merge paired-end reads and call variants.
                        These numbers are obtained from the `gstacks.log.distribs` file.""",
            plot=table.plot(self.cov_data, self.gsheaders, config_table),
        )
        # Write population sumstats table
        config_table = {"id": "sumstats_table", "namespace": "stacks"}
        self.add_section(
            name="Population summary statistics",
            anchor="stacks-sumstats",
            description=
            "Population statistics as calculated from variant sites found in this run",
            helptext=
            """**Note!** The sample names have the following scheme `<run folder name> | <population ID>`,
                        where the population ID is defined in the input population map file.
                        This information is obtained from the Stacks program `population` and the file populations.sumstats_summary.tsv
                        """,
            plot=table.plot(self.sumstats_data, self.sheaders, config_table),
        )
        config_distribs = {
            "id": "distribs_plot",
            "title": "Stacks: Population plots",
            "namespace": "stacks",
            "tt_label": "{point.y} loci, {point.x} samples/SNPs",
            "ylab": "# loci",
            "data_labels": [
                {"name": "Samples per locus", "ylab": "# loci", "xlab": "# samples"},
                {"name": "SNPs per locus", "ylab": "# loci", "xlab": "# SNPs"},
            ],
        }
        self.add_section(
            name="Population plots",
            anchor="stacks-distribs",
            description=
            "Plots showing (1) the number of loci shared by a given number of samples and (2) the number of loci with a given number of SNPs",
            helptext=
            """The distributions are obtained from the Stacks program `populations` and its output file `populations.log.distribs`.
            These numbers reflect Stacks' post-filtering output.""",
            plot=linegraph.plot([self.distribs_loci, self.distribs_snps],
                                config_distribs),
        )
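
The `linegraph.plot` call above takes one dict per dataset, and `data_labels` must supply one entry per dict. A minimal sketch of the expected input shape (hypothetical sample names and values, using the same `multiqc.plots.linegraph` import these modules rely on):

from multiqc.plots import linegraph

# One dict per dataset; each maps sample -> {x: y}.
distribs_loci = {"run1 | pop1": {1: 120, 2: 340, 3: 95}}
distribs_snps = {"run1 | pop1": {0: 50, 1: 400, 2: 105}}

config = {
    "id": "distribs_plot_demo",
    "title": "Stacks: Population plots (demo)",
    "data_labels": [
        {"name": "Samples per locus", "ylab": "# loci", "xlab": "# samples"},
        {"name": "SNPs per locus", "ylab": "# loci", "xlab": "# SNPs"},
    ],
}
html = linegraph.plot([distribs_loci, distribs_snps], config)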
Example no. 56
0
 def odgi_stats_table(self):
     """
     Detailed odgi stats in this extra table.
     """
     headers = OrderedDict()
     headers["length"] = {
         "title": "Length",
         "description": "Graph length in nucleotides.",
         "scale": "BuPu",
         "format": "{:,.0f}",
     }
     headers["nodes"] = {
         "title": "Nodes",
         "description": "Number of nodes in the graph.",
         "scale": "OrRd",
         "format": "{:,.0f}",
     }
     headers["edges"] = {
         "title": "Edges",
         "description": "Number of edges in the graph.",
         "scale": "PuBu",
         "format": "{:,.0f}",
     }
     headers["paths"] = {
         "title": "Paths",
         "description": "Number of paths in the graph.",
         "scale": "Greens",
         "format": "{:,.0f}",
     }
     headers["components"] = {
         "title": "Components",
         "description":
         "Number of weakly connected components in the graph.",
         "scale": "Oranges",
         "format": "{:,.0f}",
     }
     headers["A"] = {
         "title": "A",
         "description": "Number of adenine bases in the graph.",
         "scale": "Spectral",
         "format": "{:,.0f}",
         "shared_key": "nucleotides",
     }
     headers["C"] = {
         "title": "C",
         "description": "Number of cytosine bases in the graph.",
         "scale": "Greys",
         "format": "{:,.0f}",
         "shared_key": "nucleotides",
     }
     headers["T"] = {
         "title": "T",
         "description": "Number of thymine bases in the graph.",
         "scale": "Blues",
         "format": "{:,.0f}",
         "shared_key": "nucleotides",
     }
     headers["G"] = {
         "title": "G",
         "description": "Number of guanine bases in the graph.",
         "scale": "RdPu",
         "format": "{:,.0f}",
         "shared_key": "nucleotides",
     }
     headers["N"] = {
         "title": "N",
         "description": "Number of `N` basis in the graph.",
         "scale": "Set3",
         "format": "{:,.0f}",
         "shared_key": "nucleotides",
     }
     headers["total"] = {
         "title": "Self Loops Nodes",
         "description":
         "Total number of nodes having self loops in the graph.",
         "scale": "Set1",
         "hidden": True,
         "format": "{:,.0f}",
     }
     headers["unique"] = {
         "title": "Unique Self Loops Nodes",
         "description":
         "Number of unique nodes having self loops in the graph.",
         "scale": "Set2",
         "hidden": True,
         "format": "{:,.0f}",
     }
     headers["pct_gc"] = {
         "title": "% GC",
         "description": "Percent of G/C bases in the graph.",
         "scale": "Spectral",
         "max": 100,
         "min": 0,
         "suffix": "%",
         "hidden": True,
     }
     headers["pct_n"] = {
         "title": "% N",
         "description": "Percent of N bases in the graph.",
         "scale": "Reds",
         "max": 100,
         "min": 0,
         "suffix": "%",
         "hidden": True,
     }
     # Some of the headers are quite general and can clash with other modules.
     # Prepend odgi_ to keep them unique
     prefix_headers = OrderedDict()
     prefix_data = {}
     for h, v in headers.items():
         prefix_headers[f"odgi_{h}"] = v
     for s_name, d in self.odgi_stats_map.items():
         prefix_data[s_name] = {}
         for h in headers:
             prefix_data[s_name][f"odgi_{h}"] = d[h]
     tconfig = {
         "id": "odgi_table",
         "namespace": "ODGI",
         "table_title": "ODGI Stats",
     }
     self.add_section(
         name="Detailed ODGI stats table.",
         anchor="extended_odgi_stats",
         plot=table.plot(prefix_data, prefix_headers, tconfig),
     )
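
Prefixing both the header keys and the per-sample data keys, as done above, is what keeps the column IDs unique across modules. A tiny stand-alone illustration of the same pattern (hypothetical data):

from collections import OrderedDict

headers = OrderedDict([("length", {"title": "Length"}), ("nodes", {"title": "Nodes"})])
data = {"graph1": {"length": 12345, "nodes": 678}}

prefix_headers = OrderedDict((f"odgi_{k}", v) for k, v in headers.items())
prefix_data = {s: {f"odgi_{k}": d[k] for k in headers} for s, d in data.items()}

print(list(prefix_headers))   # ['odgi_length', 'odgi_nodes']
print(prefix_data["graph1"])  # {'odgi_length': 12345, 'odgi_nodes': 678}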
Example no. 57
0
    def parse_bamPEFragmentSize(self):
        """Find bamPEFragmentSize output. Only the output from --table is supported."""
        self.deeptools_bamPEFragmentSize = dict()
        for f in self.find_log_files('deeptools/bamPEFragmentSize'):
            parsed_data = self.parseBamPEFile(f)
            for k, v in parsed_data.items():
                if k in self.deeptools_bamPEFragmentSize:
                    log.warning("Replacing duplicate sample {}.".format(k))
                self.deeptools_bamPEFragmentSize[k] = v

            if len(parsed_data) > 0:
                self.add_data_source(f, section='bamPEFragmentSize')
        if len(self.deeptools_bamPEFragmentSize) > 0:
            headersSE = OrderedDict()
            headersSE["Reads Sampled"] = {
                'title': '# Sampled',
                'description': 'Number of reads sampled',
                'format': '{:,.0f}'
            }
            headersSE["Read Len. Min."] = {
                'title': 'Min',
                'description': 'Minimum read length',
                'format': '{:,.0f}',
                'shared_key': 'read_length'
            }
            headersSE["Read Len. 1st. Qu."] = {
                'title': '1st Quartile',
                'description': '1st quartile read length',
                'format': '{:,.0f}',
                'shared_key': 'read_length'
            }
            headersSE["Read Len. Mean"] = {
                'title': 'Mean',
                'description': 'Mean read length',
                'shared_key': 'read_length'
            }
            headersSE["Read Len. Median"] = {
                'title': 'Median',
                'description': 'Median read length',
                'format': '{:,.0f}',
                'shared_key': 'read_length'
            }
            headersSE["Read Len. 3rd Qu."] = {
                'title': '3rd Quartile',
                'description': '3rd quartile read length',
                'format': '{:,.0f}',
                'shared_key': 'read_length'
            }
            headersSE["Read Len. Max"] = {
                'title': 'Max',
                'description': 'Maximum read length',
                'format': '{:,.0f}',
                'shared_key': 'read_length'
            }
            headersSE["Read Len. Std."] = {
                'title': 'Std. Dev.',
                'description': 'Read length standard deviation',
                'shared_key': 'read_length'
            }
            headersSE["Read Med. Abs. Dev."] = {
                'title': 'MAD',
                'description': 'Read length median absolute deviation',
                'shared_key': 'read_length'
            }
            config = {'namespace': 'deepTools bamPEFragmentSize'}
            self.add_section(name="Read lengths",
                             anchor="bamPEFragmentSize",
                             plot=table.plot(self.deeptools_bamPEFragmentSize, headersSE, config))

            headersPE = OrderedDict()
            headersPE["Frag. Sampled"] = {
                'title': '# Sampled',
                'description': 'Number of fragments sampled',
                'format': '{:,.0f}'
            }
            headersPE["Frag. Len. Min."] = {
                'title': 'Min',
                'description': 'Minimum fragment length',
                'format': '{:,.0f}',
                'shared_key': 'frag_length'
            }
            headersPE["Frag. Len. 1st. Qu."] = {
                'title': '1st Quartile',
                'description': '1st quartile fragment length',
                'format': '{:,.0f}',
                'shared_key': 'frag_length'
            }
            headersPE["Frag. Len. Mean"] = {
                'title': 'Mean',
                'description': 'Mean fragment length',
                'format': '{:,.0f}',
                'shared_key': 'frag_length'
            }
            headersPE["Frag. Len. Median"] = {
                'title': 'Median',
                'description': 'Median fragment length',
                'format': '{:,.0f}',
                'shared_key': 'frag_length'
            }
            headersPE["Frag. Len. 3rd Qu."] = {
                'title': '3rd Quartile',
                'description': '3rd quartile fragment length',
                'format': '{:,.0f}',
                'shared_key': 'frag_length'
            }
            headersPE["Frag. Len. Max"] = {
                'title': 'Max',
                'description': 'Maximum fragment length',
                'format': '{:,.0f}',
                'shared_key': 'frag_length'
            }
            headersPE["Frag. Len. Std."] = {
                'title': 'Std. Dev.',
                'description': 'Fragment length standard deviation',
                'shared_key': 'frag_length'
            }
            headersPE["Frag. Med. Abs. Dev."] = {
                'title': 'MAD',
                'description': 'Fragment length median absolute deviation',
                'shared_key': 'frag_length'
            }

            # Are there any PE datasets?
            PE = False
            for k, v in self.deeptools_bamPEFragmentSize.items():
                if 'Frag. Len. Min.' in v:
                    PE = True
                    break
            if PE:
                self.add_section(name="Fragment lengths",
                                 anchor="bamPEFragmentSize",
                                 plot=table.plot(self.deeptools_bamPEFragmentSize, headersPE, config))

            # Read length plot
            config = {'data_labels': [dict(name="Read length distribution", title="Read length distribution", ylab="Read length (bases)"),
                                      dict(name="Fragment length distribution", title="Fragment length distribution", ylab="Fragment length (bases)")],
                      'id': 'bamPEFragmentSize',
                      'title': 'Read/Fragment length distribution',
                      'namespace': 'deepTools bamPEFragmentSize',
                      'ylab': "Read length (bases)",
                      'xlab': "Percentile"}
            SE = dict()
            PE = dict()
            for k, v in self.deeptools_bamPEFragmentSize.items():
                SE[k] = {0: v['Read Len. Min.'],
                         10: v['Read Len. 10%'],
                         20: v['Read Len. 20%'],
                         25: v['Read Len. 1st. Qu.'],
                         30: v['Read Len. 30%'],
                         40: v['Read Len. 40%'],
                         50: v['Read Len. Median'],
                         60: v['Read Len. 60%'],
                         70: v['Read Len. 70%'],
                         75: v['Read Len. 3rd Qu.'],
                         80: v['Read Len. 80%'],
                         90: v['Read Len. 90%'],
                         99: v['Read Len. 99%'],
                         100: v['Read Len. Max']}
                if 'Frag. Len. Min.' not in v:
                    continue
                PE[k] = {0: v['Frag. Len. Min.'],
                         10: v['Frag. Len. 10%'],
                         20: v['Frag. Len. 20%'],
                         25: v['Frag. Len. 1st. Qu.'],
                         30: v['Frag. Len. 30%'],
                         40: v['Frag. Len. 40%'],
                         50: v['Frag. Len. Median'],
                         60: v['Frag. Len. 60%'],
                         70: v['Frag. Len. 70%'],
                         75: v['Frag. Len. 3rd Qu.'],
                         80: v['Frag. Len. 80%'],
                         90: v['Frag. Len. 90%'],
                         99: v['Frag. Len. 99%'],
                         100: v['Frag. Len. Max']}
            self.add_section(name="Read/fragment length distribution",
                             anchor="bamPEFragmentSize",
                             plot=linegraph.plot([SE, PE], config))

        return len(self.deeptools_bamPEFragmentSize)
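
The 'MAD' columns above report the median absolute deviation. For reference, a short stand-alone sketch of how that statistic is defined (plain Python, not module code):

import statistics

def median_abs_deviation(values):
    """MAD: the median of the absolute deviations from the median."""
    med = statistics.median(values)
    return statistics.median(abs(v - med) for v in values)

print(median_abs_deviation([150, 180, 200, 210, 900]))  # 20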
Example no. 58
0
    def add_instrument_data(self):

        bgcols = {"low confidence": "#f8d7da", "medium confidence": "#fff3cd", "high confidence": "#d1e7dd"}
        cond_formatting_rules = {
            "pass": [{"s_eq": "high confidence"}],
            "warn": [{"s_eq": "medium confidence"}],
            "fail": [{"s_eq": "low confidence"}],
        }
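        # MultiQC evaluates these rules per table cell: "s_eq" is an exact
        # string match, so a cell equal to "high confidence" is styled as
        # "pass" and picks up the matching background colour from bgcols.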

        general_data = {}
        for sample, instrument_data in self.instrument.items():
            general_data[sample] = {
                "instrument": " / ".join(sorted(instrument_data.get("Instrument").split(" or "))),
                "confidence": instrument_data.get("Confidence"),
            }

        general_headers = OrderedDict()
        general_headers["instrument"] = {
            "title": "Predicted Instrument",
            "description": "Predicted instrument from ngsderive",
        }
        general_headers["confidence"] = {
            "title": "Instrument: Confidence",
            "description": "Level of confidence (low, medium, high) that the predicted instrument is correct.",
            "bgcols": bgcols,
            "cond_formatting_rules": cond_formatting_rules,
            "hidden": True,
        }
        self.general_stats_addcols(general_data, general_headers)

        instruments = set()

        for d in general_data.values():
            instruments.update(d.get("instrument").split(" / "))

        # move multiple instruments to the end if it exists
        instruments = sorted(instruments)
        if "multiple instruments" in instruments:
            instruments.remove("multiple instruments")
            instruments.append("multiple instruments")

        headers = OrderedDict()
        for instrument in instruments:
            headers[instrument] = {
                "title": instrument,
                "description": f"Predicted {instrument} from ngsderive",
                "bgcols": bgcols,
                "cond_formatting_rules": cond_formatting_rules,
            }
        headers["basis"] = {
            "title": "Instrument: Basis",
            "description": "Basis upon which the prediction was made.",
        }

        table_data = {}
        for sample, instrument_data in self.instrument.items():
            table_data[sample] = {}
            for instrument in instrument_data.get("Instrument").split(" or "):
                table_data[sample][instrument] = instrument_data.get("Confidence")
            table_data[sample]["basis"] = instrument_data.get("Basis")

        # Config for the plot
        config = {
            "id": "ngsderive_instruments_plot",
            "title": "ngsderive: Instruments",
        }

        self.add_section(
            name="Instrument",
            anchor="ngsderive-instrument",
            description="""Predicted instrument provided by ngsderive. For more information, please see
            [the documentation](https://stjudecloud.github.io/ngsderive/subcommands/instrument/).""",
            plot=table.plot(table_data, headers, config),
        )
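
The instrument table built above is intentionally sparse: each sample only gets a value in the columns for the instruments ngsderive predicted for it, plus the shared "basis" column. A toy illustration of the resulting `table_data` shape (hypothetical sample names and values):

table_data = {
    "sample_A": {"HiSeq 2500": "high confidence", "basis": "instrument ids"},
    "sample_B": {
        "HiSeq 2500": "low confidence",
        "NovaSeq": "low confidence",
        "basis": "flowcell ids",
    },
}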
Example no. 59
0
    def __init__(self):
        super(MultiqcModule, self).__init__(
            name='SomaticQC',
            anchor='somaticqc',
            href="https://github.com/imgag/ngs-bits",
            info="calculates QC metrics based on tumor-normal pairs.")

        # quality parameters from qcML with name, accession, description
        self.qcml = dict()
        # qc data for each sample
        self.qcdata = dict()
        # parse qcml files
        for f in self.find_log_files('somaticqc',
                                     filecontents=True,
                                     filehandles=False):
            self.add_data_source(f)
            s_name = self.clean_s_name(f['s_name'], f['root'])
            # try to split Sample1-Sample2 names
            ms = re.match(r'([^-]+)-[^-]+', s_name)
            if ms:
                s_name = ms.group(1)
            self.qcdata[s_name] = self.parse_qcml(f['f'])

        # ignore samples if requested
        self.qcdata = self.ignore_samples(self.qcdata)

        # warn if no samples found
        if len(self.qcdata) == 0:
            raise UserWarning

        # parse somatic variant rate value
        for _, kv in self.qcdata.items():
            try:
                kv['somatic variant rate'] = float(
                    re.sub(r'(low|moderate|high) \(([0-9.]+) var/Mb\)', r'\2',
                           kv['somatic variant rate']))
            except (KeyError, ValueError):
                kv.pop('somatic variant rate', None)
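            # e.g. 'high (23.4 var/Mb)' -> 23.4; a value that does not match
            # the pattern passes through re.sub unchanged, fails float(), and
            # the key is dropped instead.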

        # prepare table headers, use name and description from qcML
        headers = {
            qp_key: {
                'namespace': "SomaticQC",
                'title': qp_key,
                'description': qp_entry['description'],
            }
            for qp_key, qp_entry in self.qcml.items()
        }

        headers['sample correlation'].update({
            'format': '{:,.2f}',
            'max': 1,
            'scale': 'BuGn'
        })
        headers['variant count'].update({
            'format': '{:,.0f}',
            'title': 'variant count',
            'scale': 'Reds'
        })
        headers['somatic variant count'].update({
            'format': '{:,.0f}',
            'scale': 'RdPu'
        })
        headers['known somatic variants %'].update({
            'suffix': '%',
            'format': '{:,.2f}',
            'max': 100,
            'scale': 'Oranges'
        })
        headers['somatic indel %'].update({
            'suffix': '%',
            'format': '{:,.2f}',
            'minRange': 20,
            'ceiling': 20,
            'scale': 'Purples'
        })
        headers['somatic variant rate'].update({
            'suffix': 'Variants/Mb',
            'format': '{:,.2f}',
            'min': 0,
            'minRange': 10,
            'ceiling': 10,
            'scale': 'Blues'
        })

        try:
            headers['somatic transition/transversion ratio'].update({
                'format': '{:,.2f}',
                'minRange': 5,
                'ceiling': 5,
                'scale': 'RdYlGn'
            })
        except KeyError:
            pass
        except KeyError:
            pass

        try:
            headers['tumor content estimate'].update({
                'suffix': '%',
                'format': '{:,.2f}',
                'max': 100,
                'scale': 'Greens'
            })
        except KeyError:
            pass

        # rename 'variant count' key to prevent duplicate ID with 'variant count' from VariantQC
        headers['variant count (SomaticQC)'] = headers.pop('variant count')
        for _, kv in self.qcdata.items():
            kv['variant count (SomaticQC)'] = kv.pop('variant count')

        # general stats table: add sample correlation and somatic variant columns
        self.general_stats_addcols(
            self.qcdata,
            self.dict_ordered_subset(
                headers, ('sample correlation', 'somatic variant count',
                          'known somatic variants %')))

        # write full data set to file
        self.write_data_file(self.qcdata, 'multiqc_somaticqc')

        # table with general values
        self.add_section(
            name='Overview',
            anchor='somaticqc-general',
            description='',
            plot=table.plot(
                self.qcdata,
                self.dict_ordered_subset(
                    headers,
                    ('sample correlation', 'variant count (SomaticQC)',
                     'somatic variant count', 'known somatic variants %',
                     'somatic indel %',
                     'somatic transition/transversion ratio',
                     'somatic variant rate', 'tumor content estimate')),
                pconfig={'namespace': 'SomaticQC'}))

        # bar plot with variant count values
        self.add_section(
            name='Somatic Variant Count',
            anchor='somaticqc-somatic-variant-count',
            description=self.make_description(['somatic variant count']),
            plot=bargraph.plot(self.qcdata,
                               self.dict_ordered_subset(
                                   headers, ('somatic variant count', )),
                               pconfig={
                                   'namespace': 'SomaticQC',
                                   'id':
                                   'somaticqc-somatic-variant-count-plot',
                                   'title': 'SomaticQC: Somatic Variant Count',
                                   'ylab': 'count',
                                   'yDecimals': False,
                                   'cpswitch': False,
                                   'tt_decimals': 0,
                                   'tt_suffix': '',
                                   'tt_percentages': False
                               }))
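
`dict_ordered_subset` is a helper defined elsewhere in this module. Judging from how it is used above, it picks the named keys out of `headers` in the requested order and skips any that are missing, roughly like this sketch (an assumption, not the actual implementation):

from collections import OrderedDict

def dict_ordered_subset(d, keys):
    # Keep only the requested keys, in the requested order,
    # silently skipping any that are absent.
    return OrderedDict((k, d[k]) for k in keys if k in d)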
Example no. 60
0
    def addTable(self):
        """Take the parsed stats from MultiVCFAnalyzer and add it to the MVCF Table"""
        headers = OrderedDict()

        headers["allPos"] = {
            "title": "Bases in Final Alignment",
            "description": "Length of FASTA file in base pairs (bp)",
            "scale": "BrBG",
            "shared_key": "calls",
            "format": "{:,.0f}",
        }
        headers["SNP Calls (all)"] = {
            "title": "SNPs",
            "description": "Total number of non-reference calls made",
            "scale": "OrRd",
            "shared_key": "snp_call",
            "format": "{:,.0f}",
        }
        headers["Heterozygous SNP alleles (percent)"] = {
            "title": "% Hets",
            "description": "Percentage of heterozygous SNP alleles",
            "scale": "PuBu",
            "max": 100,
            "min": 0,
        }
        headers["SNP Calls (hom)"] = {
            "title": "Hom SNPs",
            "description":
            "Total number of non-reference calls passing homozygosity thresholds",
            "scale": "RdYlGn",
            "shared_key": "snp_call",
            "format": "{:,.0f}",
        }
        headers["SNP Calls (het)"] = {
            "title": "Het SNPs",
            "description":
            "Total number of non-reference calls not passing homozygosity thresholds",
            "scale": "RdYlGn",
            "shared_key": "snp_call",
            "format": "{:,.0f}",
        }
        headers["discardedVarCall"] = {
            "title": "Discarded SNP Call",
            "description":
            "Number of non-reference positions not reaching genotyping or coverage thresholds",
            "scale": "PuCr",
            "shared_key": "calls",
            "format": "{:,.0f}",
        }
        headers["filteredVarCall"] = {
            "title": "Filtered SNP Call",
            "description":
            "Number of positions ignored defined in user-supplied filter list",
            "scale": "RdGy",
            "shared_key": "calls",
            "format": "{:,.0f}",
        }
        headers["refCall"] = {
            "title": "Reference Calls",
            "description": "Number of reference calls made",
            "scale": "Spectral",
            "shared_key": "calls",
            "format": "{:,.0f}",
        }
        headers["discardedRefCall"] = {
            "title": "Discarded Reference Call",
            "description":
            "Number of reference positions not reaching genotyping or coverage thresholds",
            "scale": "YlGnBu",
            "shared_key": "calls",
            "format": "{:,.0f}",
        }
        headers["coverage (fold)"] = {
            "title": "Average Call Coverage",
            "description": "Average number of reads covering final calls",
            "scale": "OrRd",
            "shared_key": "coverage",
            "suffix": "X",
        }
        headers["coverage (percent)"] = {
            "title": "% Reference with Calls",
            "description":
            "Percent coverage of all positions with final calls",
            "scale": "PuBuGn",
            "shared_key": "coverage",
            "suffix": "%",
            "max": 100,
            "min": 0,
        }
        headers["unhandledGenotype"] = {
            "title": "Unhandled Genotypes",
            "description":
            "Number of positions discarded due to presence of more than one alternate allele",
            "scale": "BuPu",
            "shared_key": "snp_count",
            "format": "{:,.0f}",
        }
        headers["noCall"] = {
            "title": "Positions with No Call",
            "description":
            "Number of positions with no call made as reported by GATK",
            "scale": "GnBu",
            "shared_key": "calls",
            "format": "{:,.0f}",
        }

        # Separate table config
        table_config = {
            # Name for grouping; prepended to descriptions and shown in the Config Columns modal
            "namespace": "MultiVCFAnalyzer",
            # ID used for the table
            "id": "mvcf-table",
            # Title of the table, used in the column config modal
            "table_title": "MultiVCFAnalyzer Results",
        }
        tab = table.plot(self.mvcf_data, headers, table_config)
        return tab
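
Unlike the other examples, this method returns the rendered table instead of registering a section itself; presumably the caller wires it up along these lines (a hypothetical caller, following the `add_section` pattern used throughout):

self.add_section(
    name="MultiVCFAnalyzer",
    anchor="multivcfanalyzer",
    description="Summary statistics parsed from MultiVCFAnalyzer output.",
    plot=self.addTable(),
)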