def qorts_genebodycoverage_plot(self):
    """ Make a beeswarm plot of the GeneBodyCoverage values """
    keys = [
        'GeneBodyCoverage_Overall_Mean',
        'GeneBodyCoverage_Overall_Median',
        'GeneBodyCoverage_LowExpress_Mean',
        'GeneBodyCoverage_LowExpress_Median',
        'GeneBodyCoverage_UMQuartile_Mean',
        'GeneBodyCoverage_UMQuartile_Median'
    ]
    cats = OrderedDict()
    for k in keys:
        # Human-readable column title, e.g.
        # 'GeneBodyCoverage_LowExpress_Mean' -> 'Low Express Mean'
        name = k.replace('GeneBodyCoverage_', '')
        name = name.replace('_', ' ')
        # FIX: use raw strings here - "\g" is an invalid string escape
        # (SyntaxWarning on modern Python). The regex itself is unchanged:
        # insert a space between a lowercase/uppercase letter pair.
        name = re.sub(r"([a-z])([A-Z])", r"\g<1> \g<2>", name)
        cats[k] = {
            'title': name,
            'min': 0,
            'max': 1,
        }
    # Config for the plot
    pconfig = {
        'id': 'qorts_gene_body_coverage',
        'title': 'QoRTs: Gene Body Coverage'
    }
    self.add_section(name='Gene Body Coverage', plot=beeswarm.plot(self.qorts_data, cats, pconfig))
def bam_statplot(self):
    """Add a 'Read Counts' beeswarm section summarising RNA-SeQC read metrics."""
    pconfig = {"id": "rna_seqc_bam_stat_beeswarm", "title": "RNA-SeQC: Read metrics"}
    metric_names = (
        "Total Read Number",
        "Alternative Alignments",
        "Chimeric Reads",
        "Duplicate Reads",
        "End 1 Mapped Reads",
        "End 2 Mapped Reads",
        "End 1 Mismatches",
        "End 2 Mismatches",
        "End 1 Sense",
        "End 2 Sense",
        "Ambiguous Reads",
        "High Quality Reads",
        "Low Quality Reads",
        "Mapped Duplicate Reads",
        "Mapped Reads",
        "Mapped Unique Reads",
        "Non-Globin Reads",
        "Non-Globin Duplicate Reads",
        "rRNA Reads",
        "Unique Mapping, Vendor QC Passed Reads",
    )
    # All columns share the read_count key so scaling/suffix stay consistent
    keys = OrderedDict(
        (name, {"title": name, "shared_key": "read_count", "suffix": config.read_count_prefix})
        for name in metric_names
    )
    self.add_section(
        name="Read Counts",
        anchor="rna_seqc_bam_stat",
        description="Number of reads ({}) falling into different categories.".format(config.read_count_desc),
        helptext="Note that many of these statistics are only available from RNA-SeQC v2.x",
        plot=beeswarm.plot(self.rna_seqc_metrics, keys, pconfig),
    )
def bam_statplot(self):
    """Build the RNA-SeQC read-metrics beeswarm and add it as a section."""
    plot_conf = {'id': 'rna_seqc_bam_stat_beeswarm', 'title': 'RNA-SeQC: Read metrics'}
    keys = OrderedDict()
    for col in [
        'Total Read Number', 'Alternative Alignments', 'Chimeric Reads',
        'Duplicate Reads', 'End 1 Mapped Reads', 'End 2 Mapped Reads',
        'End 1 Mismatches', 'End 2 Mismatches', 'End 1 Sense', 'End 2 Sense',
        'Ambiguous Reads', 'High Quality Reads', 'Low Quality Reads',
        'Mapped Duplicate Reads', 'Mapped Reads', 'Mapped Unique Reads',
        'Non-Globin Reads', 'Non-Globin Duplicate Reads', 'rRNA Reads',
        'Unique Mapping, Vendor QC Passed Reads',
    ]:
        # One beeswarm column per metric, all sharing the read-count key
        keys[col] = {'title': col, 'shared_key': 'read_count', 'suffix': config.read_count_prefix}
    self.add_section(
        name='Read Counts',
        anchor='rna_seqc_bam_stat',
        description='Number of reads ({}) falling into different categories.'.format(config.read_count_desc),
        helptext='Note that many of these statistics are only available from RNA-SeQC v2.x',
        plot=beeswarm.plot(self.rna_seqc_metrics, keys, plot_conf),
    )
def __init__(self, c_id, mod):
    """Build one Custom Content module from a parsed content bundle.

    c_id -- content ID; used for the default section name and anchor.
    mod  -- dict with 'config' (user options) and 'data' (plot data).
    """
    modname = mod['config'].get('section_name', c_id.replace('_', ' ').title())
    if modname == '' or modname is None:
        modname = 'Custom Content'
    # Initialise the parent object
    super(MultiqcModule, self).__init__(
        name=modname,
        anchor=mod['config'].get('section_anchor', c_id),
        href=mod['config'].get('section_href'),
        info=mod['config'].get('description')
    )
    pconfig = mod['config'].get('pconfig', {})
    if pconfig.get('title') is None:
        pconfig['title'] = modname
    # Look the plot type up once instead of on every branch
    plot_type = mod['config'].get('plot_type')
    # Table
    if plot_type == 'table':
        pconfig['sortRows'] = pconfig.get('sortRows', False)
        headers = mod['config'].get('headers')
        self.add_section(plot=table.plot(mod['data'], headers, pconfig))
        self.write_data_file(mod['data'], "multiqc_{}".format(modname.lower().replace(' ', '_')))
    # Bar plot
    elif plot_type == 'bargraph':
        self.add_section(plot=bargraph.plot(mod['data'], mod['config'].get('categories'), pconfig))
    # Line plot
    elif plot_type == 'linegraph':
        self.add_section(plot=linegraph.plot(mod['data'], pconfig))
    # Scatter plot
    elif plot_type == 'scatter':
        self.add_section(plot=scatter.plot(mod['data'], pconfig))
    # Heatmap
    elif plot_type == 'heatmap':
        self.add_section(plot=heatmap.plot(mod['data'], mod['config'].get('xcats'), mod['config'].get('ycats'), pconfig))
    # Beeswarm plot
    elif plot_type == 'beeswarm':
        self.add_section(plot=beeswarm.plot(mod['data'], pconfig))
    # Raw HTML
    elif plot_type == 'html':
        self.add_section(content=mod['data'])
    # Raw image file as html
    elif plot_type == 'image':
        self.add_section(content=mod['data'])
    # Not supplied
    elif plot_type is None:  # FIX: 'is None' rather than '== None' (PEP 8)
        log.warning("Plot type not found for content ID '{}'".format(c_id))
    # Not recognised
    else:
        log.warning("Error - custom content plot type '{}' not recognised for content ID {}".format(plot_type, c_id))
def comp_qm_geometry_descriptor_chart(self, geometry_descriptor="bonds", min=0.0, max=2.0):
    """ Make the geometry_descriptor section table and beeswarm plot.

    geometry_descriptor -- key into self.comp_qm_data ('bonds', ...).
    min / max -- value range for the colour scale and beeswarm axis.
        NOTE(review): these shadow the builtins; kept for backward
        compatibility with keyword callers.

    The table colour scale is driven by 'dmin'/'dmax' whereas the beeswarm
    range uses 'min'/'max', so the header dict is built twice with different
    bound keys (see table.py's use of mqc_colour_scale).
    """
    sample_keys = list(self.comp_qm_data[geometry_descriptor].keys())

    def _build_headers(bounds):
        # Hack: use the first data set's keys as the column list
        hdrs = OrderedDict()
        for key in self.comp_qm_data[geometry_descriptor][sample_keys[0]].keys():
            entry = {
                'title': key,
                'description': 'Estimated Mulliken Charge',  # NOTE(review): looks copy-pasted from the Mulliken chart - confirm
                'suffix': '',
                'scale': 'Spectral',  # 'RdBu',
                'ceiling': max,
                'floor': min,  # known issue with a negative range
                'format': '{:,.2f}',
                'shared_key': 'torsion_range'
            }
            entry.update(bounds)
            hdrs[key] = entry
        return hdrs

    headers = _build_headers({'dmin': min, 'dmax': max})
    log.debug(str(headers))
    # Config for the plot
    config = {
        'namespace': 'Geometry',
        'id': 'comp_qm_geometry',
        'title': 'Geometry'
    }
    self.add_section(name=geometry_descriptor,
                     anchor='comp_qm_geometry',
                     plot=table.plot(self.comp_qm_data[geometry_descriptor], headers, config))
    # Rebuild headers with 'min'/'max' for the beeswarm range (dmin/dmax are
    # only honoured by the table colour scale).
    headers = _build_headers({'min': min, 'max': max})
    # FIX: this section previously reused the table's anchor
    # ('comp_qm_geometry'), producing duplicate HTML anchors. Use a distinct
    # one, consistent with the Mulliken chart's _table/_beeswarm anchors.
    self.add_section(name=geometry_descriptor,
                     anchor='comp_qm_geometry_beeswarm',
                     plot=beeswarm.plot(self.comp_qm_data[geometry_descriptor], headers, config))
def comp_qm_mulliken_chart(self):
    """Add the Mulliken-charge table and its matching beeswarm plot.

    The table colour scale is set via 'dmin'/'dmax' (see table.py's
    mqc_colour_scale usage) while the beeswarm range uses 'min'/'max',
    so headers are built twice with different bound keys.
    """
    mulliken = self.comp_qm_data['mulliken']
    sample_keys = list(mulliken.keys())

    def _headers(bounds):
        # Hack: take the column list from the first sample's keys
        built = OrderedDict()
        for key in mulliken[sample_keys[0]].keys():
            col = {
                'title': key,
                'description': 'Mulliken Charge',
                'suffix': '',
                'scale': 'Spectral',  # alternative: 'RdBu'
                'ceiling': 2.0,
                'floor': -2.0,  # known issue with a negative range
                'format': '{:,.2f}',
                'shared_key': 'mulliken_range',
            }
            col.update(bounds)
            built[key] = col
        return built

    headers = _headers({'dmin': -2.0, 'dmax': 2.0})
    log.debug(str(headers))
    # Config for the plot
    config = {
        'namespace': 'comp_qm',
        'id': 'comp_qm_mulliken',
        'title': 'Mulliken Charges',
    }
    self.add_section(
        name='Mulliken Charges',
        anchor='comp_qm_mulliken_table',
        plot=table.plot(mulliken, headers, config),
    )
    headers = _headers({'min': -2.0, 'max': 2.0})
    self.add_section(
        name='Mulliken Charges: Beeswarm plot',
        anchor='comp_qm_mulliken_beeswarm',
        plot=beeswarm.plot(mulliken, headers, config),
    )
def __init__(self, c_id, mod):
    """Initialise a Custom Content module and render its plot into self.intro.

    c_id -- content ID; used for the default section name and anchor.
    mod  -- dict with 'config' (user options) and 'data' (plot data).
    """
    modname = mod['config'].get('section_name', c_id.replace('_', ' ').title())
    # Initialise the parent object
    super(MultiqcModule, self).__init__(
        name=modname,
        anchor=mod['config'].get('section_anchor', c_id),
        href=mod['config'].get('section_href'),
        info=mod['config'].get('description'))
    pconfig = mod['config'].get('pconfig', {})
    if pconfig.get('title') is None:
        pconfig['title'] = modname
    # Look the plot type up once instead of on every branch
    plot_type = mod['config'].get('plot_type')
    # Table
    if plot_type == 'table':
        pconfig['sortRows'] = pconfig.get('sortRows', False)
        self.intro += table.plot(mod['data'], None, pconfig)
    # Bar plot
    elif plot_type == 'bargraph':
        self.intro += bargraph.plot(mod['data'], mod['config'].get('categories'), pconfig)
    # Line plot
    elif plot_type == 'linegraph':
        self.intro += linegraph.plot(mod['data'], pconfig)
    # Scatter plot
    elif plot_type == 'scatter':
        self.intro += scatter.plot(mod['data'], pconfig)
    # Heatmap
    elif plot_type == 'heatmap':
        self.intro += heatmap.plot(mod['data'], mod['config'].get('xcats'), mod['config'].get('ycats'), pconfig)
    # Beeswarm plot
    elif plot_type == 'beeswarm':
        self.intro += beeswarm.plot(mod['data'], pconfig)
    # Not supplied
    elif plot_type is None:  # FIX: 'is None' rather than '== None' (PEP 8)
        log.warning("Plot type not found for content ID '{}'".format(c_id))
    # Not recognised
    else:
        log.warning(
            "Error - custom content plot type '{}' not recognised for content ID {}"
            .format(plot_type, c_id))
def bismark_methlyation_chart(self):
    """Build and return the cytosine-methylation beeswarm plot."""
    # Shared column config: percentages shown to one decimal place
    shared = {'max': 100, 'min': 0, 'suffix': '%', 'decimalPlaces': 1}
    labels = (
        ('percent_cpg_meth', 'Methylated CpG'),
        ('percent_chg_meth', 'Methylated CHG'),
        ('percent_chh_meth', 'Methylated CHH'),
    )
    keys = OrderedDict()
    for metric, title in labels:
        column = dict(shared)
        column['title'] = title
        keys[metric] = column
    return beeswarm.plot(self.bismark_data['methextract'], keys, {'id': 'bismark-methylation-dp'})
def bismark_methlyation_chart(self):
    """Add the cytosine-methylation beeswarm section."""
    # Percentage columns, one decimal place
    percent_col = {"max": 100, "min": 0, "suffix": "%", "decimalPlaces": 1}
    keys = OrderedDict(
        (metric, dict(percent_col, title=title))
        for metric, title in [
            ("percent_cpg_meth", "Methylated CpG"),
            ("percent_chg_meth", "Methylated CHG"),
            ("percent_chh_meth", "Methylated CHH"),
        ]
    )
    self.add_section(
        name="Cytosine Methylation",
        anchor="bismark-methylation",
        plot=beeswarm.plot(self.bismark_data["methextract"], keys, {"id": "bismark-methylation-dp"}),
    )
def mirtop_beeswarm_section(self, stat_string):
    """Add a detailed beeswarm section for one mirtop stat type."""
    log.info("Plotting " + stat_string + " section.")
    # Keep, per sample, only the metrics whose key mentions this stat type
    section_data = dict()
    for sample_name, sample_data in viewitems(self.mirtop_data):
        matching = [k for k in list(sample_data.keys()) if stat_string in k]
        section_data[sample_name] = {k: sample_data[k] for k in matching}
    # Create comprehensive beeswarm plots of all stats
    self.add_section(
        name='Read ' + stat_string + 's',
        anchor='mirtop-stats-' + stat_string,
        description="Detailed summary stats",
        plot=beeswarm.plot(section_data),
    )
def bismark_methlyation_chart(self):
    """Add a beeswarm section showing % methylation in each cytosine context."""
    # Common settings for all three percentage columns
    base = {'max': 100, 'min': 0, 'suffix': '%', 'decimalPlaces': 1}
    keys = OrderedDict()
    keys['percent_cpg_meth'] = dict(base, title='Methylated CpG')
    keys['percent_chg_meth'] = dict(base, title='Methylated CHG')
    keys['percent_chh_meth'] = dict(base, title='Methylated CHH')
    self.add_section(
        name='Cytosine Methylation',
        anchor='bismark-methylation',
        plot=beeswarm.plot(self.bismark_data['methextract'], keys, {'id': 'bismark-methylation-dp'}),
    )
def parse_samtools_stats(self):
    """ Find Samtools stats logs and parse their data.

    Populates self.samtools_stats (sample name -> {field: value}), adds
    general-stats columns, a bar graph and a beeswarm dot plot, and returns
    the number of parsed logs.
    """
    self.samtools_stats = dict()
    for f in self.find_log_files('samtools/stats'):
        parsed_data = dict()
        for line in f['f'].splitlines():
            # Only the 'SN' (summary numbers) lines are parsed
            if not line.startswith("SN"):
                continue
            sections = line.split("\t")
            # Field name: strip the trailing ':' and convert to snake_case
            field = sections[1].strip()[:-1]
            field = field.replace(' ', '_')
            value = float(sections[2].strip())
            parsed_data[field] = value
        if len(parsed_data) > 0:
            # Work out some percentages
            if 'raw_total_sequences' in parsed_data:
                # list() copy of keys: the dict is mutated inside the loop
                for k in list(parsed_data.keys()):
                    if k.startswith('reads_') and k != 'raw_total_sequences' and parsed_data['raw_total_sequences'] > 0:
                        parsed_data['{}_percent'.format(k)] = (parsed_data[k] / parsed_data['raw_total_sequences']) * 100
            if f['s_name'] in self.samtools_stats:
                log.debug("Duplicate sample name found! Overwriting: {}".format(f['s_name']))
            self.add_data_source(f, section='stats')
            self.samtools_stats[f['s_name']] = parsed_data
    # Filter to strip out ignored sample names
    self.samtools_stats = self.ignore_samples(self.samtools_stats)
    if len(self.samtools_stats) > 0:
        # Write parsed report data to a file
        self.write_data_file(self.samtools_stats, 'multiqc_samtools_stats')
        # General Stats Table
        stats_headers = OrderedDict()
        stats_headers['error_rate'] = {
            'title': 'Error rate',
            'description': 'Error rate using CIGAR',
            'min': 0,
            'max': 100,
            'suffix': '%',
            'scale': 'OrRd',
            'format': '{:,.2f}',
            # Value is stored as a fraction; shown as a percentage
            'modify': lambda x: x * 100.0
        }
        stats_headers['non-primary_alignments'] = {
            'title': '{} Non-Primary'.format(config.read_count_prefix),
            'description': 'Non-primary alignments ({})'.format(config.read_count_desc),
            'min': 0,
            'scale': 'PuBu',
            'modify': lambda x: x * config.read_count_multiplier,
            'shared_key': 'read_count'
        }
        stats_headers['reads_mapped'] = {
            'title': '{} Reads Mapped'.format(config.read_count_prefix),
            'description': 'Reads Mapped in the bam file ({})'.format(config.read_count_desc),
            'min': 0,
            'modify': lambda x: x * config.read_count_multiplier,
            'shared_key': 'read_count'
        }
        stats_headers['reads_mapped_percent'] = {
            'title': '% Mapped',
            'description': '% Mapped Reads',
            'max': 100,
            'min': 0,
            'suffix': '%',
            'scale': 'RdYlGn'
        }
        stats_headers['raw_total_sequences'] = {
            'title': '{} Total seqs'.format(config.read_count_prefix),
            'description': 'Total sequences in the bam file ({})'.format(config.read_count_desc),
            'min': 0,
            'modify': lambda x: x * config.read_count_multiplier,
            'shared_key': 'read_count'
        }
        self.general_stats_addcols(self.samtools_stats, stats_headers, 'Samtools Stats')
        # Make bargraph plot of mapped/unmapped reads
        self.alignment_section(self.samtools_stats)
        # Make dot plot of counts
        keys = OrderedDict()
        # Shared config templates for read-count and base-count columns
        reads = {
            'min': 0,
            'modify': lambda x: float(x) / 1000000.0,
            'suffix': 'M reads',
            'decimalPlaces': 2,
            'shared_key': 'read_count'
        }
        bases = {
            'min': 0,
            'modify': lambda x: float(x) / 1000000.0,
            'suffix': 'M bases',
            'decimalPlaces': 2,
            'shared_key': 'base_count'
        }
        keys['raw_total_sequences'] = dict(reads, **{'title': 'Total sequences'})
        keys['reads_mapped_and_paired'] = dict(reads, **{'title': 'Mapped & paired', 'description': 'Paired-end technology bit set + both mates mapped'})
        keys['reads_properly_paired'] = dict(reads, **{'title': 'Properly paired', 'description': 'Proper-pair bit set'})
        keys['reads_duplicated'] = dict(reads, **{'title': 'Duplicated', 'description': 'PCR or optical duplicate bit set'})
        keys['reads_QC_failed'] = dict(reads, **{'title': 'QC Failed'})
        keys['reads_MQ0'] = dict(reads, **{'title': 'Reads MQ0', 'description': 'Reads mapped and MQ=0'})
        keys['bases_mapped_(cigar)'] = dict(bases, **{'title': 'Mapped bases (cigar)', 'description': 'Mapped bases (cigar)'})
        keys['bases_trimmed'] = dict(bases, **{'title': 'Bases Trimmed'})
        keys['bases_duplicated'] = dict(bases, **{'title': 'Duplicated bases'})
        keys['pairs_on_different_chromosomes'] = dict(reads, **{'title': 'Diff chromosomes', 'description': 'Pairs on different chromosomes'})
        keys['pairs_with_other_orientation'] = dict(reads, **{'title': 'Other orientation', 'description': 'Pairs with other orientation'})
        keys['inward_oriented_pairs'] = dict(reads, **{'title': 'Inward pairs', 'description': 'Inward oriented pairs'})
        keys['outward_oriented_pairs'] = dict(reads, **{'title': 'Outward pairs', 'description': 'Outward oriented pairs'})
        self.add_section(
            name='Alignment metrics',
            anchor='samtools-stats',
            description="This module parses the output from <code>samtools stats</code>. All numbers in millions.",
            plot=beeswarm.plot(self.samtools_stats, keys, {'id': 'samtools-stats-dp'}))
    # Return the number of logs that were found
    return len(self.samtools_stats)
def add_cc_section(self, c_id, mod):
    """Add one Custom Content section to this module.

    c_id -- content ID; used for the default section name and the anchor.
    mod  -- dict with 'config' (user options) and 'data' (plot data / HTML).
    """
    section_name = mod["config"].get("section_name", c_id.replace("_", " ").title())
    if section_name == "" or section_name is None:
        section_name = "Custom Content"
    section_description = mod["config"].get("description", "")
    pconfig = mod["config"].get("pconfig", {})
    if pconfig.get("title") is None:
        pconfig["title"] = section_name
    plot = None
    content = None
    # Look the plot type up once instead of on every branch
    plot_type = mod["config"].get("plot_type")
    # Table
    if plot_type == "table":
        pconfig["sortRows"] = pconfig.get("sortRows", False)
        headers = mod["config"].get("headers")
        plot = table.plot(mod["data"], headers, pconfig)
        self.write_data_file(mod["data"], "multiqc_{}".format(section_name.lower().replace(" ", "_")))
    # Bar plot
    elif plot_type == "bargraph":
        plot = bargraph.plot(mod["data"], mod["config"].get("categories"), pconfig)
    # Line plot
    elif plot_type == "linegraph":
        plot = linegraph.plot(mod["data"], pconfig)
    # Scatter plot
    elif plot_type == "scatter":
        plot = scatter.plot(mod["data"], pconfig)
    # Heatmap
    elif plot_type == "heatmap":
        plot = heatmap.plot(mod["data"], mod["config"].get("xcats"), mod["config"].get("ycats"), pconfig)
    # Beeswarm plot
    elif plot_type == "beeswarm":
        plot = beeswarm.plot(mod["data"], pconfig)
    # Raw HTML
    elif plot_type == "html":
        content = mod["data"]
    # Raw image file as html
    elif plot_type == "image":
        content = mod["data"]
    # Not supplied
    elif plot_type is None:  # FIX: 'is None' rather than '== None' (PEP 8)
        log.warning("Plot type not found for content ID '{}'".format(c_id))
    # Not recognised
    else:
        log.warning("Error - custom content plot type '{}' not recognised for content ID {}".format(plot_type, c_id))
    # Don't use exactly the same title / description text as the main module
    if section_name == self.name:
        section_name = None
    if section_description == self.info:
        section_description = ""
    self.add_section(name=section_name, anchor=c_id, description=section_description, plot=plot, content=content)
def add_cc_section(self, c_id, mod):
    """Add one Custom Content section to this module.

    c_id -- content ID; used for the default section name and the anchor.
    mod  -- dict with 'config' (user options) and 'data' (plot data / HTML).
    """
    section_name = mod['config'].get('section_name', c_id.replace('_', ' ').title())
    if section_name == '' or section_name is None:
        section_name = 'Custom Content'
    section_description = mod['config'].get('description', '')
    pconfig = mod['config'].get('pconfig', {})
    if pconfig.get('title') is None:
        pconfig['title'] = section_name
    plot = None
    content = None
    # Look the plot type up once instead of on every branch
    plot_type = mod['config'].get('plot_type')
    # Table
    if plot_type == 'table':
        pconfig['sortRows'] = pconfig.get('sortRows', False)
        headers = mod['config'].get('headers')
        plot = table.plot(mod['data'], headers, pconfig)
        self.write_data_file(mod['data'], "multiqc_{}".format(section_name.lower().replace(' ', '_')))
    # Bar plot
    elif plot_type == 'bargraph':
        plot = bargraph.plot(mod['data'], mod['config'].get('categories'), pconfig)
    # Line plot
    elif plot_type == 'linegraph':
        plot = linegraph.plot(mod['data'], pconfig)
    # Scatter plot
    elif plot_type == 'scatter':
        plot = scatter.plot(mod['data'], pconfig)
    # Heatmap
    elif plot_type == 'heatmap':
        plot = heatmap.plot(mod['data'], mod['config'].get('xcats'), mod['config'].get('ycats'), pconfig)
    # Beeswarm plot
    elif plot_type == 'beeswarm':
        plot = beeswarm.plot(mod['data'], pconfig)
    # Raw HTML
    elif plot_type == 'html':
        content = mod['data']
    # Raw image file as html
    elif plot_type == 'image':
        content = mod['data']
    # Not supplied
    elif plot_type is None:  # FIX: 'is None' rather than '== None' (PEP 8)
        log.warning("Plot type not found for content ID '{}'".format(c_id))
    # Not recognised
    else:
        log.warning("Error - custom content plot type '{}' not recognised for content ID {}".format(plot_type, c_id))
    # Don't use exactly the same title / description text as the main module
    if section_name == self.name:
        section_name = None
    if section_description == self.info:
        section_description = ''
    self.add_section(name=section_name, anchor=c_id, description=section_description, plot=plot, content=content)
def parse_samtools_stats(self):
    """Find Samtools stats logs and parse their data.

    Populates self.samtools_stats (sample name -> {field: value}), adds
    general-stats columns, a bar graph and a beeswarm dot plot, and returns
    the number of parsed logs.
    """
    self.samtools_stats = dict()
    for f in self.find_log_files("samtools/stats"):
        parsed_data = dict()
        for line in f["f"].splitlines():
            # Only the 'SN' (summary numbers) lines are parsed
            if not line.startswith("SN"):
                continue
            sections = line.split("\t")
            # Field name: strip the trailing ':' and convert to snake_case
            field = sections[1].strip()[:-1]
            field = field.replace(" ", "_")
            value = float(sections[2].strip())
            parsed_data[field] = value
        if len(parsed_data) > 0:
            # Work out some percentages
            if "raw_total_sequences" in parsed_data:
                # list() copy of keys: the dict is mutated inside the loop
                for k in list(parsed_data.keys()):
                    if (
                        k.startswith("reads_")
                        and k != "raw_total_sequences"
                        and parsed_data["raw_total_sequences"] > 0
                    ):
                        parsed_data["{}_percent".format(k)] = (
                            parsed_data[k] / parsed_data["raw_total_sequences"]
                        ) * 100
            if f["s_name"] in self.samtools_stats:
                log.debug("Duplicate sample name found! Overwriting: {}".format(f["s_name"]))
            self.add_data_source(f, section="stats")
            self.samtools_stats[f["s_name"]] = parsed_data
    # Filter to strip out ignored sample names
    self.samtools_stats = self.ignore_samples(self.samtools_stats)
    if len(self.samtools_stats) > 0:
        # Write parsed report data to a file
        self.write_data_file(self.samtools_stats, "multiqc_samtools_stats")
        # General Stats Table
        stats_headers = OrderedDict()
        stats_headers["error_rate"] = {
            "title": "Error rate",
            "description": "Error rate: mismatches (NM) / bases mapped (CIGAR)",
            "min": 0,
            "max": 100,
            "suffix": "%",
            "scale": "OrRd",
            "format": "{:,.2f}",
            # Value is stored as a fraction; shown as a percentage
            "modify": lambda x: x * 100.0,
        }
        stats_headers["non-primary_alignments"] = {
            "title": "{} Non-Primary".format(config.read_count_prefix),
            "description": "Non-primary alignments ({})".format(config.read_count_desc),
            "min": 0,
            "scale": "PuBu",
            "modify": lambda x: x * config.read_count_multiplier,
            "shared_key": "read_count",
        }
        stats_headers["reads_mapped"] = {
            "title": "{} Reads Mapped".format(config.read_count_prefix),
            "description": "Reads Mapped in the bam file ({})".format(config.read_count_desc),
            "min": 0,
            "modify": lambda x: x * config.read_count_multiplier,
            "shared_key": "read_count",
        }
        stats_headers["reads_mapped_percent"] = {
            "title": "% Mapped",
            "description": "% Mapped Reads",
            "max": 100,
            "min": 0,
            "suffix": "%",
            "scale": "RdYlGn",
        }
        stats_headers["reads_properly_paired_percent"] = {
            "title": "% Proper Pairs",
            "description": "% Properly Paired Reads",
            "max": 100,
            "min": 0,
            "suffix": "%",
            "scale": "RdYlGn",
            # Hide the column when no sample has any paired reads.
            # NOTE(review): max() assumes every sample parsed a
            # 'reads_mapped_and_paired' value - KeyError otherwise; confirm.
            "hidden": True
            if (max([x["reads_mapped_and_paired"] for x in self.samtools_stats.values()]) == 0)
            else False,
        }
        stats_headers["reads_MQ0_percent"] = {
            "title": "% MapQ 0 Reads",
            "description": "% of Reads that are Ambiguously Placed (MapQ=0)",
            "max": 100,
            "min": 0,
            "suffix": "%",
            "scale": "OrRd",
            "hidden": True,
        }
        stats_headers["raw_total_sequences"] = {
            "title": "{} Total seqs".format(config.read_count_prefix),
            "description": "Total sequences in the bam file ({})".format(config.read_count_desc),
            "min": 0,
            "modify": lambda x: x * config.read_count_multiplier,
            "shared_key": "read_count",
        }
        self.general_stats_addcols(self.samtools_stats, stats_headers)
        # Make bargraph plot of mapped/unmapped reads
        self.alignment_section(self.samtools_stats)
        # Make dot plot of counts
        keys = OrderedDict()
        # Shared config templates for read-count and base-count columns
        reads = {
            "min": 0,
            "modify": lambda x: float(x) / 1000000.0,
            "suffix": "M reads",
            "decimalPlaces": 2,
            "shared_key": "read_count",
        }
        bases = {
            "min": 0,
            "modify": lambda x: float(x) / 1000000.0,
            "suffix": "M bases",
            "decimalPlaces": 2,
            "shared_key": "base_count",
        }
        keys["raw_total_sequences"] = dict(reads, **{"title": "Total sequences"})
        keys["reads_mapped_and_paired"] = dict(
            reads, **{"title": "Mapped & paired", "description": "Paired-end technology bit set + both mates mapped"}
        )
        keys["reads_properly_paired"] = dict(
            reads, **{"title": "Properly paired", "description": "Proper-pair bit set"}
        )
        keys["reads_duplicated"] = dict(
            reads, **{"title": "Duplicated", "description": "PCR or optical duplicate bit set"}
        )
        keys["reads_QC_failed"] = dict(reads, **{"title": "QC Failed"})
        keys["reads_MQ0"] = dict(reads, **{"title": "Reads MQ0", "description": "Reads mapped and MQ=0"})
        keys["bases_mapped_(cigar)"] = dict(
            bases, **{"title": "Mapped bases (CIGAR)", "description": "Mapped bases (CIGAR)"}
        )
        keys["bases_trimmed"] = dict(bases, **{"title": "Bases Trimmed"})
        keys["bases_duplicated"] = dict(bases, **{"title": "Duplicated bases"})
        keys["pairs_on_different_chromosomes"] = dict(
            reads, **{"title": "Diff chromosomes", "description": "Pairs on different chromosomes"}
        )
        keys["pairs_with_other_orientation"] = dict(
            reads, **{"title": "Other orientation", "description": "Pairs with other orientation"}
        )
        keys["inward_oriented_pairs"] = dict(
            reads, **{"title": "Inward pairs", "description": "Inward oriented pairs"}
        )
        keys["outward_oriented_pairs"] = dict(
            reads, **{"title": "Outward pairs", "description": "Outward oriented pairs"}
        )
        self.add_section(
            name="Alignment metrics",
            anchor="samtools-stats",
            description="This module parses the output from <code>samtools stats</code>. All numbers in millions.",
            plot=beeswarm.plot(self.samtools_stats, keys, {"id": "samtools-stats-dp"}),
        )
    # Return the number of logs that were found
    return len(self.samtools_stats)
def parse_samtools_flagstats(self):
    """ Find Samtools flagstat logs and parse their data.

    Populates self.samtools_flagstat (sample name -> parsed fields), adds a
    general-stats column and a beeswarm section, and returns the number of
    parsed logs.
    """
    self.samtools_flagstat = dict()
    for f in self.find_log_files(config.sp['samtools']['flagstat']):
        # Per-file parsing is delegated to the module-level helper
        parsed_data = parse_single_report(f['f'])
        if len(parsed_data) > 0:
            if f['s_name'] in self.samtools_flagstat:
                log.debug("Duplicate sample name found! Overwriting: {}".format(f['s_name']))
            self.add_data_source(f, section='flagstat')
            self.samtools_flagstat[f['s_name']] = parsed_data
    if len(self.samtools_flagstat) > 0:
        # Write parsed report data to a file (restructure first)
        self.write_data_file(self.samtools_flagstat, 'multiqc_samtools_flagstat')
        # General Stats Table
        flagstats_headers = dict()
        flagstats_headers['mapped_passed'] = {
            'title': 'M Reads Mapped',
            'description': 'Reads Mapped in the bam file',
            'min': 0,
            # Display in millions of reads
            'modify': lambda x: x / 1000000,
            'shared_key': 'read_count'
        }
        self.general_stats_addcols(self.samtools_flagstat, flagstats_headers, 'Samtools Flagstat')
        # Make dot plot of counts
        keys = OrderedDict()
        # Shared column template: read counts shown in millions
        reads = {
            'min': 0,
            'modify': lambda x: float(x) / 1000000.0,
            'suffix': 'M reads',
            'decimalPlaces': 2,
            'shared_key': 'read_count'
        }
        keys['flagstat_total'] = dict(reads, title='Total Reads')
        keys['total_passed'] = dict(reads, title='Total Passed QC')
        keys['mapped_passed'] = dict(reads, title='Mapped')
        # Only show secondary/supplementary columns when at least one sample
        # has a non-zero (truthy) value for them
        if any(v.get('secondary_passed') for v in self.samtools_flagstat.values()):
            keys['secondary_passed'] = dict(reads, title='Secondary Alignments')
        if any(v.get('supplementary_passed') for v in self.samtools_flagstat.values()):
            keys['supplementary_passed'] = dict(reads, title='Supplementary Alignments')
        keys['duplicates_passed'] = dict(reads, title='Duplicates')
        keys['paired in sequencing_passed'] = dict(reads, title='Paired in Sequencing')
        keys['properly paired_passed'] = dict(reads, title='Properly Paired')
        keys['with itself and mate mapped_passed'] = \
            dict(reads, title='Self and mate mapped', description='Reads with itself and mate mapped')
        keys['singletons_passed'] = dict(reads, title='Singletons')
        keys['with mate mapped to a different chr_passed'] = \
            dict(reads, title='Mate mapped to diff chr', description='Mate mapped to different chromosome')
        keys['with mate mapped to a different chr (mapQ >= 5)_passed'] = \
            dict(reads, title='Diff chr (mapQ >= 5)', description='Mate mapped to different chromosome (mapQ >= 5)')
        # Older MultiQC section API: append raw HTML + plot markup
        self.sections.append({
            'name': 'Samtools Flagstat',
            'anchor': 'samtools-flagstat',
            'content': '<p>This module parses the output from <code>samtools flagstat</code>. All numbers in millions.</p>' + beeswarm.plot(self.samtools_flagstat, keys, {'id': 'samtools-flagstat-dp'})
        })
    # Return the number of logs that were found
    return len(self.samtools_flagstat)
def parse_reports(self):
    """ Find RSeQC bam_stat reports and parse their data.

    Populates self.bam_stat_data (sample name -> {field: int}), adds
    general-stats columns (paired-end data only) and a beeswarm section,
    and returns the number of parsed reports.
    """
    # Set up vars
    self.bam_stat_data = dict()
    regexes = {
        'total_records': r"Total records:\s*(\d+)",
        'qc_failed': r"QC failed:\s*(\d+)",
        'optical_pcr_duplicate': r"Optical/PCR duplicate:\s*(\d+)",
        'non_primary_hits': r"Non primary hits\s*(\d+)",
        'unmapped_reads': r"Unmapped reads:\s*(\d+)",
        'mapq_lt_mapq_cut_non-unique': r"mapq < mapq_cut \(non-unique\):\s*(\d+)",
        'mapq_gte_mapq_cut_unique': r"mapq >= mapq_cut \(unique\):\s*(\d+)",
        'read_1': r"Read-1:\s*(\d+)",
        'read_2': r"Read-2:\s*(\d+)",
        'reads_map_to_sense': r"Reads map to '\+':\s*(\d+)",
        'reads_map_to_antisense': r"Reads map to '-':\s*(\d+)",
        'non-splice_reads': r"Non-splice reads:\s*(\d+)",
        'splice_reads': r"Splice reads:\s*(\d+)",
        'reads_mapped_in_proper_pairs': r"Reads mapped in proper pairs:\s*(\d+)",
        'proper-paired_reads_map_to_different_chrom': r"Proper-paired reads map to different chrom:\s*(\d+)",
    }
    # Initiate PE check
    is_paired_end = False
    # Go through files and parse data using regexes
    for f in self.find_log_files('rseqc/bam_stat'):
        d = dict()
        for k, r in regexes.items():
            r_search = re.search(r, f['f'], re.MULTILINE)
            if r_search:
                d[k] = int(r_search.group(1))
        # Calculate some percentages
        if 'total_records' in d:
            t = float(d['total_records'])
            if 'mapq_gte_mapq_cut_unique' in d:
                d['unique_percent'] = (float(d['mapq_gte_mapq_cut_unique']) / t) * 100.0
            if 'reads_mapped_in_proper_pairs' in d:
                d['proper_pairs_percent'] = (float(d['reads_mapped_in_proper_pairs']) / t) * 100.0
        if len(d) > 0:
            if f['s_name'] in self.bam_stat_data:
                log.debug("Duplicate sample name found! Overwriting: {}".format(f['s_name']))
            self.add_data_source(f, section='bam_stat')
            # Check if SE or PE.
            # FIX: use .get() - the 'Read-2' line may be missing entirely from
            # single-end reports, which previously raised KeyError.
            if d.get('read_2', 0) != 0:
                is_paired_end = True
            self.bam_stat_data[f['s_name']] = d
    # Filter to strip out ignored sample names
    self.bam_stat_data = self.ignore_samples(self.bam_stat_data)
    if len(self.bam_stat_data) > 0:
        # Write to file
        self.write_data_file(self.bam_stat_data, 'multiqc_rseqc_bam_stat')
        # Add to general stats table
        self.general_stats_headers['proper_pairs_percent'] = {
            'title': '% Proper Pairs',
            'description': '% Reads mapped in proper pairs',
            'max': 100,
            'min': 0,
            'suffix': '%',
            'scale': 'RdYlGn'
        }
        for s_name in self.bam_stat_data:
            if s_name not in self.general_stats_data:
                self.general_stats_data[s_name] = dict()
            # Only write if PE, i.e. there is something to write
            if is_paired_end:
                self.general_stats_data[s_name].update(self.bam_stat_data[s_name])
        # Make dot plot of counts
        pconfig = {'id': 'rseqc_bam_stat'}
        keys = OrderedDict()
        # Shared column template: counts shown in millions
        defaults = {
            'min': 0,
            'shared_key': 'read_count',
            'decimalPlaces': 2,
            'modify': lambda x: float(x) / 1000000.0,
        }
        keys['total_records'] = dict(defaults, **{'title': 'Total records'})
        keys['qc_failed'] = dict(defaults, **{'title': 'QC failed'})
        keys['optical_pcr_duplicate'] = dict(defaults, **{'title': 'Duplicates', 'description': 'Optical/PCR duplicate'})
        keys['non_primary_hits'] = dict(defaults, **{'title': 'Non primary hit'})
        keys['unmapped_reads'] = dict(defaults, **{'title': 'Unmapped', 'description': 'Unmapped reads'})
        # FIX: key must match the parsed data key ('mapq_lt_mapq_cut_non-unique');
        # the truncated 'mapq_lt_mapq_cut_non' never matched any data, so the
        # column was always empty.
        keys['mapq_lt_mapq_cut_non-unique'] = dict(defaults, **{'title': 'Non-unique', 'description': 'mapq < mapq_cut (non-unique)'})
        keys['mapq_gte_mapq_cut_unique'] = dict(defaults, **{'title': 'Unique', 'description': 'mapq >= mapq_cut (unique)'})
        if is_paired_end:
            keys['read_1'] = dict(defaults, **{'title': 'Read-1'})
            keys['read_2'] = dict(defaults, **{'title': 'Read-2'})
        keys['reads_map_to_sense'] = dict(defaults, **{'title': '+ve strand', 'description': "Reads map to '+'"})
        keys['reads_map_to_antisense'] = dict(defaults, **{'title': '-ve strand', 'description': "Reads map to '-'"})
        keys['non-splice_reads'] = dict(defaults, **{'title': 'Non-splice reads'})
        keys['splice_reads'] = dict(defaults, **{'title': 'Splice reads'})
        if is_paired_end:
            keys['reads_mapped_in_proper_pairs'] = dict(defaults, **{'title': 'Proper pairs', 'description': 'Reads mapped in proper pairs'})
            keys['proper-paired_reads_map_to_different_chrom'] = dict(defaults, **{'title': 'Different chrom', 'description': 'Proper-paired reads map to different chrom'})
        self.add_section(name='Bam Stat',
                         anchor='rseqc-bam_stat',
                         description='All numbers reported in millions.',
                         plot=beeswarm.plot(self.bam_stat_data, keys, pconfig))
    # Return number of samples found
    return len(self.bam_stat_data)
def parse_reports(self):
    """ Find bamtools stats reports and parse their data.

    Scans log files matching 'bamtools/stats', extracts counts and
    percentages with regexes into self.bamtools_stats_data (sample name ->
    metric dict), writes the parsed table to disk, adds '% Duplicates' and
    '% Mapped' to the General Stats table, and draws a beeswarm dot plot.

    Returns the number of samples found.
    """
    # Set up vars
    self.bamtools_stats_data = dict()
    # One regex per metric. The *_pct variants re-match the same log line
    # but capture the percentage bamtools prints in parentheses.
    regexes = {
        'total_reads': r"Total reads:\s*(\d+)",
        'mapped_reads': r"Mapped reads:\s*(\d+)",
        'mapped_reads_pct': r"Mapped reads:\s*\d+\s+\(([\d\.]+)%\)",
        'forward_strand': r"Forward strand:\s*(\d+)",
        'forward_strand_pct': r"Forward strand:\s*\d+\s+\(([\d\.]+)%\)",
        'reverse_strand': r"Reverse strand:\s*(\d+)",
        'reverse_strand_pct': r"Reverse strand:\s*\d+\s+\(([\d\.]+)%\)",
        'failed_qc': r"Failed QC:\s*(\d+)",
        'failed_qc_pct': r"Failed QC:\s*\d+\s+\(([\d\.]+)%\)",
        'duplicates': r"Duplicates:\s*(\d+)",
        'duplicates_pct': r"Duplicates:\s*\d+\s+\(([\d\.]+)%\)",
        'paired_end': r"Paired-end reads:\s*(\d+)",
        'paired_end_pct': r"Paired-end reads:\s*\d+\s+\(([\d\.]+)%\)",
        'proper_pairs': r"'Proper-pairs'\s*(\d+)",
        'proper_pairs_pct': r"'Proper-pairs'\s*\d+\s+\(([\d\.]+)%\)",
        'both_mapped': r"Both pairs mapped:\s*(\d+)",
        'both_mapped_pct': r"Both pairs mapped:\s*\d+\s+\(([\d\.]+)%\)",
        'read_1': r"Read 1:\s*(\d+)",
        'read_2': r"Read 2:\s*(\d+)",
        'singletons': r"Singletons:\s*(\d+)",
        'singletons_pct': r"Singletons:\s*\d+\s+\(([\d\.]+)%\)",
    }
    # Go through files and parse data using regexes
    for f in self.find_log_files('bamtools/stats'):
        d = dict()
        for k, r in regexes.items():
            r_search = re.search(r, f['f'], re.MULTILINE)
            if r_search:
                d[k] = float(r_search.group(1))
        # Only keep files where at least one metric matched
        if len(d) > 0:
            if f['s_name'] in self.bamtools_stats_data:
                log.debug("Duplicate sample name found! Overwriting: {}".format(f['s_name']))
            self.add_data_source(f, section='stats')
            self.bamtools_stats_data[f['s_name']] = d

    # Filter to strip out ignored sample names
    self.bamtools_stats_data = self.ignore_samples(self.bamtools_stats_data)

    if len(self.bamtools_stats_data) > 0:
        # Write to file
        self.write_data_file(self.bamtools_stats_data, 'multiqc_bamtools_stats')

        # Add to general stats table
        self.general_stats_headers['duplicates_pct'] = {
            'title': '% Duplicates',
            'description': '% Duplicate Reads',
            'max': 100,
            'min': 0,
            'suffix': '%',
            'scale': 'OrRd'
        }
        self.general_stats_headers['mapped_reads_pct'] = {
            'title': '% Mapped',
            'description': '% Mapped Reads',
            'max': 100,
            'min': 0,
            'suffix': '%',
            'scale': 'RdYlGn'
        }
        for s_name in self.bamtools_stats_data:
            if s_name not in self.general_stats_data:
                self.general_stats_data[s_name] = dict()
            self.general_stats_data[s_name].update(self.bamtools_stats_data[s_name])

        # Make dot plot of counts.
        # 'defaults' is shared config for percentage columns; 'num_defaults'
        # rescales raw counts to millions for the read-count columns.
        keys = OrderedDict()
        defaults = {
            'min': 0,
            'max': 100,
            'decimalPlaces': 2,
            'suffix': '%'
        }
        num_defaults = {
            'min': 0,
            'modify': lambda x: float(x) / 1000000.0,
            'decimalPlaces': 2
        }
        keys['total_reads'] = dict(num_defaults, **{'title': 'Total reads', 'description': 'Total reads (millions)'});
        keys['mapped_reads_pct'] = dict(defaults, **{'title': 'Mapped reads'})
        keys['forward_strand_pct'] = dict(defaults, **{'title': 'Forward strand'})
        keys['reverse_strand_pct'] = dict(defaults, **{'title': 'Reverse strand'})
        keys['failed_qc_pct'] = dict(defaults, **{'title': 'Failed QC'})
        keys['duplicates_pct'] = dict(defaults, **{'title': 'Duplicates'})
        keys['paired_end_pct'] = dict(defaults, **{'title': 'Paired-end', 'description': 'Paired-end reads'})
        keys['proper_pairs_pct'] = dict(defaults, **{'title': 'Proper-pairs'})
        keys['both_mapped_pct'] = dict(defaults, **{'title': 'Both mapped', 'description': 'Both pairs mapped'})
        keys['read_1'] = dict(num_defaults, **{'title': 'Read 1', 'description': 'Read 1 (millions)'});
        keys['read_2'] = dict(num_defaults, **{'title': 'Read 2', 'description': 'Read 2 (millions)'});
        keys['singletons_pct'] = dict(defaults, **{'title': 'Singletons'})

        self.add_section(name='Bamtools Stats', anchor='bamtools-stats', plot=beeswarm.plot(self.bamtools_stats_data, keys))

    # Return number of samples found
    return len(self.bamtools_stats_data)
def parse_reports(self):
    """Find bamtools stats reports and parse their data.

    Scans log files matching "bamtools/stats", extracts counts and
    percentages with regexes into ``self.bamtools_stats_data`` (sample name
    -> metric dict), writes the parsed table to disk, adds '% Duplicates'
    and '% Mapped' columns to the General Stats table, and renders a
    beeswarm dot plot of per-sample counts.

    Returns:
        int: number of samples with parsed data.
    """
    # Set up vars
    self.bamtools_stats_data = dict()
    # One regex per metric; the *_pct variants re-match the same log line
    # but capture the percentage bamtools prints in parentheses.
    regexes = {
        "total_reads": r"Total reads:\s*(\d+)",
        "mapped_reads": r"Mapped reads:\s*(\d+)",
        "mapped_reads_pct": r"Mapped reads:\s*\d+\s+\(([\d\.]+)%\)",
        "forward_strand": r"Forward strand:\s*(\d+)",
        "forward_strand_pct": r"Forward strand:\s*\d+\s+\(([\d\.]+)%\)",
        "reverse_strand": r"Reverse strand:\s*(\d+)",
        "reverse_strand_pct": r"Reverse strand:\s*\d+\s+\(([\d\.]+)%\)",
        "failed_qc": r"Failed QC:\s*(\d+)",
        "failed_qc_pct": r"Failed QC:\s*\d+\s+\(([\d\.]+)%\)",
        "duplicates": r"Duplicates:\s*(\d+)",
        "duplicates_pct": r"Duplicates:\s*\d+\s+\(([\d\.]+)%\)",
        "paired_end": r"Paired-end reads:\s*(\d+)",
        "paired_end_pct": r"Paired-end reads:\s*\d+\s+\(([\d\.]+)%\)",
        "proper_pairs": r"'Proper-pairs'\s*(\d+)",
        "proper_pairs_pct": r"'Proper-pairs'\s*\d+\s+\(([\d\.]+)%\)",
        "both_mapped": r"Both pairs mapped:\s*(\d+)",
        "both_mapped_pct": r"Both pairs mapped:\s*\d+\s+\(([\d\.]+)%\)",
        "read_1": r"Read 1:\s*(\d+)",
        "read_2": r"Read 2:\s*(\d+)",
        "singletons": r"Singletons:\s*(\d+)",
        "singletons_pct": r"Singletons:\s*\d+\s+\(([\d\.]+)%\)",
    }
    # Go through files and parse data using regexes
    for f in self.find_log_files("bamtools/stats"):
        d = dict()
        for k, r in regexes.items():
            r_search = re.search(r, f["f"], re.MULTILINE)
            if r_search:
                d[k] = float(r_search.group(1))
        # Only keep files where at least one metric matched
        if len(d) > 0:
            if f["s_name"] in self.bamtools_stats_data:
                log.debug("Duplicate sample name found! Overwriting: {}".format(f["s_name"]))
            self.add_data_source(f, section="stats")
            self.bamtools_stats_data[f["s_name"]] = d

    # Filter to strip out ignored sample names
    self.bamtools_stats_data = self.ignore_samples(self.bamtools_stats_data)

    if len(self.bamtools_stats_data) > 0:
        # Write to file
        self.write_data_file(self.bamtools_stats_data, "multiqc_bamtools_stats")

        # Add to general stats table
        self.general_stats_headers["duplicates_pct"] = {
            "title": "% Duplicates",
            "description": "% Duplicate Reads",
            "max": 100,
            "min": 0,
            "suffix": "%",
            "scale": "OrRd",
        }
        self.general_stats_headers["mapped_reads_pct"] = {
            "title": "% Mapped",
            "description": "% Mapped Reads",
            "max": 100,
            "min": 0,
            "suffix": "%",
            "scale": "RdYlGn",
        }
        for s_name in self.bamtools_stats_data:
            if s_name not in self.general_stats_data:
                self.general_stats_data[s_name] = dict()
            self.general_stats_data[s_name].update(self.bamtools_stats_data[s_name])

        # Make dot plot of counts.
        # 'defaults' is shared config for percentage columns; 'num_defaults'
        # rescales raw counts to millions for the read-count columns.
        keys = OrderedDict()
        defaults = {"min": 0, "max": 100, "decimalPlaces": 2, "suffix": "%"}
        num_defaults = {"min": 0, "modify": lambda x: float(x) / 1000000.0, "decimalPlaces": 2}
        keys["total_reads"] = dict(num_defaults, **{"title": "Total reads", "description": "Total reads (millions)"})
        keys["mapped_reads_pct"] = dict(defaults, **{"title": "Mapped reads"})
        keys["forward_strand_pct"] = dict(defaults, **{"title": "Forward strand"})
        keys["reverse_strand_pct"] = dict(defaults, **{"title": "Reverse strand"})
        keys["failed_qc_pct"] = dict(defaults, **{"title": "Failed QC"})
        keys["duplicates_pct"] = dict(defaults, **{"title": "Duplicates"})
        keys["paired_end_pct"] = dict(defaults, **{"title": "Paired-end", "description": "Paired-end reads"})
        keys["proper_pairs_pct"] = dict(defaults, **{"title": "Proper-pairs"})
        keys["both_mapped_pct"] = dict(defaults, **{"title": "Both mapped", "description": "Both pairs mapped"})
        # BUGFIX: plot keys must match the parsed-data keys produced by the
        # regexes above ("read_1"/"read_2"). The previous "bt_read_1"/
        # "bt_read_2" ids matched nothing, so both columns rendered empty.
        keys["read_1"] = dict(num_defaults, **{"title": "Read 1", "description": "Read 1 (millions)"})
        keys["read_2"] = dict(num_defaults, **{"title": "Read 2", "description": "Read 2 (millions)"})
        keys["singletons_pct"] = dict(defaults, **{"title": "Singletons"})

        self.add_section(
            name="Bamtools Stats", anchor="bamtools-stats", plot=beeswarm.plot(self.bamtools_stats_data, keys)
        )

    # Return number of samples found
    return len(self.bamtools_stats_data)
def main():
    """Read a CIDR QC Excel report and generate a MultiQC-style beeswarm plot.

    Parses command-line arguments, opens an append-mode run log, loads the
    selected QC columns from the Excel file via import_cidr_stats(), builds
    a beeswarm plot from them, then writes completion messages to the log.
    """
    #
    # Usage statement
    #
    parseStr = 'Reads an excel spreadsheet of CIDR derived QC data and creates a multiqc-report like swarm plot.\n\n\
Usage:\n\
csi_cidr_stats.py -i qc_report_file -o output_html_file \n\n\
Example:\n\
csi_cidr_stats.py -i Holland_Release_Set_10_QC_Report.xlsx -o csi_cidr_stats.html\n'

    parser = argparse.ArgumentParser(description=parseStr, formatter_class=RawTextHelpFormatter)
    parser.add_argument(
        '-i',
        '--infile',
        required=True,
        nargs='?',
        type=argparse.FileType('r'),
        default=None,
        help='Input CIDR QC Report Excel file, e.g. "Holland_Release_Set_10_QC_Report.xlsx"')
    parser.add_argument(
        '-o',
        '--outfile',
        required=False,
        nargs='?',
        type=argparse.FileType('w'),
        default=None,
        help='Output QC Report HTML file, e.g. "Batch10_stats.html"')
    parser.add_argument('-t',
                        '--testmode',
                        required=False,
                        action='store_true',
                        default=False,
                        help='Run in test mode')
    args = parser.parse_args()
    infile = args.infile
    # NOTE(review): outfile and testmode are parsed but never used below —
    # presumably intended for the HTML-writing step; confirm against callers.
    outfile = args.outfile
    testmode = args.testmode

    #####################################
    #
    # Set up the variables and the log file
    #
    #####################################
    # Set up the log file
    # NOTE(review): thedate is computed but never used in this function —
    # possibly intended for a dated log filename; confirm.
    thedate = str(datetime.datetime.now()).split()[0]
    thedate = re.sub("-", "", thedate)

    # The log file handle is deliberately global so that helper functions
    # (e.g. send_update) can write to it.
    global log
    log = open('csi_cidr_stats' + '.log', 'a')
    log.write('\n' + str(datetime.datetime.now()) + '\n')
    log.write(' '.join(sys.argv) + '\n')
    log.write('csi_cidr_stats.py version ' + __version__ + '\n\n')
    log.flush()

    ####################################
    #
    # Import Excel file
    #
    ####################################
    # QC metric columns to pull out of the CIDR spreadsheet
    theColumns = [
        'SM_TAG', 'VERIFYBAM_AVG_DP', 'TOTAL_READS',
        'PCT_PF_READS_ALIGNED_PAIR', 'PF_HQ_ERROR_RATE_PAIR',
        'PF_HQ_ALIGNED_Q20_BASES_PAIR', 'UNMAPPED_READS',
        'MEAN_TARGET_COVERAGE', 'ZERO_CVG_TARGETS_PCT', 'PCT_EXC_MAPQ',
        'PCT_EXC_BASEQ', 'PCT_TARGET_BASES_1X', 'PCT_TARGET_BASES_2X',
        'PCT_TARGET_BASES_10X', 'PCT_TARGET_BASES_20X',
        'PCT_TARGET_BASES_30X', 'PCT_TARGET_BASES_40X',
        'PCT_TARGET_BASES_50X', 'PCT_TARGET_BASES_100X'
    ]
    qcDict = import_cidr_stats(infile, theColumns)
    plot = beeswarm.plot(qcDict)
    # NOTE(review): only the plot object's type is printed here; the plot is
    # not yet written to `outfile` — looks like work in progress.
    print(type(plot))

    ######################################
    #
    # Close out and clean up
    #
    ######################################
    send_update("\ncsi_cidr_stats.py successfully completed", log)
    send_update(str(datetime.datetime.now()) + '\n', log)
    log.close()
def parse_samtools_stats(self):
    """Find Samtools stats logs and parse their data.

    Reads the ``SN`` (summary numbers) lines of each ``samtools stats``
    log into ``self.samtools_stats`` (sample name -> field dict), derives
    percentage fields, writes the parsed data to disk, populates the
    General Stats table, and renders an alignment bar graph plus a
    beeswarm dot plot of counts.

    Returns:
        int: number of logs found.
    """
    self.samtools_stats = dict()
    for f in self.find_log_files('samtools/stats'):
        parsed_data = dict()
        for line in f['f'].splitlines():
            # Only the summary-number ("SN") lines carry the metrics we want
            if not line.startswith("SN"):
                continue
            sections = line.split("\t")
            field = sections[1].strip()[:-1]  # drop trailing ':'
            field = field.replace(' ', '_')
            value = float(sections[2].strip())
            parsed_data[field] = value
        if len(parsed_data) > 0:
            # Work out some percentages
            if 'raw_total_sequences' in parsed_data:
                for k in list(parsed_data.keys()):
                    if k.startswith('reads_') and k != 'raw_total_sequences' and parsed_data['raw_total_sequences'] > 0:
                        parsed_data['{}_percent'.format(k)] = (parsed_data[k] / parsed_data['raw_total_sequences']) * 100
                # BUGFIX: guard against truncated logs missing either field —
                # previously this raised an uncaught KeyError.
                if 'reads_mapped' in parsed_data and 'non-primary_alignments' in parsed_data:
                    total_alignments = parsed_data['reads_mapped'] + parsed_data['non-primary_alignments']
                    if total_alignments > 0:
                        parsed_data['non-primary_alignments_percent'] = (parsed_data['non-primary_alignments'] / total_alignments) * 100
            if f['s_name'] in self.samtools_stats:
                log.debug("Duplicate sample name found! Overwriting: {}".format(f['s_name']))
            self.add_data_source(f, section='stats')
            self.samtools_stats[f['s_name']] = parsed_data

    # Filter to strip out ignored sample names
    self.samtools_stats = self.ignore_samples(self.samtools_stats)

    # '{:,.1f} M'-style read counts; plain integers when counts are unscaled
    self.read_format = '{:,.1f} ' + config.read_count_prefix
    if config.read_count_multiplier == 1:
        self.read_format = '{:,.0f}'

    if len(self.samtools_stats) > 0:
        # Write parsed report data to a file
        self.write_data_file(self.samtools_stats, 'multiqc_samtools_stats')

        # General Stats Table
        stats_headers = OrderedDict()
        stats_headers['error_rate'] = {
            'title': 'Error rate',
            'description': 'Error rate: mismatches (NM) / bases mapped (CIGAR)',
            'min': 0,
            'max': 100,
            'suffix': '%',
            'scale': 'OrRd',
            'format': '{:,.2f}',
            'modify': lambda x: x * 100.0
        }
        stats_headers['reads_mapped'] = {
            'title': 'Mapped',
            'description': 'Reads mapped in the bam file ({})'.format(config.read_count_desc),
            'min': 0,
            'modify': lambda x: x * config.read_count_multiplier,
            'shared_key': 'read_count',
            'format': self.read_format,
        }
        stats_headers['reads_mapped_percent'] = {
            'title': 'Mapped',
            'description': '% Mapped reads',
            'max': 100,
            'min': 0,
            'suffix': '%',
            'scale': 'RdYlGn'
        }
        stats_headers['reads_properly_paired_percent'] = {
            'title': 'Pair',
            'description': '% Properly paired reads',
            'max': 100,
            'min': 0,
            'suffix': '%',
            'scale': 'RdYlGn',
            # Hide this column entirely when no sample has any paired reads.
            # BUGFIX: use .get() so a sample missing the field can't raise
            # KeyError; dropped the redundant 'True if ... else False'.
            'hidden': max(x.get('reads_mapped_and_paired', 0) for x in self.samtools_stats.values()) == 0
        }
        stats_headers['non-primary_alignments'] = {
            # BUGFIX: removed a no-op .format() call on a literal with no
            # placeholder ('2ry'.format(config.read_count_prefix)).
            'title': '2ry',
            'description': 'Non-primary alignments ({})'.format(config.read_count_desc),
            'min': 0,
            'scale': 'OrRd',
            'modify': lambda x: x * config.read_count_multiplier,
            'shared_key': 'read_count',
            'format': self.read_format,
        }
        stats_headers['non-primary_alignments_percent'] = {
            'title': '2ry',
            'description': '% Non-primary alignments',
            'max': 100,
            'min': 0,
            'suffix': '%',
            'scale': 'OrRd',
        }
        stats_headers['reads_MQ0_percent'] = {
            'title': 'MQ0',
            'description': '% Reads that are ambiguously placed (MQ=0)',
            'max': 100,
            'min': 0,
            'suffix': '%',
            'scale': 'OrRd',
            'hidden': True
        }
        stats_headers['raw_total_sequences'] = {
            # BUGFIX: removed another no-op .format() call
            # ('Reads'.format(config.read_count_prefix)).
            'title': 'Reads',
            'description': 'Total sequences in the bam file ({})'.format(config.read_count_desc),
            'min': 0,
            'modify': lambda x: x * config.read_count_multiplier,
            'shared_key': 'read_count',
            'format': self.read_format,
        }
        self.general_stats_addcols(self.samtools_stats, stats_headers, 'Samtools Stats')

        # Make bargraph plot of mapped/unmapped reads
        self.alignment_section(self.samtools_stats)

        # Make dot plot of counts. 'reads' and 'bases' are shared column
        # configs that rescale raw values to millions.
        keys = OrderedDict()
        reads = {
            'min': 0,
            'modify': lambda x: float(x) / 1000000.0,
            'suffix': 'M reads',
            'decimalPlaces': 2,
            'shared_key': 'read_count'
        }
        bases = {
            'min': 0,
            'modify': lambda x: float(x) / 1000000.0,
            'suffix': 'M bases',
            'decimalPlaces': 2,
            'shared_key': 'base_count'
        }
        keys['raw_total_sequences'] = dict(reads, **{'title': 'Total sequences'})
        keys['reads_mapped_and_paired'] = dict(reads, **{'title': 'Mapped & paired', 'description': 'Paired-end technology bit set + both mates mapped'})
        keys['reads_properly_paired'] = dict(reads, **{'title': 'Properly paired', 'description': 'Proper-pair bit set'})
        keys['reads_duplicated'] = dict(reads, **{'title': 'Duplicated', 'description': 'PCR or optical duplicate bit set'})
        keys['reads_QC_failed'] = dict(reads, **{'title': 'QC Failed'})
        keys['reads_MQ0'] = dict(reads, **{'title': 'Reads MQ0', 'description': 'Reads mapped and MQ=0'})
        keys['bases_mapped_(cigar)'] = dict(bases, **{'title': 'Mapped bases (CIGAR)', 'description': 'Mapped bases (CIGAR)'})
        keys['bases_trimmed'] = dict(bases, **{'title': 'Bases Trimmed'})
        keys['bases_duplicated'] = dict(bases, **{'title': 'Duplicated bases'})
        keys['pairs_on_different_chromosomes'] = dict(reads, **{'title': 'Diff chromosomes', 'description': 'Pairs on different chromosomes'})
        keys['pairs_with_other_orientation'] = dict(reads, **{'title': 'Other orientation', 'description': 'Pairs with other orientation'})
        keys['inward_oriented_pairs'] = dict(reads, **{'title': 'Inward pairs', 'description': 'Inward oriented pairs'})
        keys['outward_oriented_pairs'] = dict(reads, **{'title': 'Outward pairs', 'description': 'Outward oriented pairs'})

        self.add_section(
            name='Alignment metrics',
            anchor='samtools-stats',
            description="This module parses the output from <code>samtools stats</code>. All numbers in millions.",
            plot=beeswarm.plot(self.samtools_stats, keys, {'id': 'samtools-stats-dp'})
        )

    # Return the number of logs that were found
    return len(self.samtools_stats)
def beechart(self, s_name):
    """Return a beeswarm plot built from ``self.data2``.

    Note: ``s_name`` is currently unused; it is kept so the method's
    signature stays compatible with existing callers.
    """
    # BUGFIX: removed leftover debug output (print("gg", self.data2)).
    return beeswarm.plot(self.data2)
def chart_qc_cv(self):
    '''
    Charts _cv_table.txt

    Builds the "Sequencing Depth Statistics" beeswarm: per-sample mean
    sequencing depth and its coefficient of variation (CoV) across
    genome/CpG subsets, for all reads vs Q40 reads and low/high GC regions.

    Inputs:
        No inputs
    Returns:
        No returns, generates Sequencing Depth - Whole Genome chart
    '''
    # (input table category, short column-id suffix) pairs
    cats = [
        ('all_base', 'a_b'), ('q40_base', 'q_b'),
        ('all_base_botgc', 'a_b_b'), ('q40_base_botgc', 'q_b_b'),
        ('all_base_topgc', 'a_b_t'), ('q40_base_topgc', 'q_b_t'),
        ('all_cpg', 'a_c'), ('q40_cpg', 'q_c'),
        ('all_cpg_botgc', 'a_c_b'), ('q40_cpg_botgc', 'q_c_b'),
        ('all_cpg_topgc', 'a_c_t'), ('q40_cpg_topgc', 'q_c_t')
    ]

    pd = OrderedDict()
    for s_name, dd in self.mdata['qc_cv'].items():
        data = OrderedDict()
        for cat, key in cats:
            if cat in dd:
                # mu == -1 marks a missing/invalid measurement - skip it
                if dd[cat]['mu'] != -1:
                    data['mu_' + key] = dd[cat]['mu']
                    data['cv_' + key] = dd[cat]['cv']
        if len(data) > 0:
            pd[s_name] = data

    # BUGFIX: '{:,3f}' is not a valid format spec (missing the '.' before
    # the precision) and raises ValueError when applied; '{:,.3f}' intended.
    shared_mean = {'min': 0, 'format': '{:,.3f}', 'minRange': 10}
    shared_cofv = {'min': 0, 'format': '{:,.3f}', 'minRange': 50}

    pheader = OrderedDict()
    # Whole-genome mean depth columns
    pheader['mu_a_b'] = dict(shared_mean, **{'title': 'All Genome Mean', 'description': 'Mean Sequencing Depth for All Reads'})
    pheader['mu_q_b'] = dict(shared_mean, **{'title': 'Q40 Genome Mean', 'description': 'Mean Sequencing Depth for Q40 Reads'})
    pheader['mu_a_b_b'] = dict(shared_mean, **{'title': 'Low GC All Gen. Mean', 'description': 'Mean Sequencing Depth for All Reads in Low GC-Content Regions'})
    pheader['mu_q_b_b'] = dict(shared_mean, **{'title': 'Low GC Q40 Gen. Mean', 'description': 'Mean Sequencing Depth for Q40 Reads in Low GC-Content Regions'})
    pheader['mu_a_b_t'] = dict(shared_mean, **{'title': 'High GC All Gen. Mean', 'description': 'Mean Sequencing Depth for All Reads in High GC-Content Regions'})
    pheader['mu_q_b_t'] = dict(shared_mean, **{'title': 'High GC Q40 Gen. Mean', 'description': 'Mean Sequencing Depth for Q40 Reads in High GC-Content Regions'})
    # Whole-genome CoV columns
    pheader['cv_a_b'] = dict(shared_cofv, **{'title': 'All Genome CoV', 'description': 'Sequencing Depth CoV for All Reads'})
    pheader['cv_q_b'] = dict(shared_cofv, **{'title': 'Q40 Genome CoV', 'description': 'Sequencing Depth CoV for Q40 Reads'})
    pheader['cv_a_b_b'] = dict(shared_cofv, **{'title': 'Low GC All Gen. CoV', 'description': 'Sequencing Depth CoV for All Reads in Low GC-Content Regions'})
    pheader['cv_q_b_b'] = dict(shared_cofv, **{'title': 'Low GC Q40 Gen. CoV', 'description': 'Sequencing Depth CoV for Q40 Reads in Low GC-Content Regions'})
    pheader['cv_a_b_t'] = dict(shared_cofv, **{'title': 'High GC All Gen. CoV', 'description': 'Sequencing Depth CoV for All Reads in High GC-Content Regions'})
    pheader['cv_q_b_t'] = dict(shared_cofv, **{'title': 'High GC Q40 Gen. CoV', 'description': 'Sequencing Depth CoV for Q40 Reads in High GC-Content Regions'})
    # CpG mean depth columns
    pheader['mu_a_c'] = dict(shared_mean, **{'title': 'All CpGs Mean', 'description': 'Mean Sequencing Depth for All CpGs'})
    pheader['mu_q_c'] = dict(shared_mean, **{'title': 'Q40 CpGs Mean', 'description': 'Mean Sequencing Depth for Q40 CpGs'})
    pheader['mu_a_c_b'] = dict(shared_mean, **{'title': 'Low GC All CpGs Mean', 'description': 'Mean Sequencing Depth for All CpGs in Low GC-Content Regions'})
    pheader['mu_q_c_b'] = dict(shared_mean, **{'title': 'Low GC Q40 CpGs Mean', 'description': 'Mean Sequencing Depth for Q40 CpGs in Low GC-Content Regions'})
    pheader['mu_a_c_t'] = dict(shared_mean, **{'title': 'High GC All CpGs Mean', 'description': 'Mean Sequencing Depth for All CpGs in High GC-Content Regions'})
    pheader['mu_q_c_t'] = dict(shared_mean, **{'title': 'High GC Q40 CpGs Mean', 'description': 'Mean Sequencing Depth for Q40 CpGs in High GC-Content Regions'})
    # CpG CoV columns
    pheader['cv_a_c'] = dict(shared_cofv, **{'title': 'All CpGs CoV', 'description': 'Sequencing Depth CoV for All CpGs'})
    pheader['cv_q_c'] = dict(shared_cofv, **{'title': 'Q40 CpGs CoV', 'description': 'Sequencing Depth CoV for Q40 CpGs'})
    pheader['cv_a_c_b'] = dict(shared_cofv, **{'title': 'Low GC All CpGs CoV', 'description': 'Sequencing Depth CoV for All CpGs in Low GC-Content Regions'})
    pheader['cv_q_c_b'] = dict(shared_cofv, **{'title': 'Low GC Q40 CpGs CoV', 'description': 'Sequencing Depth CoV for Q40 CpGs in Low GC-Content Regions'})
    pheader['cv_a_c_t'] = dict(shared_cofv, **{'title': 'High GC All CpGs CoV', 'description': 'Sequencing Depth CoV for All CpGs in High GC-Content Regions'})
    pheader['cv_q_c_t'] = dict(shared_cofv, **{'title': 'High GC Q40 CpGs CoV', 'description': 'Sequencing Depth CoV for Q40 CpGs in High GC-Content Regions'})

    pconfig = {
        'id': 'biscuit_seq_depth',
        'table_title': 'BISCUIT: Sequencing Depth',
        'sortRows': False
    }

    if len(pd) > 0:
        self.add_section(
            name='Sequencing Depth Statistics',
            anchor='biscuit-seq-depth',
            description='''
                Shows the sequence depth mean and uniformity measured by the Coefficient of Variation
                (`CoV`, defined as `stddev/mean`).
            ''',
            helptext='''
                The plot shows coverage across different selections:

                * _Genome_ (Gen.) - Statistics for all bases across the entire genome
                * _CpGs_ - Statistics for CpGs
                * _All_ - Statistics for any mapped bases/CpGs
                * _Q40_ - Statistics only those bases/CpGs with mapping quality `MAPQ >= 40`
                * _High GC_ - Bases / CpGs that overlap with the top 10% of 100bp windows for GC-content
                * _Low GC_ - Bases / CpGs that overlap with the bottom 10% of 100bp windows for GC-content
            ''',
            plot=beeswarm.plot(pd, pheader, pconfig)
        )
def parse_reports(self):
    """ Find bamtools stats reports and parse their data.

    Scans log files matching 'bamtools/stats', extracts counts and
    percentages with regexes into self.bamtools_stats_data (sample name ->
    metric dict), writes the parsed table to disk, adds '% Duplicates' and
    '% Mapped' to the General Stats table, and draws a beeswarm dot plot.

    Returns the number of samples found.
    """
    # Set up vars
    self.bamtools_stats_data = dict()
    # One regex per metric; the *_pct variants re-match the same log line
    # but capture the percentage bamtools prints in parentheses.
    regexes = {
        'total_reads': r"Total reads:\s*(\d+)",
        'mapped_reads': r"Mapped reads:\s*(\d+)",
        'mapped_reads_pct': r"Mapped reads:\s*\d+\s+\(([\d\.]+)%\)",
        'forward_strand': r"Forward strand:\s*(\d+)",
        'forward_strand_pct': r"Forward strand:\s*\d+\s+\(([\d\.]+)%\)",
        'reverse_strand': r"Reverse strand:\s*(\d+)",
        'reverse_strand_pct': r"Reverse strand:\s*\d+\s+\(([\d\.]+)%\)",
        'failed_qc': r"Failed QC:\s*(\d+)",
        'failed_qc_pct': r"Failed QC:\s*\d+\s+\(([\d\.]+)%\)",
        'duplicates': r"Duplicates:\s*(\d+)",
        'duplicates_pct': r"Duplicates:\s*\d+\s+\(([\d\.]+)%\)",
        'paired_end': r"Paired-end reads:\s*(\d+)",
        'paired_end_pct': r"Paired-end reads:\s*\d+\s+\(([\d\.]+)%\)",
        'proper_pairs': r"'Proper-pairs'\s*(\d+)",
        'proper_pairs_pct': r"'Proper-pairs'\s*\d+\s+\(([\d\.]+)%\)",
        'both_mapped': r"Both pairs mapped:\s*(\d+)",
        'both_mapped_pct': r"Both pairs mapped:\s*\d+\s+\(([\d\.]+)%\)",
        'read_1': r"Read 1:\s*(\d+)",
        'read_2': r"Read 2:\s*(\d+)",
        'singletons': r"Singletons:\s*(\d+)",
        'singletons_pct': r"Singletons:\s*\d+\s+\(([\d\.]+)%\)",
    }
    # Go through files and parse data using regexes
    for f in self.find_log_files('bamtools/stats'):
        d = dict()
        for k, r in regexes.items():
            r_search = re.search(r, f['f'], re.MULTILINE)
            if r_search:
                d[k] = float(r_search.group(1))
        # Only keep files where at least one metric matched
        if len(d) > 0:
            if f['s_name'] in self.bamtools_stats_data:
                log.debug(
                    "Duplicate sample name found! Overwriting: {}".format(
                        f['s_name']))
            self.add_data_source(f, section='stats')
            self.bamtools_stats_data[f['s_name']] = d

    # Filter to strip out ignored sample names
    self.bamtools_stats_data = self.ignore_samples(self.bamtools_stats_data)

    if len(self.bamtools_stats_data) > 0:
        # Write to file
        self.write_data_file(self.bamtools_stats_data, 'multiqc_bamtools_stats')

        # Add to general stats table
        self.general_stats_headers['duplicates_pct'] = {
            'title': '% Duplicates',
            'description': '% Duplicate Reads',
            'max': 100,
            'min': 0,
            'suffix': '%',
            'scale': 'OrRd'
        }
        self.general_stats_headers['mapped_reads_pct'] = {
            'title': '% Mapped',
            'description': '% Mapped Reads',
            'max': 100,
            'min': 0,
            'suffix': '%',
            'scale': 'RdYlGn'
        }
        for s_name in self.bamtools_stats_data:
            if s_name not in self.general_stats_data:
                self.general_stats_data[s_name] = dict()
            self.general_stats_data[s_name].update(
                self.bamtools_stats_data[s_name])

        # Make dot plot of counts.
        # 'defaults' is shared config for percentage columns; 'num_defaults'
        # rescales raw counts to millions for the read-count columns.
        keys = OrderedDict()
        defaults = {'min': 0, 'max': 100, 'decimalPlaces': 2, 'suffix': '%'}
        num_defaults = {
            'min': 0,
            'modify': lambda x: float(x) / 1000000.0,
            'decimalPlaces': 2
        }
        keys['total_reads'] = dict(
            num_defaults, **{
                'title': 'Total reads',
                'description': 'Total reads (millions)'
            })
        keys['mapped_reads_pct'] = dict(defaults, **{'title': 'Mapped reads'})
        keys['forward_strand_pct'] = dict(defaults, **{'title': 'Forward strand'})
        keys['reverse_strand_pct'] = dict(defaults, **{'title': 'Reverse strand'})
        keys['failed_qc_pct'] = dict(defaults, **{'title': 'Failed QC'})
        keys['duplicates_pct'] = dict(defaults, **{'title': 'Duplicates'})
        keys['paired_end_pct'] = dict(
            defaults, **{
                'title': 'Paired-end',
                'description': 'Paired-end reads'
            })
        keys['proper_pairs_pct'] = dict(defaults, **{'title': 'Proper-pairs'})
        keys['both_mapped_pct'] = dict(
            defaults, **{
                'title': 'Both mapped',
                'description': 'Both pairs mapped'
            })
        # BUGFIX: plot keys must match the parsed-data keys produced by the
        # regexes above ('read_1'/'read_2'). The previous 'bt_read_1'/
        # 'bt_read_2' ids matched nothing, so both columns rendered empty.
        keys['read_1'] = dict(
            num_defaults, **{
                'title': 'Read 1',
                'description': 'Read 1 (millions)'
            })
        keys['read_2'] = dict(
            num_defaults, **{
                'title': 'Read 2',
                'description': 'Read 2 (millions)'
            })
        keys['singletons_pct'] = dict(defaults, **{'title': 'Singletons'})

        self.add_section(name='Bamtools Stats',
                         anchor='bamtools-stats',
                         plot=beeswarm.plot(self.bamtools_stats_data, keys))

    # Return number of samples found
    return len(self.bamtools_stats_data)
def chart_qc_cv(self):
    """
    Charts _cv_table.txt

    Builds the "Sequencing Depth Statistics" beeswarm: per-sample mean
    sequencing depth and its coefficient of variation (CoV) across
    genome/CpG subsets, for all reads vs Q40 reads and low/high GC regions.

    Inputs:
        No inputs
    Returns:
        No returns, generates Sequencing Depth - Whole Genome chart
    """
    # (input table category, short column-id suffix) pairs
    cats = [
        ("all_base", "a_b"),
        ("q40_base", "q_b"),
        ("all_base_botgc", "a_b_b"),
        ("q40_base_botgc", "q_b_b"),
        ("all_base_topgc", "a_b_t"),
        ("q40_base_topgc", "q_b_t"),
        ("all_cpg", "a_c"),
        ("q40_cpg", "q_c"),
        ("all_cpg_botgc", "a_c_b"),
        ("q40_cpg_botgc", "q_c_b"),
        ("all_cpg_topgc", "a_c_t"),
        ("q40_cpg_topgc", "q_c_t"),
    ]

    pd = OrderedDict()
    for s_name, dd in self.mdata["qc_cv"].items():
        data = OrderedDict()
        for cat, key in cats:
            if cat in dd:
                # mu == -1 marks a missing/invalid measurement - skip it
                if dd[cat]["mu"] != -1:
                    data["mu_" + key] = dd[cat]["mu"]
                    data["cv_" + key] = dd[cat]["cv"]
        if len(data) > 0:
            pd[s_name] = data

    # BUGFIX: '{:,3f}' is not a valid format spec (missing the '.' before
    # the precision) and raises ValueError when applied; '{:,.3f}' intended.
    shared_mean = {"min": 0, "format": "{:,.3f}", "minRange": 10}
    shared_cofv = {"min": 0, "format": "{:,.3f}", "minRange": 50}

    pheader = OrderedDict()
    # Whole-genome mean depth columns
    pheader["mu_a_b"] = dict(
        shared_mean, **{"title": "All Genome Mean", "description": "Mean Sequencing Depth for All Reads"}
    )
    pheader["mu_q_b"] = dict(
        shared_mean, **{"title": "Q40 Genome Mean", "description": "Mean Sequencing Depth for Q40 Reads"}
    )
    pheader["mu_a_b_b"] = dict(
        shared_mean,
        **{
            "title": "Low GC All Gen. Mean",
            "description": "Mean Sequencing Depth for All Reads in Low GC-Content Regions",
        },
    )
    pheader["mu_q_b_b"] = dict(
        shared_mean,
        **{
            "title": "Low GC Q40 Gen. Mean",
            "description": "Mean Sequencing Depth for Q40 Reads in Low GC-Content Regions",
        },
    )
    pheader["mu_a_b_t"] = dict(
        shared_mean,
        **{
            "title": "High GC All Gen. Mean",
            "description": "Mean Sequencing Depth for All Reads in High GC-Content Regions",
        },
    )
    pheader["mu_q_b_t"] = dict(
        shared_mean,
        **{
            "title": "High GC Q40 Gen. Mean",
            "description": "Mean Sequencing Depth for Q40 Reads in High GC-Content Regions",
        },
    )
    # Whole-genome CoV columns
    pheader["cv_a_b"] = dict(
        shared_cofv, **{"title": "All Genome CoV", "description": "Sequencing Depth CoV for All Reads"}
    )
    pheader["cv_q_b"] = dict(
        shared_cofv, **{"title": "Q40 Genome CoV", "description": "Sequencing Depth CoV for Q40 Reads"}
    )
    pheader["cv_a_b_b"] = dict(
        shared_cofv,
        **{
            "title": "Low GC All Gen. CoV",
            "description": "Sequencing Depth CoV for All Reads in Low GC-Content Regions",
        },
    )
    pheader["cv_q_b_b"] = dict(
        shared_cofv,
        **{
            "title": "Low GC Q40 Gen. CoV",
            "description": "Sequencing Depth CoV for Q40 Reads in Low GC-Content Regions",
        },
    )
    pheader["cv_a_b_t"] = dict(
        shared_cofv,
        **{
            "title": "High GC All Gen. CoV",
            "description": "Sequencing Depth CoV for All Reads in High GC-Content Regions",
        },
    )
    pheader["cv_q_b_t"] = dict(
        shared_cofv,
        **{
            "title": "High GC Q40 Gen. CoV",
            "description": "Sequencing Depth CoV for Q40 Reads in High GC-Content Regions",
        },
    )
    # CpG mean depth columns
    pheader["mu_a_c"] = dict(
        shared_mean, **{"title": "All CpGs Mean", "description": "Mean Sequencing Depth for All CpGs"}
    )
    pheader["mu_q_c"] = dict(
        shared_mean, **{"title": "Q40 CpGs Mean", "description": "Mean Sequencing Depth for Q40 CpGs"}
    )
    pheader["mu_a_c_b"] = dict(
        shared_mean,
        **{
            "title": "Low GC All CpGs Mean",
            "description": "Mean Sequencing Depth for All CpGs in Low GC-Content Regions",
        },
    )
    pheader["mu_q_c_b"] = dict(
        shared_mean,
        **{
            "title": "Low GC Q40 CpGs Mean",
            "description": "Mean Sequencing Depth for Q40 CpGs in Low GC-Content Regions",
        },
    )
    pheader["mu_a_c_t"] = dict(
        shared_mean,
        **{
            "title": "High GC All CpGs Mean",
            "description": "Mean Sequencing Depth for All CpGs in High GC-Content Regions",
        },
    )
    pheader["mu_q_c_t"] = dict(
        shared_mean,
        **{
            "title": "High GC Q40 CpGs Mean",
            "description": "Mean Sequencing Depth for Q40 CpGs in High GC-Content Regions",
        },
    )
    # CpG CoV columns
    pheader["cv_a_c"] = dict(
        shared_cofv, **{"title": "All CpGs CoV", "description": "Sequencing Depth CoV for All CpGs"}
    )
    pheader["cv_q_c"] = dict(
        shared_cofv, **{"title": "Q40 CpGs CoV", "description": "Sequencing Depth CoV for Q40 CpGs"}
    )
    pheader["cv_a_c_b"] = dict(
        shared_cofv,
        **{
            "title": "Low GC All CpGs CoV",
            "description": "Sequencing Depth CoV for All CpGs in Low GC-Content Regions",
        },
    )
    pheader["cv_q_c_b"] = dict(
        shared_cofv,
        **{
            "title": "Low GC Q40 CpGs CoV",
            "description": "Sequencing Depth CoV for Q40 CpGs in Low GC-Content Regions",
        },
    )
    pheader["cv_a_c_t"] = dict(
        shared_cofv,
        **{
            "title": "High GC All CpGs CoV",
            "description": "Sequencing Depth CoV for All CpGs in High GC-Content Regions",
        },
    )
    pheader["cv_q_c_t"] = dict(
        shared_cofv,
        **{
            "title": "High GC Q40 CpGs CoV",
            "description": "Sequencing Depth CoV for Q40 CpGs in High GC-Content Regions",
        },
    )

    pconfig = {"id": "biscuit_seq_depth", "table_title": "BISCUIT: Sequencing Depth", "sortRows": False}

    if len(pd) > 0:
        self.add_section(
            name="Sequencing Depth Statistics",
            anchor="biscuit-seq-depth",
            description="""
                Shows the sequence depth mean and uniformity measured by the Coefficient of Variation
                (`CoV`, defined as `stddev/mean`).
            """,
            helptext="""
                The plot shows coverage across different selections:

                * _Genome_ (Gen.) - Statistics for all bases across the entire genome
                * _CpGs_ - Statistics for CpGs
                * _All_ - Statistics for any mapped bases/CpGs
                * _Q40_ - Statistics only those bases/CpGs with mapping quality `MAPQ >= 40`
                * _High GC_ - Bases / CpGs that overlap with the top 10% of 100bp windows for GC-content
                * _Low GC_ - Bases / CpGs that overlap with the bottom 10% of 100bp windows for GC-content
            """,
            plot=beeswarm.plot(pd, pheader, pconfig),
        )
def parse_samtools_flagstats(self):
    """Locate samtools flagstat logs, parse them, and build the report section."""
    self.samtools_flagstat = dict()
    for logfile in self.find_log_files("samtools/flagstat"):
        stats = parse_single_report(logfile["f"])
        if len(stats) == 0:
            continue
        sample = logfile["s_name"]
        if sample in self.samtools_flagstat:
            log.debug("Duplicate sample name found! Overwriting: {}".format(sample))
        self.add_data_source(logfile, section="flagstat")
        self.samtools_flagstat[sample] = stats

    # Drop any samples the user asked to ignore
    self.samtools_flagstat = self.ignore_samples(self.samtools_flagstat)

    if self.samtools_flagstat:
        # Persist parsed values alongside the report
        self.write_data_file(self.samtools_flagstat, "multiqc_samtools_flagstat")

        # Columns for the General Statistics table
        gs_headers = {
            "flagstat_total": {
                "title": "{} Reads".format(config.read_count_prefix),
                "description": "Total reads in the bam file ({})".format(config.read_count_desc),
                "min": 0,
                "modify": lambda x: x * config.read_count_multiplier,
                "shared_key": "read_count",
                "placement": 100.0,
                "hidden": True,
            },
            "mapped_passed": {
                "title": "{} Reads Mapped".format(config.read_count_prefix),
                "description": "Reads Mapped in the bam file ({})".format(config.read_count_desc),
                "min": 0,
                "modify": lambda x: x * config.read_count_multiplier,
                "shared_key": "read_count",
                "placement": 101.0,
            },
        }
        self.general_stats_addcols(self.samtools_flagstat, gs_headers)

        # Shared defaults for every beeswarm column
        base = {
            "min": 0,
            "modify": lambda x: float(x) * config.read_count_multiplier,
            "suffix": "{} reads".format(config.read_count_prefix),
            "decimalPlaces": 2,
            "shared_key": "read_count",
        }
        keys = OrderedDict()

        def _col(metric, title, description=None):
            # Register one dot-plot column on top of the shared defaults
            col = dict(base, title=title)
            if description is not None:
                col["description"] = description
            keys[metric] = col

        _col("flagstat_total", "Total Reads")
        _col("total_passed", "Total Passed QC")
        _col("mapped_passed", "Mapped")
        # Only show secondary/supplementary columns when at least one sample has them
        if any(v.get("secondary_passed") for v in self.samtools_flagstat.values()):
            _col("secondary_passed", "Secondary Alignments")
        if any(v.get("supplementary_passed") for v in self.samtools_flagstat.values()):
            _col("supplementary_passed", "Supplementary Alignments")
        _col("duplicates_passed", "Duplicates")
        _col("paired in sequencing_passed", "Paired in Sequencing")
        _col("properly paired_passed", "Properly Paired")
        _col("with itself and mate mapped_passed", "Self and mate mapped", "Reads with itself and mate mapped")
        _col("singletons_passed", "Singletons")
        _col(
            "with mate mapped to a different chr_passed",
            "Mate mapped to diff chr",
            "Mate mapped to different chromosome",
        )
        _col(
            "with mate mapped to a different chr (mapQ >= 5)_passed",
            "Diff chr (mapQ >= 5)",
            "Mate mapped to different chromosome (mapQ >= 5)",
        )

        self.add_section(
            name="Samtools Flagstat",
            anchor="samtools-flagstat",
            description="This module parses the output from <code>samtools flagstat</code>. All numbers in millions.",
            plot=beeswarm.plot(self.samtools_flagstat, keys, {"id": "samtools-flagstat-dp"}),
        )

    # Return the number of logs that were found
    return len(self.samtools_flagstat)
def parse_samtools_flagstats(self):
    """Parse samtools flagstat logs and add their data to the report."""
    self.samtools_flagstat = dict()
    for f in self.find_log_files('samtools/flagstat'):
        data = parse_single_report(f['f'])
        if len(data) == 0:
            continue
        if f['s_name'] in self.samtools_flagstat:
            log.debug("Duplicate sample name found! Overwriting: {}".format(f['s_name']))
        self.add_data_source(f, section='flagstat')
        self.samtools_flagstat[f['s_name']] = data

    # Remove any user-ignored samples
    self.samtools_flagstat = self.ignore_samples(self.samtools_flagstat)

    if len(self.samtools_flagstat) > 0:
        # Persist the parsed values alongside the report
        self.write_data_file(self.samtools_flagstat, 'multiqc_samtools_flagstat')

        # Single column for the General Statistics table
        self.general_stats_addcols(
            self.samtools_flagstat,
            {
                'mapped_passed': {
                    'title': '{} Reads Mapped'.format(config.read_count_prefix),
                    'description': 'Reads Mapped in the bam file ({})'.format(config.read_count_desc),
                    'min': 0,
                    'modify': lambda x: x * config.read_count_multiplier,
                    'shared_key': 'read_count',
                    'placement': 100.0,
                }
            },
            'Samtools Flagstat',
        )

        # Shared column defaults for the beeswarm dot plot
        shared = {
            'min': 0,
            'modify': lambda x: float(x) * config.read_count_multiplier,
            'suffix': '{} reads'.format(config.read_count_prefix),
            'decimalPlaces': 2,
            'shared_key': 'read_count',
        }
        # (metric key, column title, optional description) in display order
        columns = [
            ('flagstat_total', 'Total Reads', None),
            ('total_passed', 'Total Passed QC', None),
            ('mapped_passed', 'Mapped', None),
        ]
        # Optional columns appear only when at least one sample has data for them
        if any(v.get('secondary_passed') for v in self.samtools_flagstat.values()):
            columns.append(('secondary_passed', 'Secondary Alignments', None))
        if any(v.get('supplementary_passed') for v in self.samtools_flagstat.values()):
            columns.append(('supplementary_passed', 'Supplementary Alignments', None))
        columns += [
            ('duplicates_passed', 'Duplicates', None),
            ('paired in sequencing_passed', 'Paired in Sequencing', None),
            ('properly paired_passed', 'Properly Paired', None),
            ('with itself and mate mapped_passed', 'Self and mate mapped', 'Reads with itself and mate mapped'),
            ('singletons_passed', 'Singletons', None),
            ('with mate mapped to a different chr_passed', 'Mate mapped to diff chr',
             'Mate mapped to different chromosome'),
            ('with mate mapped to a different chr (mapQ >= 5)_passed', 'Diff chr (mapQ >= 5)',
             'Mate mapped to different chromosome (mapQ >= 5)'),
        ]

        keys = OrderedDict()
        for metric, title, desc in columns:
            keys[metric] = dict(shared, title=title)
            if desc is not None:
                keys[metric]['description'] = desc

        self.add_section(
            name='Samtools Flagstat',
            anchor='samtools-flagstat',
            description='This module parses the output from <code>samtools flagstat</code>. All numbers in millions.',
            plot=beeswarm.plot(self.samtools_flagstat, keys, {'id': 'samtools-flagstat-dp'}),
        )

    # Return the number of logs that were found
    return len(self.samtools_flagstat)
def parse_reports(self):
    """Find RSeQC bam_stat reports, parse their data, and add report outputs.

    Populates self.bam_stat_data (sample name -> metric dict), writes the
    parsed data file, extends the general stats table, and adds a beeswarm
    section. Returns the number of samples found.
    """
    # Set up vars
    self.bam_stat_data = dict()
    # One regex per bam_stat metric; group(1) is the integer count
    regexes = {
        "total_records": r"Total records:\s*(\d+)",
        "qc_failed": r"QC failed:\s*(\d+)",
        "optical_pcr_duplicate": r"Optical/PCR duplicate:\s*(\d+)",
        "non_primary_hits": r"Non primary hits\s*(\d+)",
        "unmapped_reads": r"Unmapped reads:\s*(\d+)",
        "mapq_lt_mapq_cut_non-unique": r"mapq < mapq_cut \(non-unique\):\s*(\d+)",
        "mapq_gte_mapq_cut_unique": r"mapq >= mapq_cut \(unique\):\s*(\d+)",
        "read_1": r"Read-1:\s*(\d+)",
        "read_2": r"Read-2:\s*(\d+)",
        "reads_map_to_sense": r"Reads map to '\+':\s*(\d+)",
        "reads_map_to_antisense": r"Reads map to '-':\s*(\d+)",
        "non-splice_reads": r"Non-splice reads:\s*(\d+)",
        "splice_reads": r"Splice reads:\s*(\d+)",
        "reads_mapped_in_proper_pairs": r"Reads mapped in proper pairs:\s*(\d+)",
        "proper-paired_reads_map_to_different_chrom": r"Proper-paired reads map to different chrom:\s*(\d+)",
    }
    # Initiate PE check
    is_paired_end = False
    # Go through files and parse data using regexes
    for f in self.find_log_files("rseqc/bam_stat"):
        d = dict()
        for k, r in regexes.items():
            r_search = re.search(r, f["f"], re.MULTILINE)
            if r_search:
                d[k] = int(r_search.group(1))
        # Calculate some percentages
        if "total_records" in d:
            t = float(d["total_records"])
            # Guard against an (invalid) report with zero total records
            if t > 0:
                if "mapq_gte_mapq_cut_unique" in d:
                    d["unique_percent"] = (float(d["mapq_gte_mapq_cut_unique"]) / t) * 100.0
                if "reads_mapped_in_proper_pairs" in d:
                    d["proper_pairs_percent"] = (float(d["reads_mapped_in_proper_pairs"]) / t) * 100.0
        if len(d) > 0:
            if f["s_name"] in self.bam_stat_data:
                log.debug("Duplicate sample name found! Overwriting: {}".format(f["s_name"]))
            self.add_data_source(f, section="bam_stat")
            # Check if SE or PE. Use .get() - the Read-2 line can be
            # missing from a truncated report, which previously raised KeyError.
            if d.get("read_2", 0) != 0:
                is_paired_end = True
            self.bam_stat_data[f["s_name"]] = d

    # Filter to strip out ignored sample names
    self.bam_stat_data = self.ignore_samples(self.bam_stat_data)

    if len(self.bam_stat_data) > 0:
        # Write to file
        self.write_data_file(self.bam_stat_data, "multiqc_rseqc_bam_stat")

        # Add to general stats table
        self.general_stats_headers["proper_pairs_percent"] = {
            "title": "% Proper Pairs",
            "description": "% Reads mapped in proper pairs",
            "max": 100,
            "min": 0,
            "suffix": "%",
            "scale": "RdYlGn",
        }
        for s_name in self.bam_stat_data:
            if s_name not in self.general_stats_data:
                self.general_stats_data[s_name] = dict()
            # Only write if PE, i.e. there is something to write
            if is_paired_end:
                self.general_stats_data[s_name].update(self.bam_stat_data[s_name])

        # Make dot plot of counts
        pconfig = {"id": "rseqc_bam_stat"}
        keys = OrderedDict()
        defaults = {
            "min": 0,
            "shared_key": "read_count",
            "decimalPlaces": 2,
            "modify": lambda x: float(x) / 1000000.0,
        }
        keys["total_records"] = dict(defaults, **{"title": "Total records"})
        keys["qc_failed"] = dict(defaults, **{"title": "QC failed"})
        keys["optical_pcr_duplicate"] = dict(
            defaults, **{"title": "Duplicates", "description": "Optical/PCR duplicate"}
        )
        keys["non_primary_hits"] = dict(defaults, **{"title": "Non primary hit"})
        keys["unmapped_reads"] = dict(defaults, **{"title": "Unmapped", "description": "Unmapped reads"})
        # NOTE: this key must match the regexes dict exactly - it was previously
        # "mapq_lt_mapq_cut_non", which never matched the parsed data and left
        # the Non-unique column empty in the plot.
        keys["mapq_lt_mapq_cut_non-unique"] = dict(
            defaults, **{"title": "Non-unique", "description": "mapq < mapq_cut (non-unique)"}
        )
        keys["mapq_gte_mapq_cut_unique"] = dict(
            defaults, **{"title": "Unique", "description": "mapq >= mapq_cut (unique)"}
        )
        if is_paired_end:
            keys["read_1"] = dict(defaults, **{"title": "Read-1"})
            keys["read_2"] = dict(defaults, **{"title": "Read-2"})
        keys["reads_map_to_sense"] = dict(defaults, **{"title": "+ve strand", "description": "Reads map to '+'"})
        keys["reads_map_to_antisense"] = dict(defaults, **{"title": "-ve strand", "description": "Reads map to '-'"})
        keys["non-splice_reads"] = dict(defaults, **{"title": "Non-splice reads"})
        keys["splice_reads"] = dict(defaults, **{"title": "Splice reads"})
        if is_paired_end:
            keys["reads_mapped_in_proper_pairs"] = dict(
                defaults, **{"title": "Proper pairs", "description": "Reads mapped in proper pairs"}
            )
        keys["proper-paired_reads_map_to_different_chrom"] = dict(
            defaults, **{"title": "Different chrom", "description": "Proper-paired reads map to different chrom"}
        )

        self.add_section(
            name="Bam Stat",
            anchor="rseqc-bam_stat",
            description="All numbers reported in millions.",
            plot=beeswarm.plot(self.bam_stat_data, keys, pconfig),
        )

    # Return number of samples found
    return len(self.bam_stat_data)
def parse_samtools_flagstats(self):
    """Find Samtools flagstat logs under mapping/flagstats and parse their data.

    Returns the number of flagstat reports that were found.
    """
    self.samtools_flagstat = dict()
    for fpath in glob('mapping/flagstats/*.tsv'):
        # glob() yields plain path strings, so build the file-info dict that
        # the rest of this module (and add_data_source) expects. The previous
        # code indexed the string as f['s_name'], which raised TypeError.
        fn = fpath.split('/')[-1]
        f = {
            's_name': fn.rsplit('.', 1)[0],  # sample name = filename without extension
            'root': 'mapping/flagstats',
            'fn': fn,
        }
        # Read the file contents; the sibling implementations pass the
        # contents string to parse_single_report, not an open file handle.
        # Using a context manager also closes the handle (previously leaked).
        with open(fpath) as fh:
            parsed_data = parse_single_report(fh.read())
        if len(parsed_data) > 0:
            if f['s_name'] in self.samtools_flagstat:
                log.debug("Duplicate sample name found! Overwriting: {}".format(f['s_name']))
            self.add_data_source(f, section='flagstat')
            self.samtools_flagstat[f['s_name']] = parsed_data

    # Filter to strip out ignored sample names
    self.samtools_flagstat = self.ignore_samples(self.samtools_flagstat)

    if len(self.samtools_flagstat) > 0:
        # Write parsed report data to a file (restructure first)
        self.write_data_file(self.samtools_flagstat, 'multiqc_samtools_flagstat')

        # General Stats Table
        flagstats_headers = dict()
        flagstats_headers['mapped_passed'] = {
            'title': '{} Reads Mapped'.format(config.read_count_prefix),
            'description': 'Reads Mapped in the bam file ({})'.format(config.read_count_desc),
            'min': 0,
            'modify': lambda x: x * config.read_count_multiplier,
            'shared_key': 'read_count',
            'placement': 100.0,
        }
        self.general_stats_addcols(self.samtools_flagstat, flagstats_headers, 'Samtools Flagstat')

        # Make dot plot of counts
        keys = OrderedDict()
        # Shared defaults for every beeswarm column
        reads = {
            'min': 0,
            'modify': lambda x: float(x) * config.read_count_multiplier,
            'suffix': '{} reads'.format(config.read_count_prefix),
            'decimalPlaces': 2,
            'shared_key': 'read_count',
        }
        keys['flagstat_total'] = dict(reads, title='Total Reads')
        keys['total_passed'] = dict(reads, title='Total Passed QC')
        keys['mapped_passed'] = dict(reads, title='Mapped')
        # Optional columns: only shown when at least one sample has data
        if any(v.get('secondary_passed') for v in self.samtools_flagstat.values()):
            keys['secondary_passed'] = dict(reads, title='Secondary Alignments')
        if any(v.get('supplementary_passed') for v in self.samtools_flagstat.values()):
            keys['supplementary_passed'] = dict(reads, title='Supplementary Alignments')
        keys['duplicates_passed'] = dict(reads, title='Duplicates')
        keys['paired in sequencing_passed'] = dict(reads, title='Paired in Sequencing')
        keys['properly paired_passed'] = dict(reads, title='Properly Paired')
        keys['with itself and mate mapped_passed'] = \
            dict(reads, title='Self and mate mapped', description='Reads with itself and mate mapped')
        keys['singletons_passed'] = dict(reads, title='Singletons')
        keys['with mate mapped to a different chr_passed'] = \
            dict(reads, title='Mate mapped to diff chr', description='Mate mapped to different chromosome')
        keys['with mate mapped to a different chr (mapQ >= 5)_passed'] = \
            dict(reads, title='Diff chr (mapQ >= 5)', description='Mate mapped to different chromosome (mapQ >= 5)')

        self.add_section(
            name='Samtools Flagstat',
            anchor='samtools-flagstat',
            description='This module parses the output from <code>samtools flagstat</code>. All numbers in millions.',
            plot=beeswarm.plot(self.samtools_flagstat, keys, {'id': 'samtools-flagstat-dp'}),
        )

    # Return the number of logs that were found
    return len(self.samtools_flagstat)