def bcbio_mirna_stats(self):
    fns = self.find_log_files('bcbio/seqbuster')
    mirs_data = defaultdict(dict)
    mirs_key = OrderedDict()
    iso_data = defaultdict(dict)
    iso_key = OrderedDict()
    for f in fns:
        s_name = self.clean_s_name(f['fn'], root=None)
        with open(os.path.join(f['root'], f['fn'])) as in_handle:
            for line in in_handle:
                cols = line.strip().split()
                if line.startswith("mirs_"):
                    mirs_key[cols[0]] = {'name': cols[0].replace("_", " ")}
                    mirs_data[s_name][cols[0]] = int(cols[1])
                if line.startswith("iso_"):
                    iso_key[cols[0]] = {'name': cols[0].replace("_", " ")}
                    iso_data[s_name][cols[0]] = int(cols[1])

    self.write_data_file(mirs_data, "seqbuster_mirs")
    self.write_data_file(iso_data, "seqbuster_isomirs")

    if mirs_data:
        cnfg = {'ylab': '# of miRNAs', 'title': "Number of miRNAs with changes"}
        self.mirs = bargraph.plot(mirs_data, mirs_key, cnfg)
    if iso_data:
        cnfg = {'ylab': '# of isomiRs', 'title': "Number of isomiRs with changes"}
        self.iso = bargraph.plot(iso_data, iso_key, cnfg)
def bowtie2_alignment_plot(self):
    """ Make the HighCharts HTML to plot the alignment rates """
    half_warning = ''
    for s_name in self.bowtie2_data:
        if 'paired_aligned_mate_one_halved' in self.bowtie2_data[s_name] \
                or 'paired_aligned_mate_multi_halved' in self.bowtie2_data[s_name] \
                or 'paired_aligned_mate_none_halved' in self.bowtie2_data[s_name]:
            half_warning = '<em>Please note that single mate alignment counts are halved to tally with pair counts properly.</em>'
    description_text = 'This plot shows the number of reads aligning to the reference in different ways.'

    # Config for the plot
    config = {
        'ylab': '# Reads',
        'cpswitch_counts_label': 'Number of Reads'
    }

    # Two plots, don't mix SE with PE
    if self.num_se > 0:
        sekeys = OrderedDict()
        sekeys['unpaired_aligned_one'] = { 'color': '#20568f', 'name': 'SE mapped uniquely' }
        sekeys['unpaired_aligned_multi'] = { 'color': '#f7a35c', 'name': 'SE multimapped' }
        sekeys['unpaired_aligned_none'] = { 'color': '#981919', 'name': 'SE not aligned' }
        config['id'] = 'bowtie2_se_plot'
        config['title'] = 'Bowtie 2: SE Alignment Scores'
        self.add_section(
            description = description_text,
            helptext = '''
            There are 3 possible types of alignment:

            * **SE mapped uniquely**: Read has only one occurrence in the reference genome.
            * **SE multimapped**: Read has multiple occurrences.
            * **SE not aligned**: Read has no occurrences.
            ''',
            plot = bargraph.plot(self.bowtie2_data, sekeys, config)
        )

    if self.num_pe > 0:
        pekeys = OrderedDict()
        pekeys['paired_aligned_one'] = { 'color': '#20568f', 'name': 'PE mapped uniquely' }
        pekeys['paired_aligned_discord_one'] = { 'color': '#5c94ca', 'name': 'PE mapped discordantly uniquely' }
        pekeys['paired_aligned_mate_one_halved'] = { 'color': '#95ceff', 'name': 'PE one mate mapped uniquely' }
        pekeys['paired_aligned_multi'] = { 'color': '#f7a35c', 'name': 'PE multimapped' }
        pekeys['paired_aligned_discord_multi'] = { 'color': '#dce333', 'name': 'PE discordantly multimapped' }
        pekeys['paired_aligned_mate_multi_halved'] = { 'color': '#ffeb75', 'name': 'PE one mate multimapped' }
        pekeys['paired_aligned_mate_none_halved'] = { 'color': '#981919', 'name': 'PE neither mate aligned' }
        config['id'] = 'bowtie2_pe_plot'
        config['title'] = 'Bowtie 2: PE Alignment Scores'
        self.add_section(
            description = "<br>".join([description_text, half_warning]),
            helptext = '''
            There are 7 possible types of alignment:

            * **PE mapped uniquely**: Pair has only one occurrence in the reference genome.
            * **PE mapped discordantly uniquely**: Pair has only one occurrence, but not in a proper pair.
            * **PE one mate mapped uniquely**: One read of a pair has one occurrence.
            * **PE multimapped**: Pair has multiple occurrences.
            * **PE discordantly multimapped**: Pair has multiple occurrences, but not in proper pairs.
            * **PE one mate multimapped**: One read of a pair has multiple occurrences.
            * **PE neither mate aligned**: Pair has no occurrences.
            ''',
            plot = bargraph.plot(self.bowtie2_data, pekeys, config)
        )
def hisat2_alignment_plot(self):
    """ Make the HighCharts HTML to plot the alignment rates """
    # Split the data into SE and PE
    sedata = {}
    pedata = {}
    for s_name, data in self.hisat2_data.items():
        if 'paired_total' in data:
            # Save half 'pairs' of mate counts
            m_keys = ['unpaired_total', 'unpaired_aligned_none', 'unpaired_aligned_one', 'unpaired_aligned_multi']
            for k in m_keys:
                if k in data:
                    data[k] = float(data[k]) / 2.0
            pedata[s_name] = data
        else:
            sedata[s_name] = data

    # Two plots, don't mix SE with PE
    if len(sedata) > 0:
        sekeys = OrderedDict()
        sekeys['unpaired_aligned_one'] = { 'color': '#20568f', 'name': 'SE mapped uniquely' }
        sekeys['unpaired_aligned_multi'] = { 'color': '#f7a35c', 'name': 'SE multimapped' }
        sekeys['unpaired_aligned_none'] = { 'color': '#981919', 'name': 'SE not aligned' }
        pconfig = {
            'id': 'hisat2_se_plot',
            'title': 'HISAT2: SE Alignment Scores',
            'ylab': '# Reads',
            'cpswitch_counts_label': 'Number of Reads'
        }
        self.add_section( plot = bargraph.plot(sedata, sekeys, pconfig) )

    if len(pedata) > 0:
        pekeys = OrderedDict()
        pekeys['paired_aligned_one'] = { 'color': '#20568f', 'name': 'PE mapped uniquely' }
        pekeys['paired_aligned_discord_one'] = { 'color': '#5c94ca', 'name': 'PE mapped discordantly uniquely' }
        pekeys['unpaired_aligned_one'] = { 'color': '#95ceff', 'name': 'PE one mate mapped uniquely' }
        pekeys['paired_aligned_multi'] = { 'color': '#f7a35c', 'name': 'PE multimapped' }
        pekeys['unpaired_aligned_multi'] = { 'color': '#ffeb75', 'name': 'PE one mate multimapped' }
        pekeys['unpaired_aligned_none'] = { 'color': '#981919', 'name': 'PE neither mate aligned' }
        pconfig = {
            'id': 'hisat2_pe_plot',
            'title': 'HISAT2: PE Alignment Scores',
            'ylab': '# Reads',
            'cpswitch_counts_label': 'Number of Reads'
        }
        self.add_section(
            description = '<em>Please note that single mate alignment counts are halved to tally with pair counts properly.</em>',
            plot = bargraph.plot(pedata, pekeys, pconfig)
        )
def bbt_simple_plot(self):
    """ Makes a simple bar plot with summed alignment counts for
    each species, stacked. """
    # First, sum the different types of alignment counts
    data = OrderedDict()
    cats = OrderedDict()
    for s_name in self.bbt_data:
        data[s_name] = OrderedDict()
        for org in self.bbt_data[s_name]:
            data[s_name][org] = self.bbt_data[s_name][org]['hits'] - self.bbt_data[s_name][org]['shared']
            if org not in cats and org != 'multiMatch' and org != 'noMatch':
                # Strip the FASTA file extension from the category name
                if org.lower().endswith('.fa'):
                    cname = org[:-3]
                elif org.lower().endswith('.fasta'):
                    cname = org[:-6]
                else:
                    cname = org
                cats[org] = { 'name': cname }

    pconfig = {
        'id': 'biobloom_tools',
        'title': 'BioBloom Tools: Alignment counts per species',
        'ylab': 'Number of hits',
        'hide_zero_cats': False
    }
    cats['multiMatch'] = { 'name': 'Multiple Genomes', 'color': '#820000' }
    cats['noMatch'] = { 'name': 'No Match', 'color': '#cccccc' }

    return bargraph.plot(data, cats, pconfig)
def summary_plot(data):
    """Barplot of combined pairs"""
    cats = OrderedDict()
    cats['inniepairs'] = { 'name': 'Combined innie pairs', 'color': '#191970' }
    cats['outiepairs'] = { 'name': 'Combined outie pairs', 'color': '#00A08A' }
    cats['uncombopairs'] = { 'name': 'Uncombined pairs', 'color': '#cd1076' }
    cats['discardpairs'] = { 'name': 'Discarded pairs', 'color': '#ffd700' }
    splotconfig = {
        'id': 'flash_combo_stats_plot',
        'title': 'FLASh: Read combination statistics',
        'ylab': 'Number of read pairs',
        'hide_zero_cats': False
    }
    return bargraph.plot(data, cats, splotconfig)
def tag_info_chart(self):
    """ Make the taginfo.txt plot """
    ## TODO: human chrs on hg19. How will this work with GRCh genome or other, non human, genomes?
    # nice if they are ordered by size
    # Note: build the lists with concatenation, not .append() (which returns
    # None), and keep names as strings so they match the parsed data keys
    ucsc = ["chr" + str(i) for i in range(1, 23)] + ["chrX", "chrY", "chrM"]
    ensembl = [str(i) for i in range(1, 23)] + ["X", "Y", "MT"]
    pconfig = {
        'id': 'tagInfo',
        'title': 'Homer: Tag Info Distribution',
        'ylab': 'Tags',
        'cpswitch_counts_label': 'Number of Tags'
    }

    ## check if chromosomes start with "chr" (UCSC) or "#" (Ensembl)
    sample1 = next(iter(self.tagdir_data['taginfo_total']))
    chrFormat = next(iter(self.tagdir_data['taginfo_total'][sample1]))
    if "chr" in chrFormat:
        chrs = ucsc
    else:
        chrs = ensembl

    return bargraph.plot(self.tagdir_data['taginfo_total'], chrs, pconfig)
def theta2_purities_chart(self):
    """ Make the plot showing alignment rates """
    # Specify the order of the different possible categories
    keys = OrderedDict()
    keys['proportion_germline'] = { 'name': 'Germline' }
    keys['proportion_tumour_1'] = { 'name': 'Tumour Subclone 1' }
    keys['proportion_tumour_2'] = { 'name': 'Tumour Subclone 2' }
    keys['proportion_tumour_3'] = { 'name': 'Tumour Subclone 3' }
    keys['proportion_tumour_4'] = { 'name': 'Tumour Subclone 4' }
    keys['proportion_tumour_5'] = { 'name': 'Tumour Subclone 5' }
    keys['proportion_tumour_gt5'] = { 'name': 'Tumour Subclones > 5' }

    # Config for the plot
    pconfig = {
        'id': 'theta2_purity_plot',
        'title': 'THetA2: Tumour Subclone Purities',
        'cpswitch': False,
        'ymin': 0,
        'ymax': 100,
        'ylab': '% Purity',
        'tt_suffix': '%'
    }

    return bargraph.plot(self.theta2_data, keys, pconfig)
def __init__(self, c_id, mod):
    modname = mod['config'].get('section_name', c_id.replace('_', ' ').title())
    if modname == '' or modname is None:
        modname = 'Custom Content'

    # Initialise the parent object
    super(MultiqcModule, self).__init__(
        name = modname,
        anchor = mod['config'].get('section_anchor', c_id),
        href = mod['config'].get('section_href'),
        info = mod['config'].get('description')
    )

    pconfig = mod['config'].get('pconfig', {})
    if pconfig.get('title') is None:
        pconfig['title'] = modname

    # Table
    if mod['config'].get('plot_type') == 'table':
        pconfig['sortRows'] = pconfig.get('sortRows', False)
        headers = mod['config'].get('headers')
        self.add_section( plot = table.plot(mod['data'], headers, pconfig) )
        self.write_data_file( mod['data'], "multiqc_{}".format(modname.lower().replace(' ', '_')) )

    # Bar plot
    elif mod['config'].get('plot_type') == 'bargraph':
        self.add_section( plot = bargraph.plot(mod['data'], mod['config'].get('categories'), pconfig) )

    # Line plot
    elif mod['config'].get('plot_type') == 'linegraph':
        self.add_section( plot = linegraph.plot(mod['data'], pconfig) )

    # Scatter plot
    elif mod['config'].get('plot_type') == 'scatter':
        self.add_section( plot = scatter.plot(mod['data'], pconfig) )

    # Heatmap
    elif mod['config'].get('plot_type') == 'heatmap':
        self.add_section( plot = heatmap.plot(mod['data'], mod['config'].get('xcats'), mod['config'].get('ycats'), pconfig) )

    # Beeswarm plot
    elif mod['config'].get('plot_type') == 'beeswarm':
        self.add_section( plot = beeswarm.plot(mod['data'], pconfig) )

    # Raw HTML
    elif mod['config'].get('plot_type') == 'html':
        self.add_section( content = mod['data'] )

    # Raw image file as html
    elif mod['config'].get('plot_type') == 'image':
        self.add_section( content = mod['data'] )

    # Not supplied
    elif mod['config'].get('plot_type') is None:
        log.warning("Plot type not found for content ID '{}'".format(c_id))

    # Not recognised
    else:
        log.warning("Error - custom content plot type '{}' not recognised for content ID {}".format(mod['config'].get('plot_type'), c_id))
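# A minimal sketch of the parsed custom-content structure the dispatcher above
# expects. The sample names and values here are hypothetical; in practice the
# dict is assembled by the custom content file parsers, but the config keys
# shown ('section_name', 'plot_type', 'categories', 'pconfig') are the ones
# the __init__ above actually reads:
#
#   >>> mod = {
#   ...     'config': {
#   ...         'section_name': 'My Stats',
#   ...         'plot_type': 'bargraph',
#   ...         'categories': ['cat_1', 'cat_2'],
#   ...         'pconfig': {'id': 'my_stats_plot'},
#   ...     },
#   ...     'data': {'sample_1': {'cat_1': 10, 'cat_2': 2}},
#   ... }
#   >>> MultiqcModule('my_stats', mod)  # renders a single bargraph section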
def star_genecount_chart(self):
    """ Make a plot for the ReadsPerGene output """
    # Specify the order of the different possible categories
    keys = OrderedDict()
    keys['N_genes'] = { 'color': '#2f7ed8', 'name': 'Overlapping Genes' }
    keys['N_noFeature'] = { 'color': '#0d233a', 'name': 'No Feature' }
    keys['N_ambiguous'] = { 'color': '#492970', 'name': 'Ambiguous Features' }
    keys['N_multimapping'] = { 'color': '#f28f43', 'name': 'Multimapping' }
    keys['N_unmapped'] = { 'color': '#7f0000', 'name': 'Unmapped' }

    # Config for the plot
    pconfig = {
        'id': 'star_gene_counts',
        'title': 'STAR: Gene Counts',
        'ylab': '# Reads',
        'cpswitch_counts_label': 'Number of Reads',
        'data_labels': ['Unstranded', 'Same Stranded', 'Reverse Stranded']
    }
    datasets = [
        self.star_genecounts_unstranded,
        self.star_genecounts_first_strand,
        self.star_genecounts_second_strand
    ]
    # One category dict per dataset - three datasets, so three sets of keys
    return bargraph.plot(datasets, [keys, keys, keys], pconfig)
def hicup_truncating_chart(self):
    """ Generate the HiCUP Truncated reads plot """
    # Specify the order of the different possible categories
    keys = OrderedDict()
    keys['Not_Truncated_Reads'] = { 'color': '#2f7ed8', 'name': 'Not Truncated' }
    keys['Truncated_Read'] = { 'color': '#0d233a', 'name': 'Truncated' }

    # Construct a data structure for the plot - duplicate the samples for read 1 and read 2
    data = {}
    for s_name in self.hicup_data:
        for read in ['1', '2']:
            r_name = '{} Read {}'.format(s_name, read)
            data[r_name] = {}
            for k in keys:
                data[r_name][k] = self.hicup_data[s_name]['{}_{}'.format(k, read)]

    # Config for the plot
    config = {
        'id': 'hicup_truncated_reads_plot',
        'title': 'HiCUP: Truncated Reads',
        'ylab': '# Reads',
        'cpswitch_counts_label': 'Number of Reads'
    }

    return bargraph.plot(data, keys, config)
def mirtrace_contamination_check(self):
    """ Generate the miRTrace Contamination Check """
    # A library of 24 colors. Should be enough for this plot
    color_lib = ['rgb(166,206,227)', 'rgb(31,120,180)', 'rgb(178,223,138)',
                 'rgb(51,160,44)', 'rgb(251,154,153)', 'rgb(227,26,28)',
                 'rgb(253,191,111)', 'rgb(255,127,0)', 'rgb(202,178,214)',
                 'rgb(106,61,154)', 'rgb(255,255,153)', 'rgb(177,89,40)',
                 'rgb(141,211,199)', 'rgb(255,255,179)', 'rgb(190,186,218)',
                 'rgb(251,128,114)', 'rgb(128,177,211)', 'rgb(253,180,98)',
                 'rgb(179,222,105)', 'rgb(252,205,229)', 'rgb(217,217,217)',
                 'rgb(188,128,189)', 'rgb(204,235,197)', 'rgb(255,237,111)']
    idx = 0

    # Specify the order of the different possible categories
    keys = OrderedDict()
    for clade in self.contamination_data[list(self.contamination_data.keys())[0]]:
        keys[clade] = { 'color': color_lib[idx], 'name': clade }
        # Cycle through the colour library
        if idx < 23:
            idx += 1
        else:
            idx = 0

    # Config for the plot
    config = {
        'cpswitch_c_active': False,
        'id': 'mirtrace_contamination_check_plot',
        'title': 'miRTrace: Contamination Check',
        'ylab': '# miRNA detected',
        'cpswitch_counts_label': 'Number of detected miRNA'
    }

    return bargraph.plot(self.contamination_data, keys, config)
def hicup_alignment_chart(self):
    """ Generate the HiCUP Aligned reads plot """
    # Specify the order of the different possible categories
    keys = OrderedDict()
    keys['Unique_Alignments_Read'] = { 'color': '#2f7ed8', 'name': 'Unique Alignments' }
    keys['Multiple_Alignments_Read'] = { 'color': '#492970', 'name': 'Multiple Alignments' }
    keys['Failed_To_Align_Read'] = { 'color': '#0d233a', 'name': 'Failed To Align' }
    keys['Too_Short_To_Map_Read'] = { 'color': '#f28f43', 'name': 'Too short to map' }

    # Construct a data structure for the plot - duplicate the samples for read 1 and read 2
    data = {}
    for s_name in self.hicup_data:
        for read in ['1', '2']:
            r_name = '{} Read {}'.format(s_name, read)
            data[r_name] = {}
            for k in keys:
                data[r_name][k] = self.hicup_data[s_name]['{}_{}'.format(k, read)]

    # Config for the plot
    config = {
        'id': 'hicup_mapping_stats_plot',
        'title': 'HiCUP: Mapping Statistics',
        'ylab': '# Reads',
        'cpswitch_counts_label': 'Number of Reads'
    }

    return bargraph.plot(data, keys, config)
def rsem_mapped_reads_plot(self):
    """ Make the rsem assignment rates plot """
    # Plot categories
    keys = OrderedDict()
    keys['Unique'] = { 'color': '#437bb1', 'name': 'Aligned uniquely to a gene' }
    keys['Multi'] = { 'color': '#e63491', 'name': 'Aligned to multiple genes' }
    keys['Filtered'] = { 'color': '#b1084c', 'name': 'Filtered due to too many alignments' }
    keys['Unalignable'] = { 'color': '#7f0000', 'name': 'Unalignable reads' }

    # Config for the plot
    config = {
        'id': 'rsem_assignment_plot',
        'title': 'RSEM: Mapped reads',
        'ylab': '# Reads',
        'cpswitch_counts_label': 'Number of Reads',
        'hide_zero_cats': False
    }

    self.add_section(
        name = 'Mapped Reads',
        anchor = 'rsem_mapped_reads',
        description = 'A breakdown of how all reads were aligned for each sample.',
        plot = bargraph.plot(self.rsem_mapped_data, keys, config)
    )
def bowtie_alignment_plot(self):
    """ Make the HighCharts HTML to plot the alignment rates """
    # Specify the order of the different possible categories
    keys = OrderedDict()
    keys['reads_aligned'] = { 'color': '#8bbc21', 'name': 'Aligned' }
    keys['multimapped'] = { 'color': '#2f7ed8', 'name': 'Multimapped' }
    keys['not_aligned'] = { 'color': '#0d233a', 'name': 'Not aligned' }

    # Config for the plot
    config = {
        'id': 'bowtie1_alignment',
        'title': 'Bowtie 1: Alignment Scores',
        'ylab': '# Reads',
        'cpswitch_counts_label': 'Number of Reads'
    }

    self.add_section(
        description = 'This plot shows the number of reads aligning to the reference in different ways.',
        helptext = '''
        There are 3 possible types of alignment:

        * **Aligned**: Read has only one occurrence in the reference genome.
        * **Multimapped**: Read has multiple occurrences.
        * **Not aligned**: Read has no occurrences.
        ''',
        plot = bargraph.plot(self.bowtie_data, keys, config)
    )
def slamdunkUtrRatesPlot(self):
    """ Generate the UTR rates plot """
    cats = OrderedDict()
    keys = ['T>C', 'A>T', 'A>G', 'A>C', 'T>A', 'T>G', 'G>A', 'G>T', 'G>C', 'C>A', 'C>T', 'C>G']
    for i, v in enumerate(keys):
        cats[v] = { 'color': self.plot_cols[i] }

    pconfig = {
        'id': 'slamdunk_utrratesplot',
        'title': 'Slamdunk: Overall conversion rates per UTR',
        'cpswitch': False,
        'cpswitch_c_active': False,
        'ylab': 'Number of conversions',
        'stacking': 'normal',
        'tt_decimals': 2,
        'tt_suffix': '%',
        'tt_percentages': False,
        'hide_zero_cats': False
    }

    self.add_section(
        name = 'Conversion rates per UTR',
        anchor = 'slamdunk_utr_rates',
        description = 'This plot shows the individual conversion rates for all UTRs '
                      '(see the <a href="http://t-neumann.github.io/slamdunk/docs.html#utrrates" target="_blank">slamdunk docs</a>).',
        plot = bargraph.plot(self.utrates_data, cats, pconfig)
    )
def hicpro_capture_chart(self):
    """ Generate the Capture Hi-C plot """
    keys = OrderedDict()
    keys['valid_pairs_on_target_cap_cap'] = { 'color': '#0039e6', 'name': 'Capture-Capture interactions' }
    keys['valid_pairs_on_target_cap_rep'] = { 'color': '#809fff', 'name': 'Capture-Reporter interactions' }
    keys['valid_pairs_off_target'] = { 'color': '#cccccc', 'name': 'Off-target valid pairs' }

    # Check that capture information is available
    num_samples = 0
    for s_name in self.hicpro_data:
        num_samples += sum(1 for k in keys if k in self.hicpro_data[s_name])
    if num_samples == 0:
        return False

    # Config for the plot
    config = {
        'id': 'hicpro_cap_plot',
        'title': 'HiC-Pro: Capture Statistics',
        'ylab': '# Pairs',
        'cpswitch_counts_label': 'Number of Pairs'
    }

    return bargraph.plot(self.hicpro_data, keys, config)
def hicpro_as_chart(self):
    """ Generate the Allele-specific plot """
    keys = OrderedDict()
    keys['Valid_pairs_from_ref_genome_(1-1)'] = { 'color': '#e6550d', 'name': 'Genome1 specific read pairs (1-1)' }
    keys['Valid_pairs_from_ref_genome_with_one_unassigned_mate_(0-1/1-0)'] = { 'color': '#fdae6b', 'name': 'Genome1 with one unassigned mate (0-1/1-0)' }
    keys['Valid_pairs_from_alt_genome_(2-2)'] = { 'color': '#756bb1', 'name': 'Genome2 specific read pairs (2-2)' }
    keys['Valid_pairs_from_alt_genome_with_one_unassigned_mate_(0-2/2-0)'] = { 'color': '#bcbddc', 'name': 'Genome2 with one unassigned mate (0-2/2-0)' }
    keys['Valid_pairs_from_alt_and_ref_genome_(1-2/2-1)'] = { 'color': '#a6611a', 'name': 'Trans homologous read pairs (1-2/2-1)' }
    keys['Valid_pairs_with_both_unassigned_mated_(0-0)'] = { 'color': '#cccccc', 'name': 'Unassigned read pairs' }
    keys['Valid_pairs_with_at_least_one_conflicting_mate_(3-)'] = { 'color': '#a9a2a2', 'name': 'Conflicting read pairs' }

    # Check that allele-specific analysis was run
    num_samples = 0
    for s_name in self.hicpro_data:
        num_samples += sum(1 for k in keys if k in self.hicpro_data[s_name])
    if num_samples == 0:
        return False

    # Config for the plot
    config = {
        'id': 'hicpro_asan_plot',
        'title': 'HiC-Pro: Allele-specific Statistics',
        'ylab': '# Pairs',
        'cpswitch_counts_label': 'Number of Pairs'
    }

    return bargraph.plot(self.hicpro_data, keys, config)
def hicpro_mapping_chart(self):
    """ Generate the HiC-Pro Aligned reads plot """
    # Specify the order of the different possible categories
    keys = OrderedDict()
    keys['Full_Alignments_Read'] = { 'color': '#005ce6', 'name': 'Full read alignments' }
    keys['Trimmed_Alignments_Read'] = { 'color': '#3385ff', 'name': 'Trimmed read alignments' }
    keys['Failed_To_Align_Read'] = { 'color': '#a9a2a2', 'name': 'Failed To Align' }

    data = [{}, {}]
    for s_name in self.hicpro_data:
        for r in [1, 2]:
            data[r - 1]['{} [R{}]'.format(s_name, r)] = {
                'Full_Alignments_Read': self.hicpro_data[s_name]['global_R{}'.format(r)],
                'Trimmed_Alignments_Read': self.hicpro_data[s_name]['local_R{}'.format(r)],
                'Failed_To_Align_Read': int(self.hicpro_data[s_name]['total_R{}'.format(r)]) - int(self.hicpro_data[s_name]['mapped_R{}'.format(r)])
            }

    # Config for the plot
    config = {
        'id': 'hicpro_mapping_stats_plot',
        'title': 'HiC-Pro: Mapping Statistics',
        'ylab': '# Reads: Read 1',
        'data_labels': [
            {'name': 'Read 1', 'ylab': '# Reads: Read 1'},
            {'name': 'Read 2', 'ylab': '# Reads: Read 2'}
        ]
    }

    return bargraph.plot(data, [keys, keys], config)
def transcript_associated_plot(self):
    """ Plot a bargraph showing the Transcript-associated reads """
    # Plot bar graph of groups
    keys = OrderedDict()
    keys['Exonic Rate'] = { 'name': 'Exonic', 'color': '#2f7ed8' }
    keys['Intronic Rate'] = { 'name': 'Intronic', 'color': '#8bbc21' }
    keys['Intergenic Rate'] = { 'name': 'Intergenic', 'color': '#0d233a' }

    # Config for the plot
    pconfig = {
        'id': 'rna_seqc_position_plot',
        'title': 'RNA-SeQC: Transcript-associated reads',
        'ylab': 'Ratio of Reads',
        'cpswitch': False,
        'ymax': 1,
        'ymin': 0,
        'tt_decimals': 3,
        'cpswitch_c_active': False
    }
    self.add_section(
        name = 'Transcript-associated reads',
        anchor = 'Transcript_associated',
        helptext = 'All of the above rates are per mapped read. Exonic Rate is the fraction mapping within exons. '
                   'Intronic Rate is the fraction mapping within introns. '
                   'Intergenic Rate is the fraction mapping in the genomic space between genes.',
        plot = bargraph.plot(self.rna_seqc_metrics, keys, pconfig)
    )
def parse_samtools_rmdup(self):
    """ Find Samtools rmdup logs and parse their data """
    self.samtools_rmdup = dict()
    for f in self.find_log_files('samtools/rmdup', filehandles=True):
        # Example below:
        # [bam_rmdupse_core] 26602816 / 103563641 = 0.2569 in library ' '
        dups_regex = r"\[bam_rmdups?e?_core\] (\d+) / (\d+) = (\d+\.\d+) in library '(.*)'"
        s_name = f['s_name']
        for l in f['f']:
            match = re.search(dups_regex, l)
            if match:
                library_name = match.group(4).strip()
                if library_name != '':
                    s_name = library_name
                if s_name in self.samtools_rmdup:
                    log.debug("Duplicate sample name found in {}! Overwriting: {}".format(f['fn'], s_name))
                self.add_data_source(f, s_name)
                self.samtools_rmdup[s_name] = dict()
                self.samtools_rmdup[s_name]['n_dups'] = int(match.group(1))
                self.samtools_rmdup[s_name]['n_tot'] = int(match.group(2))
                self.samtools_rmdup[s_name]['n_unique'] = int(match.group(2)) - int(match.group(1))
                self.samtools_rmdup[s_name]['pct_dups'] = float(match.group(3)) * 100

    # Filter to strip out ignored sample names
    self.samtools_rmdup = self.ignore_samples(self.samtools_rmdup)

    if len(self.samtools_rmdup) > 0:
        # Write parsed report data to a file
        self.write_data_file(self.samtools_rmdup, 'multiqc_samtools_rmdup')

        # Make a bar plot showing duplicates
        keys = OrderedDict()
        keys['n_unique'] = {'name': 'Non-duplicated reads'}
        keys['n_dups'] = {'name': 'Duplicated reads'}
        pconfig = {
            'id': 'samtools_rmdup_plot',
            'title': 'Samtools rmdup: Duplicate alignments',
            'yDecimals': False
        }
        self.add_section(
            name = 'Duplicates removed',
            anchor = 'samtools-rmdup',
            plot = bargraph.plot(self.samtools_rmdup, keys, pconfig)
        )

        # Add a column to the General Stats table
        stats_headers = OrderedDict()
        stats_headers['pct_dups'] = {
            'title': '% Dups',
            'description': 'Percent of duplicate alignments',
            'min': 0,
            'max': 100,
            'suffix': '%',
            'scale': 'OrRd'
        }
        self.general_stats_addcols(self.samtools_rmdup, stats_headers, 'Samtools rmdup')

    return len(self.samtools_rmdup)
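# A quick sketch of what the rmdup regex above captures, run against the
# example log line given in the comment (doctest-style, illustrative only):
#
#   >>> import re
#   >>> l = "[bam_rmdupse_core] 26602816 / 103563641 = 0.2569 in library ' '"
#   >>> m = re.search(r"\[bam_rmdups?e?_core\] (\d+) / (\d+) = (\d+\.\d+) in library '(.*)'", l)
#   >>> m.group(1), m.group(2), m.group(3), m.group(4).strip()
#   ('26602816', '103563641', '0.2569', '')
#
# i.e. group 1 is the duplicate count, group 2 the total, group 3 the
# duplicate fraction and group 4 the (possibly blank) library name.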
def chart_align_strand(self):
    """ Mapping strand distribution """
    pd1 = {}
    pd2 = {}
    for sid, dd in self.mdata['align_strand'].items():
        pd1[sid] = dd['read1']
        pd2[sid] = dd['read2']

    # Same category definitions for both the Read 1 and Read 2 datasets
    strand_cats = OrderedDict([
        ('++', {'name': '++: Watson-Aligned, Watson-Bisulfite Conversion', 'color': '#F53855'}),
        ('+-', {'name': '+-: Watson-Aligned, Crick-Bisulfite Conversion', 'color': '#E37B40'}),
        ('-+', {'name': '-+: Crick-Aligned, Watson-Bisulfite Conversion', 'color': '#46B29D'}),
        ('--', {'name': '--: Crick-Aligned, Crick-Bisulfite Conversion', 'color': '#324D5C'}),
    ])

    self.add_section(
        name = 'Mapping Strand Distribution',
        anchor = 'biscuit-strands',
        description = "This plot shows the distribution of the strand of mapping and the strand of bisulfite conversion.",
        helptext = "In most bisulfite libraries, read 1 aligns to the parent strands (`++` or `--`) and read 2 to the daughter/synthesized strands (`+-` or `-+`). PBAT and most single-cell/low-input libraries typically do not follow this rule.",
        plot = bargraph.plot([pd1, pd2], [strand_cats, strand_cats], {
            'id': 'biscuit_strands',
            'title': 'BISCUIT: Mapping Strand Distribution',
            'ylab': 'Number of Reads',
            'cpswitch_c_active': True,
            'cpswitch_counts_label': '# Reads',
            'data_labels': [{'name': 'Read 1'}, {'name': 'Read 2'}]
        })
    )
def macs_filtered_reads_plot(self):
    """ Plot of filtered reads for control and treatment samples """
    data = dict()
    req_cats = ['control_fragments_total', 'control_fragments_after_filtering',
                'treatment_fragments_total', 'treatment_fragments_after_filtering']
    for s_name, d in self.macs_data.items():
        if all([c in d for c in req_cats]):
            data['{}: Control'.format(s_name)] = dict()
            data['{}: Treatment'.format(s_name)] = dict()
            data['{}: Control'.format(s_name)]['fragments_filtered'] = d['control_fragments_total'] - d['control_fragments_after_filtering']
            data['{}: Control'.format(s_name)]['fragments_not_filtered'] = d['control_fragments_after_filtering']
            data['{}: Treatment'.format(s_name)]['fragments_filtered'] = d['treatment_fragments_total'] - d['treatment_fragments_after_filtering']
            data['{}: Treatment'.format(s_name)]['fragments_not_filtered'] = d['treatment_fragments_after_filtering']

    # Specify the order of the different possible categories
    keys = OrderedDict()
    keys['fragments_not_filtered'] = { 'color': '#437BB1', 'name': 'Remaining fragments' }
    keys['fragments_filtered'] = { 'color': '#B1084C', 'name': 'Filtered fragments' }

    # Config for the plot
    pconfig = {
        'id': 'macs2_filtered',
        'title': 'MACS2: Filtered Fragments',
        'ylab': '# Fragments',
        'cpswitch_counts_label': 'Number of Fragments',
        'hide_zero_cats': False
    }
    self.add_section( plot = bargraph.plot(data, keys, pconfig) )
def adapter_removal_retained_chart(self):
    pconfig = {
        'title': 'Adapter Removal: Discarded Reads',
        'id': 'ar_retained_plot',
        'ylab': '# Reads',
        'hide_zero_cats': False,
        'cpswitch_counts_label': 'Number of Reads'
    }

    cats_pec = OrderedDict()
    if self.__any_paired:
        cats_pec['retained_reads'] = {'name': 'Retained Read Pairs'}
    cats_pec['singleton_m1'] = {'name': 'Singleton R1'}
    if self.__any_paired:
        cats_pec['singleton_m2'] = {'name': 'Singleton R2'}
    if self.__any_collapsed:
        cats_pec['full-length_cp'] = {'name': 'Full-length Collapsed Pairs'}
        cats_pec['truncated_cp'] = {'name': 'Truncated Collapsed Pairs'}
    cats_pec['discarded_m1'] = {'name': 'Discarded R1'}
    if self.__any_paired:
        cats_pec['discarded_m2'] = {'name': 'Discarded R2'}

    self.add_section(
        name='Retained and Discarded Paired-End Collapsed',
        anchor='adapter_removal_retained_plot',
        description='The number of retained and discarded reads.',
        plot=bargraph.plot(self.adapter_removal_data, cats_pec, pconfig)
    )
def qorts_splice_loci_barplot(self):
    """ Make the HighCharts HTML to plot the qorts splice loci """
    # Specify the order of the different possible categories
    keys = [
        'SpliceLoci_Known_ManyReads',
        'SpliceLoci_Known_FewReads',
        'SpliceLoci_Known_NoReads',
        'SpliceLoci_Novel_ManyReads',
        'SpliceLoci_Novel_FewReads',
    ]
    cats = OrderedDict()
    for k in keys:
        # e.g. 'SpliceLoci_Known_ManyReads' -> 'Known: Many Reads'
        name = k.replace('SpliceLoci_', '').replace('_', ': ')
        name = re.sub(r"([a-z])([A-Z])", r"\g<1> \g<2>", name)
        cats[k] = { 'name': name }

    # Config for the plot
    pconfig = {
        'id': 'qorts_splice_loci',
        'title': 'QoRTs: Splice Loci',
        'ylab': '# Splice Loci',
        'cpswitch_counts_label': 'Number of Splice Loci',
        'hide_zero_cats': False
    }

    self.add_section(
        name = "Splice Loci",
        description = "This plot shows the number of splice junction loci of each type that appear in the sample's reads.",
        helptext = '''
        The [QoRTs vignette](http://hartleys.github.io/QoRTs/doc/QoRTs-vignette.pdf) describes
        the categories in this plot as follows:

        * **Known**: The splice junction locus is found in the supplied transcript annotation gtf file.
        * **Novel**: The splice junction locus is NOT found in the supplied transcript annotation gtf file.
        * **Known: Few reads**: The locus is known, and is only covered by 1-3 read-pairs.
        * **Known: Many reads**: The locus is known, and is covered by 4 or more read-pairs.
        * **Novel: Few reads**: The locus is novel, and is only covered by 1-3 read-pairs.
        * **Novel: Many reads**: The locus is novel, and is covered by 4 or more read-pairs.

        _What it means and what to look for:_

        This plot can be used to detect a number of anomalies. For example: whether mapping or
        sequencing artifacts caused a disproportionate discovery of novel splice junctions in one
        sample or batch. It can also be used as an indicator of the comprehensiveness of the genome
        annotation. Replicates that are obvious outliers may have sequencing/technical issues
        causing false detection of splice junctions.

        Abnormalities in the splice junction rates are generally a symptom of larger issues which
        will generally be picked up by other metrics. Numerous factors can reduce the efficacy
        with which aligners map across splice junctions, and as such these plots become very
        important if the intended downstream analyses include transcript assembly, transcript
        deconvolution, differential splicing, or any other form of analysis that in some way
        involves the splice junctions themselves. These plots can be used to assess whether other
        minor abnormalities observed in the other plots are of sufficient severity to impact
        splice junction mapping and thus potentially compromise such analyses.
        ''',
        plot = bargraph.plot(self.qorts_data, cats, pconfig)
    )
def chart_align_mapq(self):
    """ Fraction of optimally mapped reads """
    pd = {}
    for sid, dd in self.mdata['align_mapq'].items():
        pd[sid] = {'OAligned': 0, 'SAligned': 0, 'UAligned': 0}
        for mapq, cnt in dd.items():
            if mapq == 'unmapped':
                pd[sid]['UAligned'] += int(cnt)
            elif int(mapq) >= 40:
                pd[sid]['OAligned'] += int(cnt)
            else:
                pd[sid]['SAligned'] += int(cnt)

    self.add_section(
        name = 'Mapping Summary',
        anchor = 'biscuit-mapping',
        description = 'This shows the fraction of optimally aligned reads, defined as reads with mapQ >= 40.',
        helptext = 'A good library should have a high fraction of optimally aligned reads. Suboptimally aligned reads include both non-unique alignments and imperfect alignments.',
        plot = bargraph.plot(pd, OrderedDict([
            ('OAligned', {'name': 'Optimally Aligned Reads'}),
            ('SAligned', {'name': 'Suboptimally Aligned Reads'}),
            ('UAligned', {'name': 'Unaligned Reads'})
        ]), {
            'id': 'biscuit_mapping_summary',
            'title': 'BISCUIT: Mapping Summary',
            'ylab': 'Number of Reads',
            'cpswitch_counts_label': '# Reads'
        })
    )

    # Mapping quality together in one plot
    total = {}
    for sid, dd in self.mdata['align_mapq'].items():
        total[sid] = sum(int(cnt) for mapq, cnt in dd.items() if mapq != "unmapped")

    pd_mapping = {}
    for sid, dd in self.mdata['align_mapq'].items():
        mapqcnts = []
        for mapq in range(61):
            if str(mapq) in dd:
                mapqcnts.append(float(dd[str(mapq)]) / total[sid] * 100)
            else:
                mapqcnts.append(0)
        pd_mapping[sid] = dict(zip(range(61), mapqcnts))

    self.add_section(
        name = 'Mapping Quality Distribution',
        anchor = 'biscuit-mapq',
        description = "This plot shows the distribution of primary mapping quality.",
        plot = linegraph.plot(pd_mapping, {
            'id': 'biscuit_mapping',
            'title': 'BISCUIT: Mapping Information',
            'ymin': 0,
            'yLabelFormat': '{value}%',
            'tt_label': '<strong>Q{point.x}:</strong> {point.y:.2f}% of reads',
            'name': 'Mapping Quality',
            'ylab': '% Primary Mapped Reads',
            'xlab': 'Mapping Quality'
        })
    )
def read_count_plot(self):
    """ Stacked bar plot showing counts of reads """
    pconfig = {
        'id': 'fastqc_sequence_counts_plot',
        'title': 'FastQC: Sequence Counts',
        'ylab': 'Number of reads',
        'cpswitch_counts_label': 'Number of reads',
        'hide_zero_cats': False
    }
    pdata = dict()
    has_dups = False
    has_total = False
    for s_name in self.fastqc_data:
        pd = self.fastqc_data[s_name]['basic_statistics']
        pdata[s_name] = dict()
        try:
            pdata[s_name]['Duplicate Reads'] = int(((100.0 - float(pd['total_deduplicated_percentage'])) / 100.0) * pd['Total Sequences'])
            pdata[s_name]['Unique Reads'] = pd['Total Sequences'] - pdata[s_name]['Duplicate Reads']
            has_dups = True
        except KeyError:
            # Older versions of FastQC don't have duplicate reads
            pdata[s_name] = { 'Total Sequences': pd['Total Sequences'] }
            has_total = True

    pcats = list()
    duptext = ''
    if has_total:
        pcats.append('Total Sequences')
    if has_dups:
        pcats.extend(['Unique Reads', 'Duplicate Reads'])
        duptext = ' Duplicate read counts are an estimate only.'
    if has_total and not has_dups:
        pconfig['use_legend'] = False
        pconfig['cpswitch'] = False

    self.add_section(
        name = 'Sequence Counts',
        anchor = 'fastqc_sequence_counts',
        description = 'Sequence counts for each sample.' + duptext,
        helptext = '''
        This plot shows the total number of reads, broken down into unique and duplicate
        if possible (only more recent versions of FastQC give duplicate info).

        You can read more about duplicate calculation in the
        [FastQC documentation](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/3%20Analysis%20Modules/8%20Duplicate%20Sequences.html).
        A small part has been copied here for convenience:

        _Only sequences which first appear in the first 100,000 sequences in each file are
        analysed. This should be enough to get a good impression for the duplication levels in
        the whole file. Each sequence is tracked to the end of the file to give a representative
        count of the overall duplication level._

        _The duplication detection requires an exact sequence match over the whole length of
        the sequence. Any reads over 75bp in length are truncated to 50bp for this analysis._
        ''',
        plot = bargraph.plot(pdata, pcats, pconfig)
    )
def add_barplot(self):
    """ Generate the Samblaster bar plot. """
    cats = OrderedDict()
    cats['n_nondups'] = {'name': 'Non-duplicates'}
    cats['n_dups'] = {'name': 'Duplicates'}
    pconfig = {
        'id': 'samblaster_duplicates',
        'title': 'Samblaster: Number of duplicate reads',
    }
    self.add_section( plot = bargraph.plot(self.samblaster_data, cats, pconfig) )
def featureCounts_chart(self):
    """ Make the featureCounts assignment rates plot """
    # Config for the plot
    config = {
        'id': 'featureCounts_assignment_plot',
        'title': 'featureCounts: Assignments',
        'ylab': '# Reads',
        'cpswitch_counts_label': 'Number of Reads'
    }
    return bargraph.plot(self.featurecounts_data, self.featurecounts_keys, config)
def qorts_alignment_barplot(self):
    """ Alignment statistics bar plot """
    # Specify the order of the different possible categories
    keys = [
        'ReadPairs_UniqueGene_CDS',
        'ReadPairs_UniqueGene_UTR',
        'ReadPairs_AmbigGene',
        'ReadPairs_NoGene_Intron',
        'ReadPairs_NoGene_OneKbFromGene',
        'ReadPairs_NoGene_TenKbFromGene',
        'ReadPairs_NoGene_MiddleOfNowhere'
    ]
    cats = OrderedDict()
    for k in keys:
        name = k.replace('ReadPairs_', '').replace('_', ': ')
        name = re.sub(r"([a-z])([A-Z])", r"\g<1> \g<2>", name)
        cats[k] = { 'name': name }

    # Config for the plot
    pconfig = {
        'id': 'qorts_alignments',
        'title': 'QoRTs: Alignment Locations',
        'ylab': '# Read Pairs',
        'cpswitch_counts_label': 'Number of Read Pairs',
        'hide_zero_cats': False
    }

    self.add_section(
        name = "Alignments",
        description = "This plot displays the rate at which the sample's read-pairs are assigned to the different categories.",
        helptext = '''
        The [QoRTs vignette](http://hartleys.github.io/QoRTs/doc/QoRTs-vignette.pdf) describes
        the categories in this plot as follows:

        * **Unique Gene**: The read-pair overlaps with the exonic segments of one and only one
          gene. For many downstream analyses tools, such as DESeq, DESeq2 and EdgeR, only
          read-pairs in this category are used.
        * **Ambig Gene**: The read-pair overlaps with the exons of more than one gene.
        * **No Gene: Intronic**: The read-pair does not overlap with the exons of any annotated
          gene, but appears in a region that is bridged by an annotated splice junction.
        * **No Gene: One kb From Gene**: The read-pair does not overlap with the exons of any
          annotated gene, but is within 1 kilobase from the nearest annotated gene.
        * **No Gene: Ten kb From Gene**: The read-pair does not overlap with the exons of any
          annotated gene, but is within 10 kilobases from the nearest annotated gene.
        * **No Gene: Middle Of Nowhere**: The read-pair does not overlap with the exons of any
          annotated gene, and is more than 10 kilobases from the nearest annotated gene.

        _What it means and what to look for:_

        Outliers in these plots can indicate biological variations or the presence of large
        mapping problems. They may also suggest the presence of large, highly-expressed,
        unannotated transcripts or genes.
        ''',
        plot = bargraph.plot(self.qorts_data, cats, pconfig)
    )
def overrepresented_sequences(self):
    """ Sum the percentages of overrepresented sequences and display them in a bar plot """
    data = dict()
    for s_name in self.fastqc_data:
        data[s_name] = dict()
        try:
            max_pcnt = max([float(d['percentage']) for d in self.fastqc_data[s_name]['overrepresented_sequences']])
            total_pcnt = sum([float(d['percentage']) for d in self.fastqc_data[s_name]['overrepresented_sequences']])
            data[s_name]['total_overrepresented'] = total_pcnt
            data[s_name]['top_overrepresented'] = max_pcnt
            data[s_name]['remaining_overrepresented'] = total_pcnt - max_pcnt
        except KeyError:
            if self.fastqc_data[s_name]['statuses']['overrepresented_sequences'] == 'pass':
                data[s_name]['total_overrepresented'] = 0
                data[s_name]['top_overrepresented'] = 0
                data[s_name]['remaining_overrepresented'] = 0
            else:
                log.debug("Couldn't find data for {}, invalid key".format(s_name))

    cats = OrderedDict()
    cats['top_overrepresented'] = { 'name': 'Top over-represented sequence' }
    cats['remaining_overrepresented'] = { 'name': 'Sum of remaining over-represented sequences' }

    # Config for the plot
    pconfig = {
        'id': 'fastqc_overrepresented_sequences_plot',
        'title': 'FastQC: Overrepresented sequences',
        'ymin': 0,
        'yCeiling': 100,
        'yMinRange': 20,
        'tt_decimals': 2,
        'tt_suffix': '%',
        'tt_percentages': False,
        'ylab_format': '{value}%',
        'cpswitch': False,
        'ylab': 'Percentage of Total Sequences'
    }

    # Check if any samples have more than 1% overrepresented sequences, else don't make plot.
    if max([x['total_overrepresented'] for x in data.values()]) < 1:
        plot_html = '<div class="alert alert-info">{} samples had less than 1% of reads made up of overrepresented sequences</div>'.format(len(data))
    else:
        plot_html = bargraph.plot(data, cats, pconfig)

    self.add_section(
        name = 'Overrepresented sequences',
        anchor = 'fastqc_overrepresented_sequences',
        description = 'The total amount of overrepresented sequences found in each library. ' +
                      'See the <a href="http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/3%20Analysis%20Modules/9%20Overrepresented%20Sequences.html" target="_blank">FastQC help for further information</a>.',
        plot = plot_html
    )
def reads_by_quality_plot(self):
    """ Make the HighCharts HTML to plot the reads by quality """

    def _get_total_reads(data_dict):
        for stat_type in self._stat_types:
            total_key = f"Number of reads_{stat_type}"
            if total_key in data_dict:
                return data_dict[total_key], stat_type
        return None, None

    bar_data = {}
    stat_type = "unrecognized"
    # Order of keys, from >Q5 to >Q15
    _range_names = {
        ">Q5": "<Q5",
        ">Q7": "Q5-7",
        ">Q10": "Q7-10",
        ">Q12": "Q10-12",
        ">Q15": "Q12-15",
        "rest": ">Q15",
    }
    for s_name, data_dict in self.nanostat_data.items():
        reads_total, stat_type = _get_total_reads(data_dict)
        if s_name in bar_data and stat_type == "aligned":
            log.debug(f"Sample '{s_name}' duplicated in the quality plot - ignoring aligned data")
            continue
        elif s_name in bar_data and stat_type == "seq summary":
            log.debug(f"Sample '{s_name}' duplicated in the quality plot - overwriting with seq summary data")

        bar_data[s_name] = {}
        prev_reads = reads_total
        for k, range_name in _range_names.items():
            if k != "rest":
                data_key = f"{k}_{stat_type}"
                reads_gt = data_dict[data_key]
                bar_data[s_name][range_name] = prev_reads - reads_gt
                if bar_data[s_name][range_name] < 0:
                    log.error(f"Error on {s_name} {range_name} {data_key}. Negative number of reads")
                prev_reads = reads_gt
            else:
                data_key = f">Q15_{stat_type}"
                bar_data[s_name][range_name] = data_dict[data_key]

    cats = OrderedDict()
    keys = reversed(list(_range_names.values()))
    colours = mqc_colour.mqc_colour_scale("RdYlGn-rev", 0, len(_range_names))
    for idx, k in enumerate(keys):
        cats[k] = {
            "name": "Reads " + k,
            "color": colours.get_colour(idx, lighten=1)
        }

    # Config for the plot
    config = {
        "id": "nanostat_quality_dist",
        "title": "NanoStat: Reads by quality",
        "ylab": "# Reads",
        "cpswitch_counts_label": "Number of Reads",
    }

    # Add the report section
    self.add_section(
        name="Reads by quality",
        anchor="nanostat_read_qualities",
        description="Read counts categorised by read quality (Phred score).",
        helptext="""
        Sequencing machines assign each generated read a quality score using the
        [Phred scale](https://en.wikipedia.org/wiki/Phred_quality_score).
        The Phred score represents the likelihood that a given read contains errors.
        High quality reads have a high score.

        Data may come from NanoPlot reports generated with sequencing summary files or
        alignment stats. If a sample has data from both, the sequencing summary is preferred.
        """,
        plot=bargraph.plot(bar_data, cats, config),
    )
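# A worked sketch of the cumulative-to-bin conversion done above, using
# hypothetical counts. NanoStat reports cumulative ">Qx" read counts, which
# are converted into disjoint quality bins by successive subtraction:
#
#   total reads = 100, >Q5 = 90, >Q7 = 80, >Q10 = 60, >Q12 = 40, >Q15 = 10
#
#   <Q5:    100 - 90 = 10      Q10-12:  60 - 40 = 20
#   Q5-7:    90 - 80 = 10      Q12-15:  40 - 10 = 30
#   Q7-10:   80 - 60 = 20      >Q15:              10
#
# The six bins sum back to the 100 total reads, which is why a negative bin
# (caught by the log.error above) always signals inconsistent input data.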
def quast_predicted_genes_barplot(self):
    """
    Make a bar plot showing the number and length of
    predicted genes for each assembly
    """
    # Prep the data
    # extract the ranges given to quast with "--gene-thresholds"
    prefix = '# predicted genes (>= '
    suffix = ' bp)'
    all_thresholds = sorted(list(set([
        int(key[len(prefix):-len(suffix)])
        for _, d in self.quast_data.items()
        for key in d.keys()
        if key.startswith(prefix)
    ])))

    data = {}
    ourpat = '>= {}{} bp'
    theirpat = prefix + "{}" + suffix
    for s_name, d in self.quast_data.items():
        # The same global threshold set applies to every sample
        thresholds = all_thresholds
        if len(thresholds) < 2:
            continue

        p = dict()
        try:
            p = { ourpat.format(thresholds[-1], ""): d[theirpat.format(thresholds[-1])] }
            for low, high in zip(thresholds[:-1], thresholds[1:]):
                p[ourpat.format(low, -high)] = d[theirpat.format(low)] - d[theirpat.format(high)]
            assert sum(p.values()) == d[theirpat.format(0)]
        except AssertionError:
            log.warning("Predicted gene counts didn't add up properly for \"{}\"".format(s_name))
        except KeyError:
            log.warning("Not all predicted gene thresholds available for \"{}\"".format(s_name))

        data[s_name] = p

    cats = [
        ourpat.format(low, -high if high else "")
        for low, high in zip(all_thresholds, all_thresholds[1:] + [None])
    ]

    if len(cats) > 0:
        return bargraph.plot(data, cats)
    else:
        return None
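# A worked example of the binning above, with hypothetical counts. Given
# `--gene-thresholds 0,300,1500`, QUAST reports cumulative counts such as
#
#   "# predicted genes (>= 0 bp)":    100
#   "# predicted genes (>= 300 bp)":   60
#   "# predicted genes (>= 1500 bp)":  10
#
# which the loop converts into the stacked categories
#
#   ">= 0-300 bp":    100 - 60 = 40
#   ">= 300-1500 bp":  60 - 10 = 50
#   ">= 1500 bp":                10
#
# and 40 + 50 + 10 == 100, which is exactly what the assert verifies against
# the ">= 0 bp" total.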
def add_kraken(self):
    data = {}
    # First, we figure out all possible names
    kingdoms = set([
        x for k in self.sequana_data.keys()
        for x in self.sequana_data[k].keys()
    ])

    colors = ['Archaea', 'Bacteria', 'Eukaryota', 'Viruses',
              'Metazoa', 'Fungi', "Unclassified", "Classified"]

    for sample_name in self.sequana_data.keys():
        for kingdom in sorted(kingdoms):
            if kingdom not in self.sequana_data[sample_name]:
                self.sequana_data[sample_name][kingdom] = 0
        data[sample_name] = {"others": 0}
        for kingdom in sorted(kingdoms):
            if kingdom not in colors:
                # here we add together non-superkingdom + other artificial
                # sequences
                data[sample_name]["others"] += \
                    self._set_nan_to_zero(self.sequana_data[sample_name][kingdom])
            else:
                data[sample_name][kingdom.lower()] = \
                    self._set_nan_to_zero(self.sequana_data[sample_name][kingdom])
        data[sample_name]['unclassified'] = \
            self._set_nan_to_zero(self.sequana_data[sample_name]['Unclassified'])

    pconfig = {
        "title": "Taxonomy by kingdom",
        #"percentages": True,
        "cpswitch": False,
        "min": 0,
        "max": 100,
        "format": '{0:.2f}',
        "logswitch": False,
    }

    keys = OrderedDict()
    # superkingdom:
    keys['archaea'] = {'color': 'orange', 'name': 'Archaea'}
    keys['bacteria'] = {'color': '#b1084c', 'name': 'Bacteria'}
    keys['eukaryota'] = {'color': 'green', 'name': 'Eukaryota'}
    keys['viruses'] = {'color': '#437bb1', 'name': 'Viruses'}
    # kingdom:
    keys['metazoa'] = {'color': 'green', 'name': 'Metazoa'}
    keys['fungi'] = {'color': 'purple', 'name': 'Fungi'}
    # others
    keys['unclassified'] = {'color': 'grey', 'name': 'Unclassified'}
    keys['others'] = {'color': 'blue', 'name': 'Others'}
    # subkingdom
    #keys['viridiplantae'] = {'color': 'yellow', 'name': 'Viridiplantae'}
    #keys['dikarya'] = {'color': 'brown', 'name': 'dikarya'}

    self.add_section(
        name='Taxonomy by kingdom',
        anchor='taxonomy',
        description='The following barplot summarises the kraken analysis for each sample.',
        helptext="",
        plot=bargraph.plot(data, keys, pconfig)
    )
def parse_reports(self):
    """ Find Sentieon AlignmentSummaryMetrics reports and parse their data """

    # Set up vars
    self.sentieon_alignment_metrics = dict()

    # Go through logs and find Metrics
    for f in self.find_log_files("sentieon/alignment_metrics", filehandles=True):
        parsed_data = dict()
        s_name = None
        keys = None
        for l in f["f"]:
            # New log starting
            if s_name is None and "AlignmentStat" in l:
                keys = None
                # Pull sample name from filename
                s_name = os.path.basename(f["s_name"])
                s_name = self.clean_s_name(s_name, f["root"])
                parsed_data[s_name] = dict()

            if s_name is not None:
                if "AlignmentStat" in l and "#SentieonCommandLine" in l:
                    keys = f["f"].readline().strip("\n").split("\t")
                elif keys:
                    vals = l.strip("\n").split("\t")
                    if len(vals) == len(keys):
                        # Ignore the FIRST_OF_PAIR / SECOND_OF_PAIR data
                        # to simplify things
                        if vals[0] == "PAIR" or vals[0] == "UNPAIRED":
                            for i, k in enumerate(keys):
                                try:
                                    parsed_data[s_name][k] = float(vals[i])
                                except ValueError:
                                    parsed_data[s_name][k] = vals[i]
                    else:
                        s_name = None
                        keys = None

        # Remove empty dictionaries
        for s_name in list(parsed_data.keys()):
            if len(parsed_data[s_name]) == 0:
                parsed_data.pop(s_name, None)

        # Save the parsed data, warning about duplicate sample names
        for s_name in parsed_data.keys():
            if s_name in self.sentieon_alignment_metrics:
                log.debug("Duplicate sample name found in {}! Overwriting: {}".format(f["fn"], s_name))
            self.add_data_source(f, s_name, section="AlignmentSummaryMetrics")
            self.sentieon_alignment_metrics[s_name] = parsed_data[s_name]

    # Filter to strip out ignored sample names
    self.sentieon_alignment_metrics = self.ignore_samples(self.sentieon_alignment_metrics)

    if len(self.sentieon_alignment_metrics) > 0:
        # Write parsed data to a file
        self.write_data_file(self.sentieon_alignment_metrics, "multiqc_sentieon_AlignmentSummaryMetrics")

        # Add to general stats table
        self.general_stats_headers["PCT_PF_READS_ALIGNED"] = {
            "title": "% Aligned",
            "description": "Percent of aligned reads",
            "max": 100,
            "min": 0,
            "suffix": "%",
            "format": "{:,.0f}",
            "scale": "RdYlGn",
            "modify": lambda x: self.multiply_hundred(x),
        }
        for s_name in self.sentieon_alignment_metrics:
            if s_name not in self.general_stats_data:
                self.general_stats_data[s_name] = dict()
            self.general_stats_data[s_name].update(self.sentieon_alignment_metrics[s_name])

        # Make the bar plot of alignment read count
        pdata = dict()
        for s_name in self.sentieon_alignment_metrics.keys():
            pdata[s_name] = dict()
            # Sentieon reports both reads for PE data.
            # Divide by two, as most people will expect the number of clusters
            if self.sentieon_alignment_metrics[s_name]["CATEGORY"] == "PAIR":
                pdata[s_name]["total_reads"] = self.sentieon_alignment_metrics[s_name]["TOTAL_READS"] / 2
                pdata[s_name]["aligned_reads"] = self.sentieon_alignment_metrics[s_name]["PF_READS_ALIGNED"] / 2
            else:
                pdata[s_name]["total_reads"] = self.sentieon_alignment_metrics[s_name]["TOTAL_READS"]
                pdata[s_name]["aligned_reads"] = self.sentieon_alignment_metrics[s_name]["PF_READS_ALIGNED"]
            pdata[s_name]["unaligned_reads"] = pdata[s_name]["total_reads"] - pdata[s_name]["aligned_reads"]

        keys = OrderedDict()
        keys["aligned_reads"] = {"name": "Aligned Reads"}
        keys["unaligned_reads"] = {"name": "Unaligned Reads"}

        # Config for the plot
        pconfig = {
            "id": "sentieon_aligned_reads",
            "title": "Sentieon: Aligned Reads",
            "ylab": "# Reads",
            "cpswitch_counts_label": "Number of Reads",
        }

        self.add_section(
            name="Alignment Summary",
            anchor="sentieon-alignmentsummary",
            description="Please note that Sentieon's read counts are divided by two for paired-end data.",
            plot=bargraph.plot(pdata, keys, pconfig),
        )

    # Return the number of detected samples to the parent module
    return len(self.sentieon_alignment_metrics)
def __init__(self):
    # Initialise the parent object
    super(MultiqcModule, self).__init__(
        name='bcl2fastq',
        anchor='bcl2fastq',
        href="https://support.illumina.com/sequencing/sequencing_software/bcl2fastq-conversion-software.html",
        info="can be used to both demultiplex data and convert BCL files"
             " to FASTQ file formats for downstream analysis."
    )

    # Gather data from all json files
    self.bcl2fastq_data = dict()
    for myfile in self.find_log_files('bcl2fastq'):
        self.parse_file_as_json(myfile)

    # Collect counts by lane and sample (+source_files)
    self.bcl2fastq_bylane = dict()
    self.bcl2fastq_bysample = dict()
    self.bcl2fastq_bysample_lane = dict()
    self.source_files = dict()
    self.split_data_by_lane_and_sample()

    # Filter to strip out ignored sample names
    self.bcl2fastq_bylane = self.ignore_samples(self.bcl2fastq_bylane)
    self.bcl2fastq_bysample = self.ignore_samples(self.bcl2fastq_bysample)
    self.bcl2fastq_bysample_lane = self.ignore_samples(self.bcl2fastq_bysample_lane)

    # Return with Warning if no files are found
    if len(self.bcl2fastq_bylane) == 0 and len(self.bcl2fastq_bysample) == 0:
        raise UserWarning

    # Print source files
    for s in self.source_files.keys():
        self.add_data_source(
            s_name=s,
            source=",".join(list(set(self.source_files[s]))),
            module='bcl2fastq',
            section='bcl2fastq-bysample'
        )

    # Add sample counts to general stats table
    self.add_general_stats()
    self.write_data_file(
        {str(k): self.bcl2fastq_bylane[k] for k in self.bcl2fastq_bylane.keys()},
        'multiqc_bcl2fastq_bylane'
    )
    self.write_data_file(self.bcl2fastq_bysample, 'multiqc_bcl2fastq_bysample')

    # Add section for summary stats per flow cell
    self.add_section(
        name = 'Lane Statistics',
        anchor = 'bcl2fastq-lanestats',
        description = 'Statistics about each lane for each flowcell.',
        plot = self.lane_stats_table()
    )

    # Add section for counts by lane
    cats = OrderedDict()
    cats["perfect"] = {'name': 'Perfect Index Reads'}
    cats["imperfect"] = {'name': 'Mismatched Index Reads'}
    cats["undetermined"] = {'name': 'Undetermined Reads'}
    self.add_section(
        name = 'Clusters by lane',
        anchor = 'bcl2fastq-bylane',
        description = 'Number of reads per lane (with number of perfect index reads).',
        helptext = """Perfect index reads are those that do not have a single mismatch.
            All samples of a lane are combined. Undetermined reads are treated as a
            third category.""",
        plot = bargraph.plot(
            self.get_bar_data_from_counts(self.bcl2fastq_bylane),
            cats,
            {
                'id': 'bcl2fastq_lane_counts',
                'title': 'bcl2fastq: Clusters by lane',
                'ylab': 'Number of clusters',
                'hide_zero_cats': False
            }
        )
    )

    # Add section for counts by sample
    # get cats for per-lane tab
    lcats = set()
    for s_name in self.bcl2fastq_bysample_lane:
        lcats.update(self.bcl2fastq_bysample_lane[s_name].keys())
    lcats = sorted(list(lcats))
    self.add_section(
        name = 'Clusters by sample',
        anchor = 'bcl2fastq-bysample',
        description = 'Number of reads per sample.',
        helptext = """Perfect index reads are those that do not have a single mismatch.
            Samples are aggregated across lanes. Undetermined reads are ignored in the
            index-mismatch plot and treated as a separate sample in the per-lane counts.""",
        plot = bargraph.plot(
            [
                self.get_bar_data_from_counts(self.bcl2fastq_bysample),
                self.bcl2fastq_bysample_lane
            ],
            [cats, lcats],
            {
                'id': 'bcl2fastq_sample_counts',
                'title': 'bcl2fastq: Clusters by sample',
                'hide_zero_cats': False,
                'ylab': 'Number of clusters',
                'data_labels': ['Index mismatches', 'Counts per lane']
            }
        )
    )

    # Add section with undetermined barcodes
    self.add_section(
        name = "Undetermined barcodes by lane",
        anchor = "undetermine_by_lane",
        description = "Count of the top twenty most abundant undetermined barcodes by lane.",
        plot = bargraph.plot(
            self.get_bar_data_from_undetermined(self.bcl2fastq_bylane),
            None,
            {
                'id': 'bcl2fastq_undetermined',
                'title': 'bcl2fastq: Undetermined barcodes by lane',
                'ylab': 'Count',
                'tt_percentages': False,
                'use_legend': True,
                'tt_suffix': 'reads'
            }
        )
    )
def bargraph(self, json, bps):
    # config dict for bar graph
    config = {
        'title': "HTStream: QWindowTrim Trimmed Basepairs Bargraph",
        'id': "htstream_qwindowtrimmer_bargraph",
        'ylab': "Samples",
        'cpswitch_c_active': False,
        'data_labels': [
            {'name': "Read 1"},
            {'name': "Read 2"},
            {'name': "Single End"}
        ]
    }

    # returns an alert if there are too many samples for a legible bargraph
    if len(json.keys()) > 150:
        return '<div class="alert alert-info"> Too many samples for bargraph. </div>'

    # returns an alert if no basepairs were trimmed from any sample
    if bps == 0:
        return '<div class="alert alert-info"> No basepairs were trimmed from any sample. </div>'

    r1_data = {}
    r2_data = {}
    se_data = {}
    for key in json:
        r1_data[key] = {"LT_R1": json[key]["Qt_Left_Trimmed_R1"],
                        "RT_R1": json[key]["Qt_Right_Trimmed_R1"]}
        r2_data[key] = {"LT_R2": json[key]["Qt_Left_Trimmed_R2"],
                        "RT_R2": json[key]["Qt_Right_Trimmed_R2"]}
        se_data[key] = {"LT_SE": json[key]["Qt_Left_Trimmed_SE"],
                        "RT_SE": json[key]["Qt_Right_Trimmed_SE"]}

    cats = [OrderedDict(), OrderedDict(), OrderedDict()]
    cats[0]["LT_R1"] = {'name': 'Left Trimmed'}
    cats[0]["RT_R1"] = {'name': 'Right Trimmed'}
    cats[1]["LT_R2"] = {'name': 'Left Trimmed'}
    cats[1]["RT_R2"] = {'name': 'Right Trimmed'}
    cats[2]["LT_SE"] = {'name': 'Left Trimmed'}
    cats[2]["RT_SE"] = {'name': 'Right Trimmed'}

    return bargraph.plot([r1_data, r2_data, se_data], cats, config)
def __init__(self): super(MultiqcModule, self).__init__( name='Supernova', anchor='supernova', href="https://www.10xgenomics.com/", info="is a de novo genome assembler 10X Genomics linked-reads.") # Headers for the supernova Table self.headers = OrderedDict() self.headers['Asm size'] = { 'description': 'assembly size (in megabases) ;only scaffolds >= 10 kb', 'modify': lambda x: x / 1000000.0, 'suffix': 'Mb', 'scale': 'YlGn' } self.headers['# Long scaffs'] = { 'description': 'number of scaffolds >= 10 kb', 'scale': 'YlGn', 'format': '{:,.0f}', } self.headers['Scaff N50'] = { 'description': 'N50 scaffold size (in kilobases)', 'modify': lambda x: x / 1000.0, 'suffix': 'Kb', 'scale': 'RdYlGn' } self.headers['Phase N50'] = { 'description': 'N50 phase block size (in kilobases)', 'modify': lambda x: x / 1000.0, 'suffix': 'Kb', 'scale': 'RdYlGn', 'hidden': True } self.headers['Contig N50'] = { 'description': 'N50 contig size (in kilobases)', 'modify': lambda x: x / 1000.0, 'suffix': 'Kb', 'scale': 'RdYlGn', 'hidden': True } self.headers['Edge N50'] = { 'description': 'N50 edge size (in kilobases)', 'modify': lambda x: x / 1000.0, 'suffix': 'Kb', 'scale': 'RdYlGn', 'hidden': True } self.headers['Mol size'] = { 'description': 'weighted mean molecule size (in kilobases); ideal 50-100', 'modify': lambda x: x / 1000.0, 'suffix': 'Kb', 'scale': 'BuGn' } self.headers['Read len'] = { 'description': 'mean read length (in bases) after trimming; ideal 140', 'suffix': 'b', 'scale': 'PuBu', 'format': '{:,.0f}', 'hidden': True } self.headers['# Reads'] = { 'description': 'number of reads (in millions); ideal 800M-1200M for human', 'modify': lambda x: x / 1000000.0, 'suffix': 'M', 'scale': 'PuBu', } self.headers['Coverage'] = { 'description': 'effective read coverage; ideal ~42 for nominal 56x cov', 'suffix': 'x', 'scale': 'PuBu' } self.headers['% Dup'] = { 'description': 'fraction of reads that are duplicates', 'suffix': '%', 'scale': 'OrRd', } self.headers['% R2 Q30'] = { 'description': 'fraction of Q30 bases in read 2; ideal 75-85%', 'suffix': '%', 'scale': 'OrRd', } self.headers['Insert size'] = { 'description': 'median insert size (in bases); ideal 0.35-0.40 Kb', 'suffix': 'b', 'scale': 'OrRd', 'format': '{:,.0f}', 'hidden': True } self.headers['% proper'] = { 'description': 'fraction of proper read pairs; ideal >= 75%', 'suffix': '%', 'scale': 'OrRd', 'hidden': True } self.headers['Het dist'] = { 'description': 'mean distance between heterozygous SNPs (in kilobases)', 'modify': lambda x: x / 1000.0, 'suffix': 'Kb', 'scale': 'BuGn', } self.headers['% missing BC'] = { 'description': 'fraction of reads that are not barcoded', 'suffix': '%', 'scale': 'BuGn', } self.headers['Barcode N50'] = { 'description': 'N50 reads per barcode (in bases)', 'suffix': 'b', 'scale': 'BuGn', 'format': '{:,.0f}', } self.headers['% Phased'] = { 'description': 'nonduplicate and phased reads; ideal 45-50%', 'suffix': '%', 'scale': 'BuGn', 'hidden': True } reports = OrderedDict() summaries = OrderedDict() molecules = OrderedDict() kmers = OrderedDict() root_summary = {} ### Parse the input log files # report.txt files for f in self.find_log_files('supernova/report'): log.debug("Found report in: {}".format(f['root'])) sid, data = self.parse_report(f['f']) s_name = self.clean_s_name(sid, f['root']) if s_name in reports.keys(): log.debug( "Duplicate sample name found! 
Overwriting: {}".format( s_name)) reports[s_name] = data self.add_data_source(f, s_name=s_name, section='supernova-table') # summary.json files for f in self.find_log_files('supernova/summary'): log.debug("Found summary.json in: {}".format(f['root'])) try: sid, data = self.parse_summary(f['f']) except ValueError: log.debug("Error parsing JSON file in {}".format(f['root'])) continue except RuntimeError: log.debug("Could not find sample_id in JSON file in {}".format( f['root'])) continue s_name = self.clean_s_name(sid, f['root']) if s_name in summaries.keys(): log.debug( "Duplicate sample name found! Overwriting: {}".format( s_name)) summaries[s_name] = data self.add_data_source(f, s_name=s_name, section='supernova-table') # The plot json files do not contain sample IDs, sadly. So we need to store it somewhere. root_summary[f['root']] = sid # histogram_molecules.json files for f in self.find_log_files('supernova/molecules'): log.debug("Found histogram_molecules.json in: {}".format( f['root'])) try: if f['root'] in root_summary.keys(): data = self.parse_histogram(f['f']) sid = root_summary[f['root']] s_name = self.clean_s_name(sid, f['root']) molecules[s_name] = data self.add_data_source(f, s_name=s_name, section='supernova-molecules') except RuntimeError: log.debug("Could not parse JSON file in {}".format(f['root'])) continue # histogram_kmer_count.json files for f in self.find_log_files('supernova/kmers'): log.debug("Found histogram_kmer_count.json in: {}".format( f['root'])) try: if f['root'] in root_summary.keys(): data = self.parse_histogram(f['f'], 400) sid = root_summary[f['root']] s_name = self.clean_s_name(sid, f['root']) kmers[s_name] = data self.add_data_source(f, s_name=s_name, section='supernova-kmers') except RuntimeError: log.debug("Could not parse JSON file in {}".format(f['root'])) continue # Data from summary.json supersedes data from report.txt for sample_id, sum_data in summaries.items(): if sample_id in reports.keys(): log.debug( "Found summary data for sample {} which supersedes report data" .format(sample_id)) reports[sample_id] = sum_data # Ignore cmd-line specified samples reports = self.ignore_samples(reports) molecules = self.ignore_samples(molecules) kmers = self.ignore_samples(kmers) if len(reports) == 0: log.debug("Could not find any reports in {}".format( config.analysis_dir)) raise UserWarning else: log.info("Found {} reports".format(len(reports.keys()))) ### Write the report self.write_data_file(reports, 'multiqc_supernova') config_table = {'id': 'supernova_table', 'namespace': 'supernova'} self.add_section ( name = 'Assembly statistics', anchor = 'supernova-table', description = 'Statistics gathered from the summary report(s) of Supernova. Note! ' \ 'There are more columns available but they are hidden by default.', helptext = 'As a bare minimum these numbers are generated from the file report.txt, ' \ 'found in the folder `sampleID/outs/`. 
If available the stats in the report ' \ 'file will be superseded by the higher precision numbers found in the file ' \ '`sampleID/outs/assembly/stats/summary.json`', plot = table.plot(reports, self.headers, config_table) ) # N50 barcharts n50_cats = [{ 'Scaff N50': { 'name': 'Scaffold N50', 'color': '#66c2a5' } }, { 'Contig N50': { 'name': 'Contig N50', 'color': '#fc8d62' } }, { 'Edge N50': { 'name': 'Edge N50', 'color': '#8da0cb' } }, { 'Phase N50': { 'name': 'Phase block N50', 'color': '#e78ac3' } }] config_n50 = { 'id': 'supernova_n50', 'title': 'Supernova N50 statistics', 'cpswitch': False, 'data_labels': ['Scaffold N50', 'Contig N50', 'Edge N50', 'Phase block N50'] } self.add_section ( name = 'N50 statistics', anchor = 'supernova-n50', description = 'Assembly N50 values - the shortest sequence length at 50% of the genome when sorted by size (see [wikipedia](https://en.wikipedia.org/wiki/N50,_L50,_and_related_statistics#N50)).', helptext = "Note that assembly size and N50 values are computed after removing scaffolds ≤ 10 kb and do not count `N`s: \n\n" \ "* **Scaffold N50** - N50 size of scaffolds in bases, \n" \ "* **Contig N50** - N50 size of contigs in bases, \n" \ "* **Edge N50** - N50 size of raw graph assembly edges in bases, \n" \ "* **Phase block N50** - N50 size of phase blocks in bases. \n\n" \ '[(source)](https://support.10xgenomics.com/de-novo-assembly/software/pipelines/latest/output/asm-stats)', plot = bargraph.plot([reports,reports,reports,reports], n50_cats, config_n50) ) # Conditional sections if len(molecules) > 0: # Remove the long tail max_x = self.trim_tail(molecules, 100000) # Add molecules plot config_molecules = { 'id': 'supernova_molecules', 'title': 'Supernova Molecule Lengths', 'xlab': 'Inferred molecule length (bp)', 'ylab': '# molecules', 'smooth_points': 300, 'smooth_points_sumcounts': True, 'xmax': max_x } self.add_section ( name = 'Molecule Lengths', anchor = 'supernova-molecules', description = 'Shows the inferred molecule lengths of the input 10X library.', helptext = 'Inferred in the `patch` step of the Supernova pipeline. It is worth ' \ 'keeping in mind that the mean molecule length from the report is a length-weighted mean. ' \ 'See the [source code](https://github.com/10XGenomics/supernova/search?q=lw_mean_mol_len&type=) ' \ 'for how this value is calculated.', plot = linegraph.plot(molecules, config_molecules) ) if len(kmers) > 0: # Remove the long tail max_x = self.trim_tail(kmers, 50) # Add kmers plot config_kmers = { 'id': 'supernova_kmers', 'title': 'Supernova Kmer Counts', 'xlab': 'Filtered kmer multiplicity', 'ylab': 'Counts', 'smooth_points_sumcounts': False, 'xmax': max_x } self.add_section ( name = 'K-mer counts', anchor = 'supernova-kmers', description = 'Shows the k-mer frequencies of the input data to Supernova (after filtering).', helptext = 'This data is generated from k-merizing the input read data, where the sequences are ' \ 'transformed into the set of all possible sub-sequences of a fixed length of `K` (Supernova uses `K=48`). ' \ 'The plot shows on the x-axis the multiplicity (i.e. how many times are they repeated) of these k-mers ' \ 'and the y-axis the number of k-mers at this level of multiplicity. ' \ 'A careful reading of this plot can give some insights into the levels of heterozygosity and repeats ' \ 'in the genome that was sequenced and indications of whether the sequencing experiment was successful.', plot = linegraph.plot(kmers, config_kmers) )
def bustools_section(self): """Add bargraphs showing the mean UMIs per barcode and percentages in whitelist""" # add the summary table tconfig = { "namespace": "Bustools", "id": "bustools_summary", "table_title": "Bustools Summary Table" } self.add_section( name="Summary table", anchor="bustools-inspect", description= "This is a table of the complete output of bustools inspect. Note that some columns are hidden by default (click <em>Configure Columns</em> to show).", plot=table.plot(self.bustools_data, self.headers, tconfig), ) # also make some nice barplots # barplot for mean umis per sample mean_umis = { sample: { "UMIs per barcode": values["meanUMIsPerBarcode"] } for sample, values in self.bustools_data.items() } self.add_section( name="Mean number of UMIs per barcode", anchor="bustools-umis", description= "Average number of UMIs (unique molecular identifiers) per barcode", helptext= "Each unique barcode represents a cell and each Unique Molecular Identifier (UMI) represents " "a unique transcript molecule. By counting the mean number of UMIs per barcode, you " "effectively calculate the average number of unique transcripts per cell.", plot=bargraph.plot( mean_umis, pconfig={ "id": "bus_umis", "title": "Bustools: Mean number of UMIs per barcode per sample", "cpswitch": False, "tt_percentages": False, "ylab": "Mean UMIs per barcode", }, ), ) # barplot for the percentage of reads and barcodes on the whitelist percentage_whitelist = { sample: { "Reads on whitelist": values["percentageReadsOnWhitelist"], "Barcodes on whitelist": values["percentageBarcodesOnWhitelist"], } for sample, values in self.bustools_data.items() } self.add_section( name="Percentage in whitelist", anchor="bustools-reads", description= "The whitelist is a list of unique barcodes used in your protocol, either provided or inferred from the data.", helptext= "Each unique barcode from the whitelist represents a cell. The percentage of " "reads with barcode / barcodes in the whitelist is a measure of the percentage of reads that could " "be assigned to a cell.", plot=bargraph.plot( percentage_whitelist, pconfig={ "id": "bus_reads", "title": "Bustools: Barcodes / reads with barcodes in the whitelist", "ymax": 100, "ymin": 0, "cpswitch": False, "tt_percentages": False, "ylab": "Percentage of barcodes / reads with barcodes in the whitelist", "stacking": None, "ylab_format": "{value}%", }, ), )
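# Illustrative sketch (made-up numbers, not module output) of how the dict
# comprehensions above reshape self.bustools_data into the one-category-per-key
# form that bargraph.plot() accepts:
bustools_data = {"sample_1": {"meanUMIsPerBarcode": 2.4,
                              "percentageReadsOnWhitelist": 94.1,
                              "percentageBarcodesOnWhitelist": 97.8}}
mean_umis = {s: {"UMIs per barcode": v["meanUMIsPerBarcode"]} for s, v in bustools_data.items()}
assert mean_umis == {"sample_1": {"UMIs per barcode": 2.4}}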
def hisat2_alignment_plot(self): """Make the HighCharts HTML to plot the alignment rates""" # Split the data into SE and PE sedata = {} pedata = {} for s_name, data in self.hisat2_data.items(): if "paired_total" in data: # Save half 'pairs' of mate counts m_keys = [ "unpaired_total", "unpaired_aligned_none", "unpaired_aligned_one", "unpaired_aligned_multi" ] for k in m_keys: if k in data: data[k] = float(data[k]) / 2.0 pedata[s_name] = data else: sedata[s_name] = data # Two plots, don't mix SE with PE if len(sedata) > 0: sekeys = OrderedDict() sekeys["unpaired_aligned_one"] = { "color": "#20568f", "name": "SE mapped uniquely" } sekeys["unpaired_aligned_multi"] = { "color": "#f7a35c", "name": "SE multimapped" } sekeys["unpaired_aligned_none"] = { "color": "#981919", "name": "SE not aligned" } pconfig = { "id": "hisat2_se_plot", "title": "HISAT2: SE Alignment Scores", "ylab": "# Reads", "cpswitch_counts_label": "Number of Reads", } self.add_section(plot=bargraph.plot(sedata, sekeys, pconfig)) if len(pedata) > 0: pekeys = OrderedDict() pekeys["paired_aligned_one"] = { "color": "#20568f", "name": "PE mapped uniquely" } pekeys["paired_aligned_discord_one"] = { "color": "#5c94ca", "name": "PE mapped discordantly uniquely" } pekeys["unpaired_aligned_one"] = { "color": "#95ceff", "name": "PE one mate mapped uniquely" } pekeys["paired_aligned_multi"] = { "color": "#f7a35c", "name": "PE multimapped" } pekeys["unpaired_aligned_multi"] = { "color": "#ffeb75", "name": "PE one mate multimapped" } pekeys["unpaired_aligned_none"] = { "color": "#981919", "name": "PE neither mate aligned" } pconfig = { "id": "hisat2_pe_plot", "title": "HISAT2: PE Alignment Scores", "ylab": "# Reads", "cpswitch_counts_label": "Number of Reads", } self.add_section( description= "<em>Please note that single mate alignment counts are halved to tally with pair counts properly.</em>", plot=bargraph.plot(pedata, pekeys, pconfig), )
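# Toy example (invented counts) of the mate-count halving performed above for PE
# runs: HISAT2 reports the unpaired_* fields per mate, so halving makes them
# comparable with the pair-level counts shown in the same plot.
data = {"paired_total": 1000, "unpaired_aligned_one": 300}
for k in ["unpaired_total", "unpaired_aligned_none", "unpaired_aligned_one", "unpaired_aligned_multi"]:
    if k in data:
        data[k] = float(data[k]) / 2.0
assert data["unpaired_aligned_one"] == 150.0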
def parse_reports(self): """ Find Picard MarkDuplicates reports and parse their data """ # Set up vars self.picard_dupMetrics_data = dict() # Go through logs and find Metrics for f in self.find_log_files('picard/markdups', filehandles=True): s_name = None for l in f['f']: # New log starting if 'markduplicates' in l.lower() and 'input' in l.lower(): s_name = None # Pull sample name from input fn_search = re.search(r"INPUT=(\[?[^\s]+\]?)", l, flags=re.IGNORECASE) if fn_search: s_name = os.path.basename(fn_search.group(1).strip('[]')) s_name = self.clean_s_name(s_name, f['root']) # When run with GATK this has a different format else: fn_search = re.search(r"--input (\[?[^\s]+\]?)", l, flags=re.IGNORECASE) if fn_search: s_name = os.path.basename( fn_search.group(1).strip('[]')) s_name = self.clean_s_name(s_name, f['root']) if s_name is not None: if 'DuplicationMetrics' in l and '## METRICS CLASS' in l: if s_name in self.picard_dupMetrics_data: log.debug( "Duplicate sample name found in {}! Overwriting: {}" .format(f['fn'], s_name)) self.add_data_source(f, s_name, section='DuplicationMetrics') self.picard_dupMetrics_data[s_name] = dict() keys = f['f'].readline().rstrip("\n").split("\t") vals = f['f'].readline().rstrip("\n").split("\t") for i, k in enumerate(keys): try: self.picard_dupMetrics_data[s_name][k] = float( vals[i]) except ValueError: self.picard_dupMetrics_data[s_name][k] = vals[i] # Check that this sample had some reads if self.picard_dupMetrics_data[s_name].get('READ_PAIRS_EXAMINED', 0) == 0 and \ self.picard_dupMetrics_data[s_name].get('UNPAIRED_READS_EXAMINED', 0) == 0: self.picard_dupMetrics_data.pop(s_name, None) log.warn( "Skipping MarkDuplicates sample '{}' as log contained no reads" .format(s_name)) s_name = None for s_name in list(self.picard_dupMetrics_data.keys()): if len(self.picard_dupMetrics_data[s_name]) == 0: self.picard_dupMetrics_data.pop(s_name, None) log.debug("Removing {} as no data parsed".format(s_name)) # Filter to strip out ignored sample names self.picard_dupMetrics_data = self.ignore_samples( self.picard_dupMetrics_data) if len(self.picard_dupMetrics_data) > 0: # Write parsed data to a file self.write_data_file(self.picard_dupMetrics_data, 'multiqc_picard_dups') # Add to general stats table self.general_stats_headers['PERCENT_DUPLICATION'] = { 'title': '% Dups', 'description': 'MarkDuplicates - Percent Duplication', 'max': 100, 'min': 0, 'suffix': '%', 'scale': 'OrRd', 'modify': lambda x: self.multiply_hundred(x) } for s_name in self.picard_dupMetrics_data: if s_name not in self.general_stats_data: self.general_stats_data[s_name] = dict() self.general_stats_data[s_name].update( self.picard_dupMetrics_data[s_name]) # Make the bar plot and add to the MarkDuplicates section # NOTE: I had a hard time getting these numbers to add up as expected. # If you think I've done something wrong, let me know! 
# Please add an issue here: https://github.com/ewels/MultiQC/issues for sn in self.picard_dupMetrics_data.keys(): self.picard_dupMetrics_data[sn][ 'UNPAIRED_READ_UNIQUE'] = self.picard_dupMetrics_data[sn][ 'UNPAIRED_READS_EXAMINED'] - self.picard_dupMetrics_data[ sn]['UNPAIRED_READ_DUPLICATES'] self.picard_dupMetrics_data[sn][ 'READ_PAIR_NOT_OPTICAL_DUPLICATES'] = self.picard_dupMetrics_data[ sn]['READ_PAIR_DUPLICATES'] - self.picard_dupMetrics_data[ sn]['READ_PAIR_OPTICAL_DUPLICATES'] self.picard_dupMetrics_data[sn][ 'READ_PAIR_UNIQUE'] = self.picard_dupMetrics_data[sn][ 'READ_PAIRS_EXAMINED'] - self.picard_dupMetrics_data[sn][ 'READ_PAIR_DUPLICATES'] keys = OrderedDict() keys_r = [ 'READ_PAIR_UNIQUE', 'UNPAIRED_READ_UNIQUE', 'READ_PAIR_NOT_OPTICAL_DUPLICATES', 'READ_PAIR_OPTICAL_DUPLICATES', 'UNPAIRED_READ_DUPLICATES', 'UNMAPPED_READS' ] for k in keys_r: keys[k] = {'name': k.replace('_', ' ').title()} # Config for the plot pconfig = { 'id': 'picard_deduplication', 'title': 'Picard: Deduplication Stats', 'ylab': '# Reads', 'cpswitch_counts_label': 'Number of Reads', 'cpswitch_c_active': False } self.add_section(name='Mark Duplicates', anchor='picard-markduplicates', plot=bargraph.plot(self.picard_dupMetrics_data, keys, pconfig)) # Return the number of detected samples to the parent module return len(self.picard_dupMetrics_data)
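# Worked example (invented numbers) of the derived metrics computed above from a
# parsed DuplicationMetrics row:
m = {"UNPAIRED_READS_EXAMINED": 100.0, "UNPAIRED_READ_DUPLICATES": 10.0,
     "READ_PAIRS_EXAMINED": 500.0, "READ_PAIR_DUPLICATES": 50.0,
     "READ_PAIR_OPTICAL_DUPLICATES": 5.0}
m["UNPAIRED_READ_UNIQUE"] = m["UNPAIRED_READS_EXAMINED"] - m["UNPAIRED_READ_DUPLICATES"]               # 90.0
m["READ_PAIR_NOT_OPTICAL_DUPLICATES"] = m["READ_PAIR_DUPLICATES"] - m["READ_PAIR_OPTICAL_DUPLICATES"]  # 45.0
m["READ_PAIR_UNIQUE"] = m["READ_PAIRS_EXAMINED"] - m["READ_PAIR_DUPLICATES"]                           # 450.0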
def parse_reports(self): """ Find Sentieon AlignmentSummaryMetrics reports and parse their data """ # Set up vars self.sentieon_alignment_metrics = dict() # Go through logs and find Metrics for f in self.find_log_files('sentieon/alignment_metrics', filehandles=True): parsed_data = dict() s_name = None keys = None for l in f['f']: # New log starting if s_name is None and 'AlignmentStat' in l: keys = None # Pull sample name from filename s_name = os.path.basename(f['s_name']) s_name = self.clean_s_name(s_name, f['root']) parsed_data[s_name] = dict() if s_name is not None: if 'AlignmentStat' in l and '#SentieonCommandLine' in l: keys = f['f'].readline().strip("\n").split("\t") elif keys: vals = l.strip("\n").split("\t") if len(vals) == len(keys): # Ignore the FIRST_OF_PAIR / SECOND_OF_PAIR data # to simplify things if vals[0] == 'PAIR' or vals[0] == 'UNPAIRED': for i, k in enumerate(keys): try: parsed_data[s_name][k] = float(vals[i]) except ValueError: parsed_data[s_name][k] = vals[i] else: s_name = None keys = None # Remove empty dictionaries for s_name in list(parsed_data.keys()): if len(parsed_data[s_name]) == 0: parsed_data.pop(s_name, None) # Manipulate sample names if multiple baits found for s_name in parsed_data.keys(): if s_name in self.sentieon_alignment_metrics: log.debug("Duplicate sample name found in {}! Overwriting: {}".format(f['fn'], s_name)) self.add_data_source(f, s_name, section='AlignmentSummaryMetrics') self.sentieon_alignment_metrics[s_name] = parsed_data[s_name] # Filter to strip out ignored sample names self.sentieon_alignment_metrics = self.ignore_samples( self.sentieon_alignment_metrics) if len(self.sentieon_alignment_metrics) > 0: # Write parsed data to a file self.write_data_file(self.sentieon_alignment_metrics, 'multiqc_sentieon_AlignmentSummaryMetrics') # Add to general stats table self.general_stats_headers['PCT_PF_READS_ALIGNED'] = { 'title': '% Aligned', 'description': 'Percent of aligned reads', 'max': 100, 'min': 0, 'suffix': '%', 'format': '{:,.0f}', 'scale': 'RdYlGn', 'modify': lambda x: self.multiply_hundred(x) } for s_name in self.sentieon_alignment_metrics: if s_name not in self.general_stats_data: self.general_stats_data[s_name] = dict() self.general_stats_data[s_name].update( self.sentieon_alignment_metrics[s_name]) # Make the bar plot of alignment read count pdata = dict() for s_name in self.sentieon_alignment_metrics.keys(): pdata[s_name] = dict() # Sentieon reports both reads for PE data. Divide it by two, as most people will expect the number of clusters (read pairs). if self.sentieon_alignment_metrics[s_name]['CATEGORY'] == 'PAIR': pdata[s_name]['total_reads'] = ( self.sentieon_alignment_metrics[s_name]['TOTAL_READS'] / 2) pdata[s_name]['aligned_reads'] = ( self.sentieon_alignment_metrics[s_name]['PF_READS_ALIGNED'] / 2) else: pdata[s_name]['total_reads'] = ( self.sentieon_alignment_metrics[s_name]['TOTAL_READS']) pdata[s_name]['aligned_reads'] = ( self.sentieon_alignment_metrics[s_name]['PF_READS_ALIGNED'] ) pdata[s_name]['unaligned_reads'] = ( pdata[s_name]['total_reads'] - pdata[s_name]['aligned_reads']) keys = OrderedDict() keys['aligned_reads'] = {'name': 'Aligned Reads'} keys['unaligned_reads'] = {'name': 'Unaligned Reads'} # Config for the plot pconfig = { 'id': 'sentieon_aligned_reads', 'title': 'Sentieon: Aligned Reads', 'ylab': '# Reads', 'cpswitch_counts_label': 'Number of Reads', } self.add_section( name='Alignment Summary', anchor='sentieon-alignmentsummary', description="Please note that Sentieon's read counts are divided by two for paired-end data.", plot=bargraph.plot(pdata, keys, pconfig)) # Return the number of detected samples to the parent module return len(self.sentieon_alignment_metrics)
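# Small worked example (hypothetical metrics) of the halving above: for
# CATEGORY == 'PAIR', Sentieon counts both mates, so dividing by two reports
# read pairs (clusters) instead of individual reads.
metrics = {"CATEGORY": "PAIR", "TOTAL_READS": 2000000.0, "PF_READS_ALIGNED": 1900000.0}
divisor = 2 if metrics["CATEGORY"] == "PAIR" else 1
total_reads = metrics["TOTAL_READS"] / divisor         # 1000000.0 pairs
aligned_reads = metrics["PF_READS_ALIGNED"] / divisor  # 950000.0 pairs
unaligned_reads = total_reads - aligned_reads          # 50000.0 pairs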
def __init__(self): super(MultiqcModule, self).__init__( name="Supernova", anchor="supernova", href="https://www.10xgenomics.com/", info="is a de novo genome assembler for 10X Genomics linked-reads.", ) # Headers for the supernova Table self.headers = OrderedDict() self.headers["Asm size"] = { "description": "assembly size (in megabases); only scaffolds >= 10 kb", "modify": lambda x: x / 1000000.0, "suffix": "Mb", "scale": "YlGn", } self.headers["% missing 10Kb"] = { "rid": "pct_missing_10Kb", "description": "% of base assembly missing from scaffolds >= 10 kb", "suffix": "%", "scale": "YlGn", } self.headers["# Long scaffs"] = { "rid": "num_long_scaffs", "description": "number of scaffolds >= 10 kb", "scale": "YlGn", "format": "{:,.0f}", "hidden": True, } self.headers["Scaff N50"] = { "description": "N50 scaffold size (in kilobases)", "modify": lambda x: x / 1000.0, "suffix": "Kb", "scale": "RdYlGn", } self.headers["Phase N50"] = { "description": "N50 phase block size (in kilobases)", "modify": lambda x: x / 1000.0, "suffix": "Kb", "scale": "RdYlGn", "hidden": True, } self.headers["Contig N50"] = { "description": "N50 contig size (in kilobases)", "modify": lambda x: x / 1000.0, "suffix": "Kb", "scale": "RdYlGn", "hidden": True, } self.headers["Edge N50"] = { "description": "N50 edge size (in kilobases)", "modify": lambda x: x / 1000.0, "suffix": "Kb", "scale": "RdYlGn", "hidden": True, } self.headers["Mol size"] = { "description": "weighted mean molecule size (in kilobases); ideal 50-100", "modify": lambda x: x / 1000.0, "suffix": "Kb", "scale": "BuGn", } self.headers["Read len"] = { "description": "mean read length (in bases) after trimming; ideal 140", "suffix": "b", "scale": "PuBu", "format": "{:,.0f}", "hidden": True, } self.headers["# Reads"] = { "rid": "num_reads", "description": "number of reads (in millions); ideal 800M-1200M for human", "modify": lambda x: x / 1000000.0, "suffix": "M", "scale": "PuBu", } self.headers["Raw coverage"] = { "description": "raw coverage; ideal ~56", "suffix": "x", "scale": "PuBu", "hidden": True, } self.headers["Coverage"] = { "description": "effective read coverage; ideal ~42 for nominal 56x cov", "suffix": "x", "scale": "PuBu", } self.headers["% Dup"] = { "rid": "pct_Dup", "description": "fraction of reads that are duplicates", "suffix": "%", "scale": "OrRd", } self.headers["% R2 Q30"] = { "rid": "pct_R2_Q30", "description": "fraction of Q30 bases in read 2; ideal 75-85%", "suffix": "%", "scale": "OrRd", } self.headers["Insert size"] = { "description": "median insert size (in bases); ideal 0.35-0.40 Kb", "suffix": "b", "scale": "OrRd", "format": "{:,.0f}", "hidden": True, } self.headers["% proper"] = { "rid": "pct_proper", "description": "fraction of proper read pairs; ideal >= 75%", "suffix": "%", "scale": "OrRd", "hidden": True, } self.headers["BC usage"] = { "description": "fraction of barcodes used; between 0 and 1", "scale": "OrRd", "hidden": True, } self.headers["Est size"] = { "description": "estimated genome size", "modify": lambda x: x / 1000000.0, "suffix": "Mb", "scale": "YlGn", "hidden": True, } self.headers["% repeats"] = { "rid": "pct_repeats", "description": "Estimated repetitive fraction (of genome)", "scale": "YlGn", "suffix": "%", "hidden": True, } self.headers["% AT"] = { "rid": "pct_AT", "description": "high AT index (of genome)", "scale": "YlGn", "suffix": "%", "hidden": True, } self.headers["Het dist"] = { "description": "mean distance between heterozygous SNPs (in kilobases)", "modify": lambda x: x / 1000.0, "suffix": "Kb", "scale":
"YlGn", "format": "{:,.0f}", "hidden": True, } self.headers["p10"] = { "description": "molecule count extending 10 kb on both sides", "scale": "BuGn", "hidden": True, } self.headers["% missing BC"] = { "rid": "pct_missing_BC", "description": "fraction of reads that are not barcoded", "suffix": "%", "scale": "BuGn", } self.headers["Barcode N50"] = { "description": "N50 reads per barcode (in bases)", "suffix": "b", "scale": "BuGn", "format": "{:,.0f}", } self.headers["% Phased"] = { "rid": "pct_Phased", "description": "nonduplicate and phased reads; ideal 45-50%", "suffix": "%", "scale": "BuGn", "hidden": True, } reports = OrderedDict() summaries = OrderedDict() molecules = OrderedDict() kmers = OrderedDict() root_summary = {} ### Parse the input log files # report.txt files for f in self.find_log_files("supernova/report"): log.debug("Found report in: {}".format(f["root"])) sid, data = self.parse_report(f["f"]) s_name = self.clean_s_name(sid, f) if s_name in reports.keys(): log.debug( "Duplicate sample name found! Overwriting: {}".format( s_name)) reports[s_name] = data self.add_data_source(f, s_name=s_name, section="supernova-table") # summary.json files for f in self.find_log_files("supernova/summary"): log.debug("Found summary.json in: {}".format(f["root"])) try: sid, data = self.parse_summary(f["f"]) except ValueError: log.debug("Error parsing JSON file in {}".format(f["root"])) continue except RuntimeError: log.debug("Could not find sample_id in JSON file in {}".format( f["root"])) continue s_name = self.clean_s_name(sid, f) if s_name in summaries.keys(): log.debug( "Duplicate sample name found! Overwriting: {}".format( s_name)) summaries[s_name] = data self.add_data_source(f, s_name=s_name, section="supernova-table") # The plot json files do not contain sample IDs, sadly. So we need to store it somewhere. 
root_summary[f["root"]] = sid # histogram_molecules.json files for f in self.find_log_files("supernova/molecules"): log.debug("Found histogram_molecules.json in: {}".format( f["root"])) try: if f["root"] in root_summary.keys(): data = self.parse_histogram(f["f"]) sid = root_summary[f["root"]] s_name = self.clean_s_name(sid, f) molecules[s_name] = data self.add_data_source(f, s_name=s_name, section="supernova-molecules") except RuntimeError: log.debug("Could not parse JSON file in {}".format(f["root"])) continue # histogram_kmer_count.json files for f in self.find_log_files("supernova/kmers"): log.debug("Found histogram_kmer_count.json in: {}".format( f["root"])) try: if f["root"] in root_summary.keys(): data = self.parse_histogram(f["f"], 400) sid = root_summary[f["root"]] s_name = self.clean_s_name(sid, f) kmers[s_name] = data self.add_data_source(f, s_name=s_name, section="supernova-kmers") except RuntimeError: log.debug("Could not parse JSON file in {}".format(f["root"])) continue # Data from summary.json supersedes data from report.txt for sample_id, sum_data in summaries.items(): if sample_id in reports.keys(): log.debug( "Found summary data for sample {} which supersedes report data" .format(sample_id)) reports[sample_id] = sum_data # Ignore cmd-line specified samples reports = self.ignore_samples(reports) molecules = self.ignore_samples(molecules) kmers = self.ignore_samples(kmers) if len(reports) == 0: raise UserWarning else: log.info("Found {} reports".format(len(reports.keys()))) ### Write the report self.write_data_file(reports, "multiqc_supernova") config_table = {"id": "supernova_table", "namespace": "supernova"} self.add_section( name="Assembly statistics", anchor="supernova-table", description= "Statistics gathered from the summary report(s) of Supernova. Note! " "There are more columns available but they are hidden by default.", helptext= "As a bare minimum these numbers are generated from the file report.txt, " "found in the folder `sampleID/outs/`. If available the stats in the report " "file will be superseded by the higher precision numbers found in the file " "`sampleID/outs/assembly/stats/summary.json`", plot=table.plot(reports, self.headers, config_table), ) # N50 barcharts n50_cats = [ { "Scaff N50": { "name": "Scaffold N50", "color": "#66c2a5" } }, { "Contig N50": { "name": "Contig N50", "color": "#fc8d62" } }, { "Edge N50": { "name": "Edge N50", "color": "#8da0cb" } }, { "Phase N50": { "name": "Phase block N50", "color": "#e78ac3" } }, ] config_n50 = { "id": "supernova_n50", "title": "Supernova: N50 statistics", "ylab": "Scaffold N50", "cpswitch": False, "data_labels": ["Scaffold N50", "Contig N50", "Edge N50", "Phase block N50"], } self.add_section( name="N50 statistics", anchor="supernova-n50", description= "Assembly N50 values - the shortest sequence length at 50% of the genome when sorted by size (see [wikipedia](https://en.wikipedia.org/wiki/N50,_L50,_and_related_statistics#N50)).", helptext= "Note that assembly size and N50 values are computed after removing scaffolds ≤ 10 kb and do not count `N`s: \n\n" "* **Scaffold N50** - N50 size of scaffolds in bases, \n" "* **Contig N50** - N50 size of contigs in bases, \n" "* **Edge N50** - N50 size of raw graph assembly edges in bases, \n" "* **Phase block N50** - N50 size of phase blocks in bases. 
\n\n" "[(source)](https://support.10xgenomics.com/de-novo-assembly/software/pipelines/latest/output/asm-stats)", plot=bargraph.plot([reports, reports, reports, reports], n50_cats, config_n50), ) # Conditional sections if len(molecules) > 0: # Remove the long tail, or fail if this is a legacy empty json file try: max_x = self.trim_tail(molecules, 100000) except IndexError: log.debug( "The histogram file is empty. Skipping molecule length section" ) return # Add molecules plot config_molecules = { "id": "supernova_molecules", "title": "Supernova: Molecule Lengths", "xlab": "Inferred molecule length (bp)", "ylab": "# molecules", "smooth_points": 300, "smooth_points_sumcounts": True, "xmax": max_x, } self.add_section( name="Molecule Lengths", anchor="supernova-molecules", description= "Shows the inferred molecule lengths of the input 10X library.", helptext= "Inferred in the `patch` step of the Supernova pipeline. It is worth " "keeping in mind that the mean molecule length from the report is a length-weighted mean. " "See the [source code](https://github.com/10XGenomics/supernova/search?q=lw_mean_mol_len&type=) " "for how this value is calculated.", plot=linegraph.plot(molecules, config_molecules), ) if len(kmers) > 0: # Remove the long tail, or fail if this is a legacy empty json file try: max_x = self.trim_tail(kmers, 50) except IndexError: log.debug( "The histogram file is empty. Skipping kmers section") return # Add kmers plot config_kmers = { "id": "supernova_kmers", "title": "Supernova: Kmer Counts", "xlab": "Filtered kmer multiplicity", "ylab": "Counts", "smooth_points_sumcounts": False, "xmax": max_x, } self.add_section( name="K-mer counts", anchor="supernova-kmers", description= "Shows the k-mer frequencies of the input data to Supernova (after filtering).", helptext= "This data is generated from k-merizing the input read data, where the sequences are " "transformed into the set of all possible sub-sequences of a fixed length of `K` (Supernova uses `K=48`). " "The plot shows on the x-axis the multiplicity (i.e. how many times are they repeated) of these k-mers " "and the y-axis the number of k-mers at this level of multiplicity. " "A careful reading of this plot can give some insights into the levels of heterozygosity and repeats " "in the genome that was sequenced and indications of whether the sequencing experiment was successful.", plot=linegraph.plot(kmers, config_kmers), )
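# self.trim_tail() is not shown in this file; the following is only a plausible
# sketch (an assumption, not the module's actual implementation) of a helper that
# picks an x-axis cut-off so the long right tail of {x: count} histograms is
# hidden, and that raises IndexError on empty input as the try/except above expects:
def trim_tail_sketch(hists, min_count):
    xs = [x for hist in hists.values() for x, c in hist.items() if c >= min_count]
    if not xs:
        raise IndexError("empty histogram")
    return max(xs)

assert trim_tail_sketch({"s1": {1: 500, 2: 120, 3: 2}}, 100) == 2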
def parse_bcftools_stats(self): """ Find bcftools stats logs and parse their data Bcftools stats reports contain 'sets' of data, which can have multiple vcf files each (but usually don't). Here, we treat each 'set' as a MultiQC sample, taking the first input filename for each set as the name. """ collapse_complementary = getattr(config, 'bcftools', {}).get('collapse_complementary_changes', False) if collapse_complementary: types = ['A>C', 'A>G', 'A>T', 'C>A', 'C>G', 'C>T'] else: types = ['A>C', 'A>G', 'A>T', 'C>A', 'C>G', 'C>T', 'G>A', 'G>C', 'G>T', 'T>A', 'T>C', 'T>G'] self.bcftools_stats = dict() self.bcftools_stats_indels = dict() self.bcftools_stats_vqc_snp = dict() self.bcftools_stats_vqc_transi = dict() self.bcftools_stats_vqc_transv = dict() self.bcftools_stats_vqc_indels = dict() depth_data = dict() for f in self.find_log_files('bcftools/stats'): s_names = list() for line in f['f'].splitlines(): s = line.split("\t") # Get the sample names - one per 'set' if s[0] == "ID": s_name = self.clean_s_name(s[2], f['root']) s_names.append(s_name) if s_name in self.bcftools_stats: log.debug("Duplicate sample name found! Overwriting: {}".format(s_name)) self.add_data_source(f, s_name, section='stats') self.bcftools_stats[s_name] = dict() self.bcftools_stats_indels[s_name] = dict() self.bcftools_stats_vqc_snp[s_name] = dict() self.bcftools_stats_vqc_transi[s_name] = dict() self.bcftools_stats_vqc_transv[s_name] = dict() self.bcftools_stats_vqc_indels[s_name] = dict() depth_data[s_name] = OrderedDict() self.bcftools_stats_indels[s_name][0] = None # Avoid joining line across missing 0 # Parse key stats if s[0] == "SN" and len(s_names) > 0: s_name = s_names[int(s[1])] field = s[2].strip()[:-1] field = field.replace(' ', '_') value = float(s[3].strip()) self.bcftools_stats[s_name][field] = value # Parse transitions/transversions stats (loop variable named `key`, not `f`, so the log-file dict is not shadowed) if s[0] == "TSTV" and len(s_names) > 0: s_name = s_names[int(s[1])] fields = ['ts', 'tv', 'tstv', 'ts_1st_ALT', 'tv_1st_ALT', 'tstv_1st_ALT'] for i, key in enumerate(fields): value = float(s[i+2].strip()) self.bcftools_stats[s_name][key] = value # Parse substitution types if s[0] == "ST" and len(s_names) > 0: s_name = s_names[int(s[1])] rc = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'} change = s[2].strip() if change not in types: change = '>'.join(rc[n] for n in change.split('>')) field = 'substitution_type_{}'.format(change) value = float(s[3].strip()) if field not in self.bcftools_stats[s_name]: self.bcftools_stats[s_name][field] = 0 self.bcftools_stats[s_name][field] += value # Indel length distributions if s[0] == "IDD" and len(s_names) > 0: s_name = s_names[int(s[1])] length = float(s[2].strip()) count = float(s[3].strip()) self.bcftools_stats_indels[s_name][length] = count # Per-sample counts if s[0] == "PSC" and len(s_names) > 0: s_name = s_names[int(s[1])] fields = ['variations_hom', 'variations_het'] for i, key in enumerate(fields): self.bcftools_stats[s_name][key] = int(s[i + 4].strip()) # Depth plots if s[0] == "DP" and len(s_names) > 0: s_name = s_names[int(s[1])] bin_name = s[2].strip() percent_sites = float(s[-1].strip()) depth_data[s_name][bin_name] = percent_sites # Variant Qualities if s[0] == "QUAL" and len(s_names) > 0: s_name = s_names[int(s[1])] quality = float(s[2].strip()) self.bcftools_stats_vqc_snp[s_name][quality] = float(s[3].strip()) self.bcftools_stats_vqc_transi[s_name][quality] = float(s[4].strip()) self.bcftools_stats_vqc_transv[s_name][quality] = float(s[5].strip()) self.bcftools_stats_vqc_indels[s_name][quality] = float(s[6].strip()) # Filter to
strip out ignored sample names self.bcftools_stats = self.ignore_samples(self.bcftools_stats) if len(self.bcftools_stats) > 0: # Write parsed report data to a file self.write_data_file(self.bcftools_stats, 'multiqc_bcftools_stats') # Stats Table stats_headers = self.bcftools_stats_genstats_headers() if getattr(config, 'bcftools', {}).get('write_general_stats', True): self.general_stats_addcols(self.bcftools_stats, stats_headers, 'Bcftools Stats') if getattr(config, 'bcftools', {}).get('write_separate_table', False): self.add_section( name='Bcftools Stats', anchor='bcftools-stats', plot=table.plot(self.bcftools_stats, stats_headers)) # Make bargraph plot of substitution types keys = OrderedDict() for t in types: keys['substitution_type_{}'.format(t)] = {'name': t} pconfig = { 'id': 'bcftools-stats-subtypes', 'title': 'Bcftools Stats: Substitutions', 'ylab': '# Substitutions', 'cpswitch_counts_label': 'Number of Substitutions' } self.add_section ( name = 'Variant Substitution Types', anchor = 'bcftools-stats', plot = bargraph.plot(self.bcftools_stats, keys, pconfig) ) # Make histograms of variant quality if len(self.bcftools_stats_vqc_snp) > 0: pconfig = { 'id': 'bcftools_stats_vqc', 'title': 'Bcftools Stats: Variant Quality Count', 'ylab': 'Count', 'xlab': 'Quality', 'xDecimals': False, 'ymin': 0, 'smooth_points': 600, # 'tt_label': '<b>{point.x} bp trimmed</b>: {point.y:.0f}', 'data_labels': [ {'name': 'Count SNP', 'ylab': 'Quality'}, {'name': 'Count Transitions', 'ylab': 'Quality'}, {'name': 'Count Transversions', 'ylab': 'Quality'}, {'name': 'Count Indels', 'ylab': 'Quality'} ] } self.add_section ( name = 'Variant Quality', anchor = 'bcftools-stats_variant_quality_plot', plot = linegraph.plot ( [self.bcftools_stats_vqc_snp, self.bcftools_stats_vqc_transi, self.bcftools_stats_vqc_transv, self.bcftools_stats_vqc_indels], pconfig) ) # Make line graph of indel lengths if len(self.bcftools_stats_indels) > 0: pconfig = { 'id': 'bcftools_stats_indel-lengths', 'title': 'Bcftools Stats: Indel Distribution', 'ylab': 'Count', 'xlab': 'InDel Length (bp)', 'xDecimals': False, 'ymin': 0, } self.add_section ( name = 'Indel Distribution', anchor = 'bcftools-stats_indel_plot', plot = linegraph.plot(self.bcftools_stats_indels, pconfig) ) # Make line graph of variants per depth if len(depth_data) > 0: pconfig = { 'id': 'bcftools_stats_depth', 'title': 'Bcftools Stats: Variant depths', 'ylab': 'Fraction of sites (%)', 'xlab': 'Variant depth', 'ymin': 0, 'ymax': 100, 'categories': True } self.add_section ( name = 'Variant depths', anchor = 'bcftools-stats_depth_plot', description = 'Read depth support distribution for called variants', plot = linegraph.plot(depth_data, pconfig) ) # Return the number of logs that were found return len(self.bcftools_stats)
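# Worked example of the substitution-type collapsing in the parser above: with
# collapse_complementary_changes enabled, a change such as 'G>A' is
# reverse-complemented onto its equivalent 'C>T' so only six categories remain.
rc = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
types = ['A>C', 'A>G', 'A>T', 'C>A', 'C>G', 'C>T']
change = 'G>A'
if change not in types:
    change = '>'.join(rc[n] for n in change.split('>'))
assert change == 'C>T'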
def overrepresented_sequences(self): """Sum the percentages of overrepresented sequences and display them in a bar plot""" data = dict() for s_name in self.fastqc_data: data[s_name] = dict() try: max_pcnt = max([ float(d['percentage']) for d in self.fastqc_data[s_name] ['overrepresented_sequences'] ]) total_pcnt = sum([ float(d['percentage']) for d in self.fastqc_data[s_name] ['overrepresented_sequences'] ]) data[s_name]['total_overrepresented'] = total_pcnt data[s_name]['top_overrepresented'] = max_pcnt data[s_name][ 'remaining_overrepresented'] = total_pcnt - max_pcnt except KeyError: if self.fastqc_data[s_name]['statuses'][ 'overrepresented_sequences'] == 'pass': data[s_name]['total_overrepresented'] = 0 data[s_name]['top_overrepresented'] = 0 data[s_name]['remaining_overrepresented'] = 0 else: log.debug("Couldn't find data for {}, invalid Key".format( s_name)) cats = OrderedDict() cats['top_overrepresented'] = {'name': 'Top over-represented sequence'} cats['remaining_overrepresented'] = { 'name': 'Sum of remaining over-represented sequences' } # Config for the plot pconfig = { 'id': 'fastqc_overrepresented_sequences_plot', 'title': 'FastQC: Overrepresented sequences', 'ymin': 0, 'yCeiling': 100, 'yMinRange': 20, 'tt_decimals': 2, 'tt_suffix': '%', 'tt_percentages': False, 'ylab_format': '{value}%', 'cpswitch': False, 'ylab': 'Percentage of Total Sequences' } # Check if any samples have more than 1% overrepresented sequences, else don't make plot. if max([x['total_overrepresented'] for x in data.values()]) < 1: plot_html = '<div class="alert alert-info">{} samples had less than 1% of reads made up of overrepresented sequences</div>'.format( len(data)) else: plot_html = bargraph.plot(data, cats, pconfig) self.add_section( name='Overrepresented sequences', anchor='fastqc_overrepresented_sequences', description= 'The total amount of overrepresented sequences found in each library. ' + 'See the <a href="http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/3%20Analysis%20Modules/9%20Overrepresented%20Sequences.html" target="_blank">FastQC help for further information</a>.', plot=plot_html)
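# Toy example (invented percentages) of the top/remaining split computed above:
overrepresented = [{'percentage': '3.1'}, {'percentage': '1.2'}, {'percentage': '0.4'}]
pcts = [float(d['percentage']) for d in overrepresented]
top = max(pcts)              # 3.1, plotted as the top over-represented sequence
remaining = sum(pcts) - top  # ~1.6, plotted as the stacked remainder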
def parse_samtools_idxstats(self): """ Find Samtools idxstats logs and parse their data """ self.samtools_idxstats = dict() for f in self.find_log_files('samtools/idxstats'): parsed_data = parse_single_report(f['f']) if len(parsed_data) > 0: if f['s_name'] in self.samtools_idxstats: log.debug( "Duplicate sample name found! Overwriting: {}".format( f['s_name'])) self.add_data_source(f, section='idxstats') self.samtools_idxstats[f['s_name']] = parsed_data # Filter to strip out ignored sample names self.samtools_idxstats = self.ignore_samples(self.samtools_idxstats) if len(self.samtools_idxstats) > 0: # Write parsed report data to a file (restructure first) self.write_data_file(self.samtools_idxstats, 'multiqc_samtools_idxstats') # Prep the data for the plots keys = list() pdata = dict() pdata_norm = dict() xy_counts = dict() # Count the total mapped reads for every chromosome chrs_mapped = defaultdict(lambda: 0) sample_mapped = defaultdict(lambda: 0) total_mapped = 0 # Cutoff, can be customised in config cutoff = float( getattr(config, 'samtools_idxstats_fraction_cutoff', 0.001)) if cutoff != 0.001: log.info('Setting idxstats cutoff to: {}%'.format(cutoff * 100.0)) for s_name in self.samtools_idxstats: for chrom in self.samtools_idxstats[s_name]: chrs_mapped[chrom] += self.samtools_idxstats[s_name][chrom] sample_mapped[s_name] += self.samtools_idxstats[s_name][ chrom] total_mapped += self.samtools_idxstats[s_name][chrom] req_reads = float(total_mapped) * cutoff chr_always = getattr(config, 'samtools_idxstats_always', []) if len(chr_always) > 0: log.info('Trying to include these chromosomes in idxstats: {}'. format(', '.join(chr_always))) chr_ignore = getattr(config, 'samtools_idxstats_ignore', []) if len(chr_ignore) > 0: log.info( 'Excluding these chromosomes from idxstats: {}'.format( ', '.join(chr_ignore))) xchr = getattr(config, 'samtools_idxstats_xchr', False) if xchr: log.info('Using "{}" as X chromosome name'.format(xchr)) ychr = getattr(config, 'samtools_idxstats_ychr', False) if ychr: log.info('Using "{}" as Y chromosome name'.format(ychr)) # Go through again and collect all of the keys that have enough counts # Also get the X/Y counts if we find them for s_name in self.samtools_idxstats: x_count = False y_count = False for chrom in self.samtools_idxstats[s_name]: if float(chrs_mapped[chrom] ) > req_reads or chrom in chr_always: if chrom not in chr_ignore and chrom not in keys: keys.append(chrom) # Collect X and Y counts if we have them mapped = self.samtools_idxstats[s_name][chrom] if xchr is not False: if str(xchr) == str(chrom): x_count = mapped else: if chrom.lower() == 'x' or chrom.lower() == 'chrx': x_count = mapped if ychr is not False: if str(ychr) == str(chrom): y_count = mapped else: if chrom.lower() == 'y' or chrom.lower() == 'chry': y_count = mapped # Only save these counts if we have both x and y if x_count and y_count: xy_counts[s_name] = {'x': x_count, 'y': y_count} # Ok, one last time. 
We have the chromosomes that we want to plot, # now collect the counts for s_name in self.samtools_idxstats: pdata[s_name] = OrderedDict() pdata_norm[s_name] = OrderedDict() for k in keys: try: pdata[s_name][k] = self.samtools_idxstats[s_name][k] pdata_norm[s_name][k] = float( self.samtools_idxstats[s_name] [k]) / sample_mapped[s_name] except (KeyError, ZeroDivisionError): pdata[s_name][k] = 0 pdata_norm[s_name][k] = 0 # X/Y ratio plot if len(xy_counts) > 0: xy_keys = OrderedDict() xy_keys['x'] = {'name': xchr if xchr else 'Chromosome X'} xy_keys['y'] = {'name': ychr if ychr else 'Chromosome Y'} pconfig = { 'id': 'samtools-idxstats-xy-plot', 'title': 'Samtools idxstats: chrXY mapped reads', 'ylab': 'Percent of X+Y Reads', 'cpswitch_counts_label': 'Number of Reads', 'cpswitch_percent_label': 'Percent of X+Y Reads', 'cpswitch_c_active': False } self.add_section(name='XY counts', anchor='samtools-idxstats-xy-counts', plot=bargraph.plot(xy_counts, xy_keys, pconfig)) # Mapped reads per chr line plot pconfig = { 'id': 'samtools-idxstats-mapped-reads-plot', 'title': 'Samtools idxstats: Mapped reads per contig', 'ylab': '# mapped reads', 'xlab': 'Chromosome Name', 'categories': True, 'tt_label': '<strong>{point.category}:</strong> {point.y:.2f}', 'data_labels': [{ 'name': 'Normalised Counts', 'ylab': 'Fraction of total count' }, { 'name': 'Counts', 'ylab': '# mapped reads' }] } self.add_section( name='Mapped reads per contig', anchor='samtools-idxstats', description= 'The <code>samtools idxstats</code> tool counts the number of mapped reads per chromosome / contig. ' + 'Chromosomes with < {}% of the total aligned reads are omitted from this plot.' .format(cutoff * 100), plot=linegraph.plot([pdata_norm, pdata], pconfig)) # Return the number of logs that were found return len(self.samtools_idxstats)
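# Illustration (made-up counts) of the contig cut-off logic above: a chromosome
# is kept when its reads summed across all samples exceed cutoff * total mapped.
chrs_mapped = {"chr1": 9000, "chrM": 5}
total_mapped = sum(chrs_mapped.values())
cutoff = 0.001
keys = [c for c, n in chrs_mapped.items() if n > total_mapped * cutoff]
assert keys == ["chr1"]  # chrM falls below the 0.1% default and is dropped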
def __init__(self, c_id, mod): modname = mod['config'].get('section_name', c_id.replace('_', ' ').title()) if modname == '' or modname is None: modname = 'Custom Content' # Initialise the parent object super(MultiqcModule, self).__init__(name=modname, anchor=mod['config'].get('section_anchor', c_id), href=mod['config'].get('section_href'), info=mod['config'].get('description')) pconfig = mod['config'].get('pconfig', {}) if pconfig.get('title') is None: pconfig['title'] = modname # Table if mod['config'].get('plot_type') == 'table': pconfig['sortRows'] = pconfig.get('sortRows', False) headers = mod['config'].get('headers') self.add_section(plot=table.plot(mod['data'], headers, pconfig)) self.write_data_file( mod['data'], "multiqc_{}".format(modname.lower().replace(' ', '_'))) # Bar plot elif mod['config'].get('plot_type') == 'bargraph': self.add_section(plot=bargraph.plot( mod['data'], mod['config'].get('categories'), pconfig)) # Line plot elif mod['config'].get('plot_type') == 'linegraph': self.add_section(plot=linegraph.plot(mod['data'], pconfig)) # Scatter plot elif mod['config'].get('plot_type') == 'scatter': self.add_section(plot=scatter.plot(mod['data'], pconfig)) # Heatmap elif mod['config'].get('plot_type') == 'heatmap': self.add_section(plot=heatmap.plot(mod['data'], mod['config'].get( 'xcats'), mod['config'].get('ycats'), pconfig)) # Beeswarm plot elif mod['config'].get('plot_type') == 'beeswarm': self.add_section(plot=beeswarm.plot(mod['data'], pconfig)) # Raw HTML elif mod['config'].get('plot_type') == 'html': self.add_section(content=mod['data']) # Raw image file as html elif mod['config'].get('plot_type') == 'image': self.add_section(content=mod['data']) # Not supplied elif mod['config'].get('plot_type') == None: log.warning("Plot type not found for content ID '{}'".format(c_id)) # Not recognised else: log.warning( "Error - custom content plot type '{}' not recognised for content ID {}" .format(mod['config'].get('plot_type'), c_id))
def parse_reports(self): """ Find Picard TargetedPcrMetrics reports and parse their data """ # Set up vars self.picard_pcrmetrics_data = dict() self.picard_pcrmetrics_samplestats = dict() # Go through logs and find Metrics for f in self.find_log_files('picard/pcr_metrics', filehandles=True): s_name = None for l in f['f']: # New log starting if 'TargetedPcrMetrics' in l and 'INPUT' in l: s_name = None # Pull sample name from input fn_search = re.search(r"INPUT(?:=|\s+)(\[?[^\s]+\]?)", l, flags=re.IGNORECASE) if fn_search: s_name = os.path.basename(fn_search.group(1).strip('[]')) s_name = self.clean_s_name(s_name, f['root']) if s_name is not None: if 'TargetedPcrMetrics' in l and '## METRICS CLASS' in l: keys = f['f'].readline().strip("\n").split("\t") vals = f['f'].readline().strip("\n").split("\t") if len(vals) == len(keys): if s_name in self.picard_pcrmetrics_data: log.debug( "Duplicate sample name found in {}! Overwriting: {}" .format(f['fn'], s_name)) self.add_data_source(f, s_name, section='TargetedPcrMetrics') self.picard_pcrmetrics_data[s_name] = dict() for i, k in enumerate(keys): try: # Multiply percentages by 100 if k.startswith('PCT_'): vals[i] = float(vals[i]) * 100.0 self.picard_pcrmetrics_data[s_name][k] = float( vals[i]) except ValueError: self.picard_pcrmetrics_data[s_name][k] = vals[ i] # Filter to strip out ignored sample names self.picard_pcrmetrics_data = self.ignore_samples( self.picard_pcrmetrics_data) if len(self.picard_pcrmetrics_data) > 0: # Write parsed data to a file self.write_data_file(self.picard_pcrmetrics_data, 'multiqc_picard_pcrmetrics') # Add to general stats table self.general_stats_headers['PCT_AMPLIFIED_BASES'] = { 'title': '% Amplified Bases', 'description': 'The fraction of aligned bases that mapped to or near an amplicon.', 'min': 0, 'max': 100, 'suffix': '%', 'scale': 'BrBG' } self.general_stats_headers['MEDIAN_TARGET_COVERAGE'] = { 'title': 'Median Target Coverage', 'description': 'The median coverage of reads that mapped to target regions of an experiment.', 'min': 0, 'suffix': 'X', 'scale': 'GnBu', } for s_name in self.picard_pcrmetrics_data: if s_name not in self.general_stats_data: self.general_stats_data[s_name] = dict() self.general_stats_data[s_name].update( self.picard_pcrmetrics_data[s_name]) # Bar plot of amplicon bases keys = OrderedDict() keys['ON_AMPLICON_BASES'] = {'name': 'On-amplicon bases'} keys['NEAR_AMPLICON_BASES'] = {'name': 'Near-amplicon bases'} keys['OFF_AMPLICON_BASES'] = { 'name': 'Off-amplicon bases', 'color': '#f28f43' } # Config for the plot pconfig = { 'id': 'picard_pcr_metrics_bases', 'title': 'Picard: PCR Amplicon Bases', 'ylab': '# Bases', 'cpswitch_counts_label': '# Bases', 'hide_zero_cats': False } self.add_section( name='PCR Amplicon Bases', anchor='picard-pcrmetrics-bases', description= 'Metrics about reads obtained from targeted PCR experiments.', helptext=''' This plot shows the number of bases aligned on or near to amplified regions of the genome. * `ON_AMPLICON_BASES`: The number of `PF_BASES_ALIGNED` that mapped to an amplified region of the genome. * `NEAR_AMPLICON_BASES`: The number of `PF_BASES_ALIGNED` that mapped to within a fixed interval of an amplified region, but not on a baited region. * `OFF_AMPLICON_BASES`: The number of `PF_BASES_ALIGNED` that mapped neither on nor near an amplicon.
For more information see the [Picard documentation](https://broadinstitute.github.io/picard/picard-metric-definitions.html#TargetedPcrMetrics).''', plot=bargraph.plot(self.picard_pcrmetrics_data, keys, pconfig)) # Return the number of detected samples to the parent module return len(self.picard_pcrmetrics_data)
def __init__(self): # Initialise the parent object super(MultiqcModule, self).__init__( name="mosdepth", anchor="mosdepth", href="https://github.com/brentp/mosdepth", info= "performs fast BAM/CRAM depth calculation for WGS, exome, or targeted sequencing", ) dist_data, cov_data, xmax, perchrom_avg_data = self.parse_cov_dist() # Filter out any samples from --ignore-samples dist_data = self.ignore_samples(dist_data) cov_data = self.ignore_samples(cov_data) perchrom_avg_data = self.ignore_samples(perchrom_avg_data) # No samples found num_samples = max(len(dist_data), len(cov_data), len(perchrom_avg_data)) if num_samples == 0: raise UserWarning log.info("Found {} reports".format(num_samples)) if dist_data: self.add_section( name="Coverage distribution", anchor="mosdepth-coverage-dist", description= "Distribution of the number of locations in the reference genome with a given depth of coverage", helptext=genome_fraction_helptext, plot=linegraph.plot( dist_data, { "id": "mosdepth-coverage-dist-id", "title": "Mosdepth: Coverage Distribution", "xlab": "Coverage (X)", "ylab": "% bases in genome/regions covered by at least X reads", "ymax": 100, "xmax": xmax, "tt_label": "<b>{point.x}X</b>: {point.y:.2f}%", "smooth_points": 500, }, ), ) if cov_data: self.add_section( name="Coverage plot", anchor="mosdepth-coverage-cov", description= "Number of locations in the reference genome with a given depth of coverage", helptext=coverage_histogram_helptext, plot=linegraph.plot( cov_data, { "id": "mosdepth-coverage-plot-id", "title": "Mosdepth: Coverage Depth", "xlab": "Coverage (X)", "ylab": "% bases in genome/regions covered at X reads", "ymax": 100, "xmax": xmax, "tt_label": "<b>{point.x}X</b>: {point.y:.2f}%", "smooth_points": 500, }, ), ) if perchrom_avg_data: num_contigs = max( [len(x.keys()) for x in perchrom_avg_data.values()]) if num_contigs > 1: perchrom_plot = linegraph.plot( perchrom_avg_data, { "id": "mosdepth-coverage-per-contig", "title": "Mosdepth: Coverage per contig", "xlab": "region", "ylab": "average coverage", "categories": True, "tt_decimals": 1, "tt_suffix": "x", "smooth_points": 500, }, ) else: perchrom_plot = bargraph.plot( perchrom_avg_data, pconfig={ "id": "mosdepth-coverage-per-contig", "title": "Mosdepth: Coverage per contig", "xlab": "Sample", "ylab": "Average coverage", "tt_suffix": "x", }, ) self.add_section( name="Average coverage per contig", anchor="mosdepth-coverage-per-contig-id", description="Average coverage per contig or chromosome", plot=perchrom_plot, ) if dist_data: threshs, hidden_threshs = get_cov_thresholds() self.genstats_cov_thresholds(dist_data, threshs, hidden_threshs) self.genstats_mediancov(dist_data)
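# Minimal sketch (fabricated values; the real shape comes from parse_cov_dist(),
# which is not shown here) of the {sample: {coverage: percent}} mapping assumed
# by the linegraph sections above:
dist_data = {"sample_1": {0: 100.0, 10: 98.2, 30: 71.5, 50: 12.3}}
xmax = max(x for d in dist_data.values() for x in d)  # 50 in this toy example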
def parse_samtools_rmdup(self): """Find Samtools rmdup logs and parse their data""" self.samtools_rmdup = dict() for f in self.find_log_files("samtools/rmdup", filehandles=True): # Example below: # [bam_rmdupse_core] 26602816 / 103563641 = 0.2569 in library ' ' dups_regex = r"\[bam_rmdups?e?_core\] (\d+) / (\d+) = (\d+\.\d+) in library '(.*)'" s_name = f["s_name"] for l in f["f"]: match = re.search(dups_regex, l) if match: library_name = match.group(4).strip() if library_name != "": s_name = library_name if s_name in self.samtools_rmdup: log.debug( "Duplicate sample name found in {}! Overwriting: {}" .format(f["fn"], s_name)) self.add_data_source(f, s_name) self.samtools_rmdup[s_name] = dict() self.samtools_rmdup[s_name]["n_dups"] = int(match.group(1)) self.samtools_rmdup[s_name]["n_tot"] = int(match.group(2)) self.samtools_rmdup[s_name]["n_unique"] = int( match.group(2)) - int(match.group(1)) self.samtools_rmdup[s_name]["pct_dups"] = float( match.group(3)) * 100 # Filter to strip out ignored sample names self.samtools_rmdup = self.ignore_samples(self.samtools_rmdup) if len(self.samtools_rmdup) > 0: # Write parsed report data to a file self.write_data_file(self.samtools_rmdup, "multiqc_samtools_rmdup") # Make a bar plot showing duplicates keys = OrderedDict() keys["n_unique"] = {"name": "Non-duplicated reads"} keys["n_dups"] = {"name": "Duplicated reads"} pconfig = { "id": "samtools_rmdup_plot", "title": "Samtools rmdup: Duplicate alignments", "ylab": "Number of reads", "yDecimals": False, } self.add_section( name="Duplicates removed", anchor="samtools-rmdup", plot=bargraph.plot(self.samtools_rmdup, keys, pconfig), ) # Add a column to the General Stats table stats_headers = OrderedDict() stats_headers["pct_dups"] = { "title": "% Dups", "description": "Percent of duplicate alignments", "min": 0, "max": 100, "suffix": "%", "scale": "OrRd", } self.general_stats_addcols(self.samtools_rmdup, stats_headers) return len(self.samtools_rmdup)
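import re
# Quick sanity check of dups_regex against the example log line quoted in the
# comment above (the groups are: duplicates, total, fraction, library name):
line = "[bam_rmdupse_core] 26602816 / 103563641 = 0.2569 in library ' '"
m = re.search(r"\[bam_rmdups?e?_core\] (\d+) / (\d+) = (\d+\.\d+) in library '(.*)'", line)
assert m is not None
assert int(m.group(1)) == 26602816 and m.group(4).strip() == ""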
def add_cc_section(self, c_id, mod): section_name = mod["config"].get("section_name", c_id.replace("_", " ").title()) if section_name == "" or section_name is None: section_name = "Custom Content" section_description = mod["config"].get("description", "") pconfig = mod["config"].get("pconfig", {}) if pconfig.get("id") is None: pconfig["id"] = f"{c_id}-plot" if pconfig.get("title") is None: pconfig["title"] = section_name plot = None content = None # Save the data if it's not a html string if not isinstance(mod["data"], str): self.write_data_file(mod["data"], "multiqc_{}".format(pconfig["id"])) pconfig["save_data_file"] = False # Table if mod["config"].get("plot_type") == "table": pconfig["sortRows"] = pconfig.get("sortRows", False) headers = mod["config"].get("headers") plot = table.plot(mod["data"], headers, pconfig) # Bar plot elif mod["config"].get("plot_type") == "bargraph": mod["data"] = {k: v for k, v in sorted(mod["data"].items())} plot = bargraph.plot(mod["data"], mod["config"].get("categories"), pconfig) # Line plot elif mod["config"].get("plot_type") == "linegraph": plot = linegraph.plot(mod["data"], pconfig) # Scatter plot elif mod["config"].get("plot_type") == "scatter": plot = scatter.plot(mod["data"], pconfig) # Heatmap elif mod["config"].get("plot_type") == "heatmap": plot = heatmap.plot(mod["data"], mod["config"].get("xcats"), mod["config"].get("ycats"), pconfig) # Beeswarm plot elif mod["config"].get("plot_type") == "beeswarm": plot = beeswarm.plot(mod["data"], pconfig) # Raw HTML elif mod["config"].get("plot_type") == "html": content = mod["data"] # Raw image file as html elif mod["config"].get("plot_type") == "image": content = mod["data"] # Not supplied elif mod["config"].get("plot_type") == None: log.warning("Plot type not found for content ID '{}'".format(c_id)) # Not recognised else: log.warning( "Error - custom content plot type '{}' not recognised for content ID {}" .format(mod["config"].get("plot_type"), c_id)) # Don't use exactly the same title / description text as the main module if section_name == self.name: section_name = None if section_description == self.info: section_description = "" self.add_section(name=section_name, anchor=c_id, description=section_description, plot=plot, content=content)
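# Hypothetical example (ids, names and numbers invented) of the `mod` structure
# this dispatcher consumes; plot_type must be one of the branches handled above:
mod = {
    "config": {"section_name": "My Stats", "plot_type": "bargraph",
               "categories": None, "pconfig": {}},
    "data": {"sample_1": {"passed": 80, "failed": 20}},
}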
def __map_pair_dup_read_chart(self, data_by_sample): chart_data = dict() for sample_id, data in data_by_sample.items(): if (data["Not properly paired reads (discordant)"] + data["Properly paired reads"] + data["Singleton reads (itself mapped; mate unmapped)"] + data["Unmapped reads"] != data["Total reads in RG"]): log.warning( "sum of unpaired/discordant/proppaired/unmapped reads not matching total, " "skipping mapping/paired percentages plot for: {}".format( sample_id)) continue if (data[ "Number of unique & mapped reads (excl. duplicate marked reads)"] + data["Number of duplicate marked reads"] + data["Unmapped reads"] != data["Total reads in RG"]): log.warning( "sum of unique/duplicate/unmapped reads not matching total, " "skipping mapping/duplicates percentages plot for: {}". format(sample_id)) continue chart_data[sample_id] = data self.add_section( name="Mapped / paired / duplicated", anchor="dragen-mapped-paired-duplicated", description= "Distribution of reads based on pairing, duplication and mapping.", plot=bargraph.plot( [chart_data, chart_data], [ { "Number of unique & mapped reads (excl. duplicate marked reads)": { "color": "#437bb1", "name": "Unique", }, "Number of duplicate marked reads": { "color": "#f5a742", "name": "Duplicated" }, "Unmapped reads": { "color": "#b1084c", "name": "Unmapped" }, }, { "Properly paired reads": { "color": "#099109", "name": "Paired, properly" }, "Not properly paired reads (discordant)": { "color": "#c27a0e", "name": "Paired, discordant" }, "Singleton reads (itself mapped; mate unmapped)": { "color": "#912476", "name": "Singleton" }, "Unmapped reads": { "color": "#b1084c", "name": "Unmapped" }, }, ], { "id": "mapping_dup_percentage_plot", "title": "Dragen: Mapped/paired/duplicated reads per read group", "ylab": "Reads", "cpswitch_counts_label": "Reads", "data_labels": [ { "name": "Unique vs duplicated vs unmapped", "ylab": "Reads", "cpswitch_counts_label": "Reads", }, { "name": "Paired vs. discordant vs. singleton", "ylab": "Reads", "cpswitch_counts_label": "Reads", }, ], }, ), )
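# Worked example (invented counts) of the first consistency check above; when the
# four categories do not sum to the total, the sample is skipped with a warning:
d = {"Properly paired reads": 900, "Not properly paired reads (discordant)": 50,
     "Singleton reads (itself mapped; mate unmapped)": 30, "Unmapped reads": 20,
     "Total reads in RG": 1000}
parts = (d["Not properly paired reads (discordant)"] + d["Properly paired reads"]
         + d["Singleton reads (itself mapped; mate unmapped)"] + d["Unmapped reads"])
assert parts == d["Total reads in RG"]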
def parse_reports(self): """ Find Picard RnaSeqMetrics reports and parse their data """ # Set up vars self.picard_RnaSeqMetrics_data = dict() self.picard_RnaSeqMetrics_histogram = dict() # Go through logs and find Metrics for f in self.find_log_files('picard/rnaseqmetrics', filehandles=True): s_name = None in_hist = False for l in f['f']: # Catch the histogram values if s_name is not None and in_hist: try: sections = l.split("\t") pos = int(sections[0]) coverage = float(sections[1]) self.picard_RnaSeqMetrics_histogram[s_name][pos] = coverage except ValueError: # Reset in case we have more in this log file s_name = None in_hist = False # New log starting if 'rnaseqmetrics' in l.lower() and 'INPUT' in l: s_name = None # Pull sample name from input fn_search = re.search(r"INPUT=\[?([^\s]+)\]?", l) if fn_search: s_name = os.path.basename(fn_search.group(1)) s_name = self.clean_s_name(s_name, f['root']) if s_name is not None: if 'rnaseqmetrics' in l.lower() and '## METRICS CLASS' in l: if s_name in self.picard_RnaSeqMetrics_data: log.debug("Duplicate sample name found in {}! Overwriting: {}".format(f['fn'], s_name)) self.picard_RnaSeqMetrics_data[s_name] = dict() self.picard_RnaSeqMetrics_histogram[s_name] = dict() self.add_data_source(f, s_name, section='RnaSeqMetrics') keys = f['f'].readline().strip("\n").split("\t") vals = f['f'].readline().strip("\n").split("\t") for i, k in enumerate(keys): # Multiply percentages by 100 if k.startswith('PCT_'): try: vals[i] = float(vals[i]) * 100.0 except (ValueError, IndexError): pass # Save the key:value pairs try: self.picard_RnaSeqMetrics_data[s_name][k] = float(vals[i]) except ValueError: self.picard_RnaSeqMetrics_data[s_name][k] = vals[i] except IndexError: pass # missing data # Calculate some extra numbers if 'PF_BASES' in keys and 'PF_ALIGNED_BASES' in keys: self.picard_RnaSeqMetrics_data[s_name]['PF_NOT_ALIGNED_BASES'] = self.picard_RnaSeqMetrics_data[s_name]['PF_BASES'] - self.picard_RnaSeqMetrics_data[s_name]['PF_ALIGNED_BASES'] if s_name is not None and 'normalized_position All_Reads.normalized_coverage' in l: self.picard_RnaSeqMetrics_histogram[s_name] = dict() in_hist = True for key in list(self.picard_RnaSeqMetrics_data.keys()): if len(self.picard_RnaSeqMetrics_data[key]) == 0: self.picard_RnaSeqMetrics_data.pop(key, None) for s_name in list(self.picard_RnaSeqMetrics_histogram.keys()): if len(self.picard_RnaSeqMetrics_histogram[s_name]) == 0: self.picard_RnaSeqMetrics_histogram.pop(s_name, None) log.debug("Ignoring '{}' histogram as no data parsed".format(s_name)) # Filter to strip out ignored sample names self.picard_RnaSeqMetrics_data = self.ignore_samples(self.picard_RnaSeqMetrics_data) if len(self.picard_RnaSeqMetrics_data) > 0: # Write parsed data to a file self.write_data_file(self.picard_RnaSeqMetrics_data, 'multiqc_picard_RnaSeqMetrics') # Add to general stats table GenStatsHeaders = OrderedDict() GenStatsHeaders['PCT_RIBOSOMAL_BASES'] = { 'title': '% rRNA', 'description': 'Percent of aligned bases overlapping ribosomal RNA regions', 'max': 100, 'min': 0, 'suffix': '%', 'scale': 'Reds', } GenStatsHeaders['PCT_MRNA_BASES'] = { 'title': '% mRNA', 'description': 'Percent of aligned bases overlapping UTRs and coding regions of mRNA transcripts', 'max': 100, 'min': 0, 'suffix': '%', 'scale': 'Greens', } self.general_stats_addcols(self.picard_RnaSeqMetrics_data, GenStatsHeaders) # Bar plot of bases assignment bg_cats = OrderedDict() bg_cats['CODING_BASES'] = { 'name': 'Coding' } bg_cats['UTR_BASES'] = { 'name': 'UTR' }
bg_cats['INTRONIC_BASES'] = { 'name': 'Intronic' } bg_cats['INTERGENIC_BASES'] = { 'name': 'Intergenic' } bg_cats['RIBOSOMAL_BASES'] = { 'name': 'Ribosomal' } bg_cats['PF_NOT_ALIGNED_BASES'] = { 'name': 'PF not aligned' } self.add_section ( name = 'RnaSeqMetrics Assignment', anchor = 'picard-rna-assignment', description = 'Number of bases in primary alignments that align to regions in the reference genome.', plot = bargraph.plot(self.picard_RnaSeqMetrics_data, bg_cats) ) # Section with histogram plot if len(self.picard_RnaSeqMetrics_histogram) > 0: # Plot the data and add section pconfig = { 'smooth_points': 500, 'smooth_points_sumcounts': [True, False], 'id': 'picard_rna_coverage', 'title': 'Picard: Normalized Coverage', 'ylab': 'Coverage', 'xlab': 'Percent through gene', 'xDecimals': False, 'tt_label': '<b>{point.x}%</b>: {point.y:.0f}', 'ymin': 0, } self.add_section ( name = 'Gene Coverage', anchor = 'picard-rna-coverage', plot = linegraph.plot(self.picard_RnaSeqMetrics_histogram, pconfig) ) # Return the number of detected samples to the parent module return len(self.picard_RnaSeqMetrics_data)
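A small sketch of the two derivations performed during parsing above, using invented numbers: `PCT_*` fields are rescaled from fractions to percentages, and the unaligned base count is computed from the two parsed totals.

row = {"PF_BASES": 1000000.0, "PF_ALIGNED_BASES": 950000.0, "PCT_MRNA_BASES": 0.72}
row["PCT_MRNA_BASES"] = row["PCT_MRNA_BASES"] * 100.0                    # 0.72 -> 72.0 (%)
row["PF_NOT_ALIGNED_BASES"] = row["PF_BASES"] - row["PF_ALIGNED_BASES"]  # 50000.0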
def __init__(self): super(MultiqcModule, self).__init__( name='MappingQC', anchor='mappingqc', href="https://github.com/imgag/ngs-bits", info="calculates QC metrics based on mapped NGS reads.") # quality parameters from qcML with name, accession, description self.qcml = dict() # qc data for each sample self.qcdata = dict() # parse qcml files for f in self.find_log_files('mappingqc', filecontents=True, filehandles=False): self.add_data_source(f) s_name = self.clean_s_name(f['s_name'], f['root']) self.qcdata[s_name] = self.parse_qcml(f['f']) # ignore samples if requested self.qcdata = self.ignore_samples(self.qcdata) # warn if no samples found if len(self.qcdata) == 0: raise UserWarning # add bases usable key, derived from bases usable (MB) self.qcml.pop('bases usable (MB)') self.qcml['bases usable'] = dict() self.qcml['bases usable']['description'] = 'Bases sequenced in total.' for s, kv in self.qcdata.items(): kv['bases usable'] = kv['bases usable (MB)'] * 1e6 kv.pop('bases usable (MB)') # prepare table headers, use name and description from qcML headers = { qp_key: { 'namespace': "MappingQC", 'title': qp_key, 'description': qp_entry['description'], } for qp_key, qp_entry in self.qcml.items() } headers['trimmed base %'].update({ 'suffix': '%', 'format': '{:,.2f}', 'floor': 1, 'scale': 'PuBu' }) headers['clipped base %'].update({ 'suffix': '%', 'format': '{:,.2f}', 'floor': 1, 'scale': 'PuRd' }) headers['mapped read %'].update({ 'suffix': '%', 'format': '{:,.2f}', 'max': 100, 'scale': 'Reds' }) headers['bases usable'].update({ 'suffix': config.base_count_prefix, 'format': '{:,.2f}', 'modify': lambda x: x * config.base_count_multiplier, 'scale': 'Greens' }) # always available, even without target file headers['on-target read %'].update({ 'suffix': '%', 'format': '{:,.2f}', 'max': 100, 'scale': 'Purples' }) # only available if duplicates marked try: headers['duplicate read %'].update({ 'suffix': '%', 'format': '{:,.2f}', 'max': 100, 'scale': 'YlOrRd' }) except KeyError: pass # only available if paired-end try: headers['properly-paired read %'].update({ 'suffix': '%', 'format': '{:,.2f}', 'max': 100, 'scale': 'GnBu' }) headers['insert size'].update({ 'suffix': 'bp', 'format': '{:,.2f}', 'scale': 'RdYlGn' }) except KeyError: pass # only available if human try: headers['SNV allele frequency deviation'].update({ 'suffix': '', 'format': '{:,.2f}', 'floor': 0, 'ceiling': 10, 'minRange': 10, 'scale': 'Greys' }) except KeyError: pass # only available if target file provided coverage_values = (10, 20, 30, 50, 100, 200, 500) try: headers['target region read depth'].update({ 'suffix': 'x', 'format': '{:,.2f}' }) for x in coverage_values: headers['target region {:d}x %'.format(x)]. 
update({'suffix': '%', 'format': '{:,.2f}', 'max': 100, 'scale': 'YlGn'}) except KeyError: pass # general table: add read count and bases usable self.general_stats_addcols( self.qcdata, self.dict_ordered_subset( headers, ('bases usable', 'mapped read %', 'on-target read %', 'target region read depth'))) # write full data set to file self.write_data_file(self.qcdata, 'multiqc_mappingqc') # table with general values self.add_section( name='Overview', anchor='mappingqc-general', description='', plot=table.plot( self.qcdata, self.dict_ordered_subset( headers, ('bases usable', 'on-target read %', 'mapped read %', 'properly-paired read %', 'trimmed base %', 'clipped base %', 'duplicate read %', 'insert size', 'SNV allele frequency deviation')), pconfig={'namespace': 'MappingQC'})) if 'target region 10x %' in headers.keys(): # table with coverage values self.add_section( name='Coverage', anchor='mappingqc-coverage', description='', plot=table.plot( self.qcdata, self.dict_ordered_subset( headers, ('target region read depth', 'target region 10x %', 'target region 20x %', 'target region 30x %', 'target region 50x %', 'target region 100x %', 'target region 200x %', 'target region 500x %')), pconfig={'namespace': 'MappingQC'})) # bar plot with sequencing depth values self.add_section( name='Sequencing Depth', anchor='mappingqc-read-depth', description=self.make_description(['target region read depth']), plot=bargraph.plot( self.qcdata, self.dict_ordered_subset(headers, ('target region read depth',)), pconfig={ 'namespace': 'MappingQC', 'id': 'mappingqc-read-depth-plot', 'title': 'MappingQC: Target Region Sequencing Depth', 'ylab': 'coverage', 'cpswitch': False, 'tt_decimals': 2, 'tt_suffix': 'x', 'tt_percentages': False })) # bar plot with coverage values self.add_section( name='Target Coverage', anchor='mappingqc-target-coverage', description='', plot=bargraph.plot([self.qcdata] * len(coverage_values), [{ s: headers[s] } for s in [ 'target region {:d}x %'.format(x) for x in coverage_values ]], pconfig={ 'namespace': 'MappingQC', 'id': 'mappingqc-target-coverage-plot', 'title': 'MappingQC: Target Coverage Percentage', 'ylab': 'target coverage percentage', 'cpswitch': False, 'data_labels': [ '{:d}x coverage %'.format(x) for x in coverage_values ], 'ymin': 0, 'ymax': 100, 'use_legend': False, 'tt_decimals': 2, 'tt_suffix': '%', 'tt_percentages': False }))
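A sketch of the 'bases usable' conversion performed above, with a hypothetical qcML value: the metric arrives in megabases, so it is rescaled to bases and re-keyed before the table headers are built.

kv = {"bases usable (MB)": 1234.56}                      # invented value, in megabases
kv["bases usable"] = kv.pop("bases usable (MB)") * 1e6   # -> 1234560000.0 bases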
def parse_reports(self): """ Find RSeQC junction_annotation reports and parse their data """ # Set up vars self.junction_annotation_data = dict() regexes = { "total_splicing_events": r"^Total splicing Events:\s*(\d+)$", "known_splicing_events": r"^Known Splicing Events:\s*(\d+)$", "partial_novel_splicing_events": r"^Partial Novel Splicing Events:\s*(\d+)$", "novel_splicing_events": r"^Novel Splicing Events:\s*(\d+)$", "total_splicing_junctions": r"^Total splicing Junctions:\s*(\d+)$", "known_splicing_junctions": r"^Known Splicing Junctions:\s*(\d+)$", "partial_novel_splicing_junctions": r"^Partial Novel Splicing Junctions:\s*(\d+)$", "novel_splicing_junctions": r"^Novel Splicing Junctions:\s*(\d+)$", } # Go through files and parse data using regexes for f in self.find_log_files("rseqc/junction_annotation"): d = dict() for k, r in regexes.items(): r_search = re.search(r, f["f"], re.MULTILINE) if r_search: d[k] = int(r_search.group(1)) # Calculate some percentages if "total_splicing_events" in d: t = float(d["total_splicing_events"]) if "known_splicing_events" in d: d["known_splicing_events_pct"] = ( float(d["known_splicing_events"]) / t) * 100.0 if "partial_novel_splicing_events" in d: d["partial_novel_splicing_events_pct"] = ( float(d["partial_novel_splicing_events"]) / t) * 100.0 if "novel_splicing_events" in d: d["novel_splicing_events_pct"] = ( float(d["novel_splicing_events"]) / t) * 100.0 if "total_splicing_junctions" in d: t = float(d["total_splicing_junctions"]) if "known_splicing_junctions" in d: d["known_splicing_junctions_pct"] = ( float(d["known_splicing_junctions"]) / t) * 100.0 if "partial_novel_splicing_junctions" in d: d["partial_novel_splicing_junctions_pct"] = ( float(d["partial_novel_splicing_junctions"]) / t) * 100.0 if "novel_splicing_junctions" in d: d["novel_splicing_junctions_pct"] = ( float(d["novel_splicing_junctions"]) / t) * 100.0 if len(d) > 0: if f["s_name"] in self.junction_annotation_data: log.debug( "Duplicate sample name found! Overwriting: {}".format( f["s_name"])) self.add_data_source(f, section="junction_annotation") self.junction_annotation_data[f["s_name"]] = d # Filter to strip out ignored sample names self.junction_annotation_data = self.ignore_samples( self.junction_annotation_data) if len(self.junction_annotation_data) > 0: # Write to file self.write_data_file(self.junction_annotation_data, "multiqc_rseqc_junction_annotation") # Plot junction annotations keys = [OrderedDict(), OrderedDict()] keys[0]["known_splicing_junctions"] = { "name": "Known Splicing Junctions" } keys[0]["partial_novel_splicing_junctions"] = { "name": "Partial Novel Splicing Junctions" } keys[0]["novel_splicing_junctions"] = { "name": "Novel Splicing Junctions" } keys[1]["known_splicing_events"] = {"name": "Known Splicing Events"} keys[1]["partial_novel_splicing_events"] = { "name": "Partial Novel Splicing Events" } keys[1]["novel_splicing_events"] = {"name": "Novel Splicing Events"} pconfig = { "id": "rseqc_junction_annotation_junctions_plot", "title": "RSeQC: Splicing Junctions", "ylab": "% Junctions", "cpswitch_c_active": False, "data_labels": ["Junctions", "Events"], } self.add_section( name="Junction Annotation", anchor="rseqc_junction_annotation", description= '<a href="http://rseqc.sourceforge.net/#junction-annotation-py" target="_blank">Junction annotation</a>' " compares detected splice junctions to" " a reference gene model. 
An RNA read can be spliced 2" " or more times; each splice is counted as a splicing event.", plot=bargraph.plot( [self.junction_annotation_data, self.junction_annotation_data], keys, pconfig), ) # Return number of samples found return len(self.junction_annotation_data)
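A minimal sketch of the percentage derivation in the parser above, with invented counts: each junction (and, analogously, event) class is expressed as a percentage of its own total.

d = {"total_splicing_junctions": 2000, "known_splicing_junctions": 1500,
     "partial_novel_splicing_junctions": 300, "novel_splicing_junctions": 200}
t = float(d["total_splicing_junctions"])
d["known_splicing_junctions_pct"] = d["known_splicing_junctions"] / t * 100.0  # 75.0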
def __init__(self): # Initialise the parent object super(MultiqcModule, self).__init__( name='Long Ranger', anchor='longranger', href="https://www.10xgenomics.com/", info= "A set of analysis pipelines that perform sample demultiplexing, " "barcode processing, alignment, quality control, variant calling, phasing, " "and structural variant calling.") self.headers = OrderedDict() self.headers['large_sv_calls'] = { 'title': 'Large SVs', 'description': 'Large structural variants called by Long Ranger. Not including blacklisted regions.', 'format': '{:,.0f}', 'scale': 'PuRd' } self.headers['short_deletion_calls'] = { 'title': 'Short dels', 'description': 'Short deletions called by Long Ranger.', 'format': '{:,.0f}', 'scale': 'PuRd', 'hidden': True } self.headers['genes_phased_lt_100kb'] = { 'title': 'Genes phased < 100kb', 'description': 'Percentage of genes shorter than 100kb with >1 heterozygous SNP that are phased into a single phase block.', 'modify': lambda x: float(x) * 100.0, 'suffix': '%', 'scale': 'YlOrRd', 'hidden': True } self.headers['longest_phase_block'] = { 'title': 'Longest phased', 'description': 'Size of the longest phase block, in base pairs.', 'scale': 'YlOrRd', 'modify': lambda x: float(x) / 1000000.0, 'suffix': 'Mbp', 'hidden': True } self.headers['n50_phase_block'] = { 'title': 'N50 phased', 'description': 'N50 length of the called phase blocks, in base pairs.', 'modify': lambda x: float(x) / 1000000.0, 'suffix': 'Mbp', 'scale': 'YlOrRd', 'hidden': True } self.headers['snps_phased'] = { 'title': 'SNPs phased', 'description': 'Percentage of called SNPs that were phased.', 'modify': lambda x: float(x) * 100.0, 'suffix': '%', 'scale': 'PuRd', 'hidden': True } self.headers['median_insert_size'] = { 'title': 'Insert size', 'description': 'Median insert size of aligned read pairs.', 'format': '{:,.0f}', 'suffix': 'bp', 'scale': 'PuBu', 'hidden': True } self.headers['on_target_bases'] = { 'title': 'On target', 'description': 'Percentage of aligned bases mapped within the target regions in targeted mode. Only bases inside the intervals of the target BED file are counted.', 'suffix': '%', 'modify': lambda x: 0 if x == "" else float(x) * 100.0, 'scale': 'Greens' } self.headers['zero_coverage'] = { 'title': 'Zero cov', 'description': 'Percentage of non-N bases in the genome with zero coverage.', 'modify': lambda x: float(x) * 100.0, 'suffix': '%', 'max': 100.0, 'min': 0.0, 'scale': 'RdGy-rev' } self.headers['mean_depth'] = { 'title': 'Depth', 'description': 'Mean read depth, including PCR duplicate reads. In WGS mode, this is measured across the genome; in targeted mode, this is measured inside the targeted regions.', 'suffix': 'X', 'scale': 'PuBu' } self.headers['pcr_duplication'] = { 'title': 'PCR Dup', 'description': 'Percentage of reads marked as PCR duplicates. To be marked as PCR duplicates, reads must have the same mapping extents on the genome and the same 10x barcode.', 'suffix': '%', 'min': 15.0, 'modify': lambda x: float(x) * 100.0, 'scale': 'RdGy-rev', 'hidden': True } self.headers['mapped_reads'] = { 'title': 'Mapped', 'modify': lambda x: float(x) * 100.0, 'suffix': '%', 'description': 'Percentage of input reads that were mapped to the reference genome.', 'scale': 'PuBu', 'hidden': True } self.headers['number_reads'] = { 'title': 'M Reads', 'modify': lambda x: float(x) / 1000000.0, 'description': 'Total number of reads supplied to Long Ranger. 
(millions)', 'scale': 'PuBu', 'hidden': True } self.headers['molecule_length_mean'] = { 'title': 'Mol size', 'description': 'The length-weighted mean input DNA length in base pairs.', 'modify': lambda x: float(x) / 1000.0, 'suffix': 'Kbp', 'scale': 'YlGn' } self.headers['molecule_length_stddev'] = { 'title': 'Mol stddev', 'description': 'The length-weighted standard deviation of the input DNA length distribution in base pairs.', 'modify': lambda x: float(x) / 1000.0, 'suffix': 'Kbp', 'scale': 'YlGn', 'hidden': True } self.headers['n50_linked_reads_per_molecule'] = { 'title': 'N50 read per mol.', 'description': 'The N50 number of read-pairs per input DNA molecule. Half of read-pairs come from molecules with this many or greater read-pairs.', 'scale': 'BuGn', 'hidden': True } self.headers['r1_q30_bases_fract'] = { 'title': '% R1 >= Q30', 'description': 'Percentage of bases in R1 with base quality >= 30.', 'hidden': True, 'suffix': '%', 'modify': lambda x: float(x) * 100.0, 'scale': 'Purples' } self.headers['r2_q30_bases_fract'] = { 'title': '% R2 >= Q30', 'description': 'Percentage of bases in R2 with base quality >= 30.', 'suffix': '%', 'modify': lambda x: float(x) * 100.0, 'scale': 'Purples', 'hidden': True } self.headers['bc_on_whitelist'] = { 'title': 'Valid BCs', 'description': 'Percentage of reads that carried a valid 10x barcode sequence.', 'modify': lambda x: float(x) * 100.0, 'suffix': '%', 'scale': 'BuPu', 'hidden': True, } self.headers['bc_q30_bases_fract'] = { 'title': 'BC Q30', 'description': 'Percentage of bases in the barcode with base quality >= 30.', 'suffix': '%', 'modify': lambda x: float(x) * 100.0, 'scale': 'Purples', 'hidden': True } self.headers['bc_mean_qscore'] = { 'title': 'BC Qscore', 'description': 'The mean base quality value of the barcode bases.', 'scale': 'BuPu', 'hidden': True } self.headers['mean_dna_per_gem'] = { 'title': 'DNA per gem', 'description': 'The average number of base pairs of genomic DNA loaded into each GEM. This metric is based on the observed extents of read-pairs on each molecule.', 'modify': lambda x: float(x) / 1000000.0, 'suffix': 'Mbp', 'scale': 'OrRd', 'hidden': True } self.headers['gems_detected'] = { 'title': 'M Gems', 'description': 'The number of Chromium GEMs that were collected and which generated a non-trivial number of read-pairs. (millions)', 'modify': lambda x: float(x) / 1000000.0, 'scale': 'OrRd', } self.headers['corrected_loaded_mass_ng'] = { 'title': 'Loaded (corrected)', 'description': 'The estimated number of nanograms of DNA loaded into the input well of the Chromium chip. 
This metric is calculated by measuring the mean amount of DNA covered by input molecules in each GEM, then multiplying by the ratio of the chip input to the sample volume in each GEM.', 'suffix': 'ng', 'scale': 'RdYlGn' } self.headers['loaded_mass_ng'] = { 'title': 'Loaded', 'description': 'This metric was found to overestimate the true loading by a factor of 1.6, due primarily to denaturation of the input DNA.', 'suffix': 'ng', 'scale': 'RdYlGn' } self.headers['instrument_ids'] = { 'title': 'Instrument ID', 'description': 'The list of instrument IDs used to generate the input reads.', 'scale': False, 'hidden': True } self.headers['longranger_version'] = { 'title': 'Long Ranger Version', 'description': 'The version of the Long Ranger software used to generate the results.', 'scale': False } ### Parse the data self.longranger_data = dict() self.paths_dict = dict() for f in self.find_log_files('longranger/invocation'): sid = self.parse_invocation(f['f']) self.paths_dict[os.path.basename(f['root'])] = sid running_name = 1 for f in self.find_log_files('longranger/summary'): data = self.parse_summary(f['f']) updir, _ = os.path.split(f['root']) base_updir = os.path.basename(updir) sid = 'longranger#{}'.format(running_name) if base_updir in self.paths_dict.keys(): sid = self.paths_dict[base_updir] else: log.debug('Did not find _invocation file: {}'.format(f['fn'])) running_name += 1 self.longranger_data[sid] = data # Filter to strip out ignored sample names self.longranger_data = self.ignore_samples(self.longranger_data) if len(self.longranger_data) == 0: raise UserWarning log.info("Found {} reports".format(len(self.longranger_data.keys()))) # Write parsed report data to a file self.write_data_file(self.longranger_data, 'multiqc_longranger') # Add a Long Ranger version column if not all the same longranger_versions = set( [d['longranger_version'] for d in self.longranger_data.values()]) version_str = '' if len(longranger_versions) == 1: version_str = " All samples were processed using Long Ranger version {}.".format( list(longranger_versions)[0]) del self.headers['longranger_version'] ### Write the table config_table = {'id': 'longranger_table', 'namespace': 'longranger'} self.add_section ( name = 'Run stats', anchor = 'longranger-run-stats', description = 'Statistics gathered from Long Ranger reports. ' 'There are more columns available but they are hidden by default.' + version_str, helptext = '''Parses the files `summary.csv` and `_invocation` found in the output directory of Long Ranger. If `_invocation` is not found, the sample IDs will be missing and they will be given a running number. 
E.g., `longranger#1` and `longranger#2`.''', plot = table.plot(self.longranger_data, self.headers, config_table) ) ### Bar plot of phasing stats snps_phased_pct = {} genes_phased_pct = {} for s_name in self.longranger_data: snps_phased_pct[s_name] = { 'snps_phased_pct': float(self.longranger_data[s_name]['snps_phased']) * 100.0 } genes_phased_pct[s_name] = { 'genes_phased_pct': float(self.longranger_data[s_name]['genes_phased_lt_100kb']) * 100.0 } phase_plot_cats = [OrderedDict(), OrderedDict(), OrderedDict()] phase_plot_cats[0]['longest_phase_block'] = { 'name': 'Longest Phase Block' } phase_plot_cats[0]['n50_phase_block'] = {'name': 'N50 of Phase Blocks'} phase_plot_cats[1]['snps_phased_pct'] = {'name': '% SNPs Phased'} phase_plot_cats[2]['genes_phased_pct'] = { 'name': '% Genes < 100kbp in a single phase block' } self.add_section( name='Phasing', anchor='longranger-phasing', description= 'Phasing performance from Long Ranger. Genes are only considered if ≤ 100kbp in length and containing more than one heterozygous SNP.', helptext=''' * Longest phased * Size of the longest phase block, in base pairs * N50 phased * N50 length of the called phase blocks, in base pairs. * % SNPs phased * Percentage of called SNPs that were phased. * % Genes Phased * Percentage of genes shorter than 100kb with >1 heterozygous SNP that are phased into a single phase block. ''', plot=bargraph.plot( [self.longranger_data, snps_phased_pct, genes_phased_pct], phase_plot_cats, { 'id': 'longranger-phasing-plot', 'title': 'Long Ranger: Phasing Statistics', 'data_labels': [{ 'name': 'N50 Phased', 'ylab': 'N50 of called phase blocks (bp)' }, { 'name': '% SNPs Phased', 'ylab': '% SNPs Phased', 'ymax': 100 }, { 'name': '% Genes Phased', 'ylab': '% Genes Phased', 'ymax': 100 }], 'cpswitch': False, 'stacking': None, 'ylab': 'N50 of called phase blocks (bp)' })) ### Bar plot of mapping statistics mapping_counts_data = {} for s_name in self.longranger_data: mapped_reads = float( self.longranger_data[s_name]['number_reads']) * float( self.longranger_data[s_name]['mapped_reads']) unmapped_reads = float( self.longranger_data[s_name]['number_reads']) - mapped_reads dup_reads = mapped_reads * float( self.longranger_data[s_name]['pcr_duplication']) unique_reads = mapped_reads - dup_reads mapping_counts_data[s_name] = { 'unique_reads': unique_reads, 'dup_reads': dup_reads, 'unmapped_reads': unmapped_reads } mapping_counts_cats = OrderedDict() mapping_counts_cats['unique_reads'] = { 'name': 'Uniquely Aligned Reads', 'color': '#437bb1' } mapping_counts_cats['dup_reads'] = { 'name': 'PCR Duplicate Aligned Reads', 'color': '#7cb5ec' } mapping_counts_cats['unmapped_reads'] = { 'name': 'Unaligned Reads', 'color': '#7f0000' } self.add_section( name='Alignment', anchor='longranger-alignment', description= 'Long Ranger alignment against the reference genome. To be marked as PCR duplicates, reads must have the same mapping extents on the genome and the same 10x barcode.', plot=bargraph.plot( mapping_counts_data, mapping_counts_cats, { 'id': 'longranger-alignment-plot', 'title': 'Long Ranger: Alignment Statistics', 'ylab': 'Read Counts', 'cpswitch_counts_label': 'Read Counts', }))
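A sketch of the alignment-count derivation above: Long Ranger reports 'mapped_reads' and 'pcr_duplication' as fractions, so absolute counts are reconstructed from 'number_reads'. All values here are invented.

number_reads, mapped_frac, dup_frac = 1000000, 0.95, 0.10
mapped_reads = number_reads * mapped_frac     # 950000.0
unmapped_reads = number_reads - mapped_reads  # 50000.0
dup_reads = mapped_reads * dup_frac           # 95000.0
unique_reads = mapped_reads - dup_reads       # 855000.0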
def top_five_barplot(self): """ Add a bar plot showing the top-5 from each taxa rank """ pd = [] cats = list() pconfig = { "id": "kraken-topfive-plot", "title": "Kraken 2: Top taxa", "ylab": "Number of fragments", "data_labels": list(self.t_ranks.values()), } for rank_code, rank_name in self.t_ranks.items(): rank_cats = OrderedDict() rank_data = dict() # Loop through the summed tax percentages to get the top 5 across all samples try: sorted_pct = sorted(self.kraken_total_pct[rank_code].items(), key=lambda x: x[1], reverse=True) except KeyError: # Taxa rank not found in the data continue i = 0 counts_shown = {} for classif, pct_sum in sorted_pct: i += 1 if i > 5: break rank_cats[classif] = {"name": classif} # Pull out counts for this rank + classif from each sample for s_name, d in self.kraken_raw_data.items(): if s_name not in rank_data: rank_data[s_name] = dict() if s_name not in counts_shown: counts_shown[s_name] = 0 for row in d: if row["rank_code"] == rank_code: if row["classif"] == classif: if classif not in rank_data[s_name]: rank_data[s_name][classif] = 0 rank_data[s_name][classif] += row["counts_rooted"] counts_shown[s_name] += row["counts_rooted"] # Add in unclassified reads and "other" - we presume from other species etc. for s_name, d in self.kraken_raw_data.items(): for row in d: if row["rank_code"] == "U": rank_data[s_name]["U"] = row["counts_rooted"] counts_shown[s_name] += row["counts_rooted"] rank_data[s_name]["other"] = self.kraken_sample_total_readcounts[s_name] - counts_shown[s_name] # This should never happen... But it does sometimes if the total read count is a bit off if rank_data[s_name]["other"] < 0: log.debug("Found negative 'other' count for {} ({}): {}".format(s_name, self.t_ranks[rank_code], rank_data[s_name]["other"])) rank_data[s_name]["other"] = 0 rank_cats["other"] = {"name": "Other", "color": "#cccccc"} rank_cats["U"] = {"name": "Unclassified", "color": "#d4949c"} cats.append(rank_cats) pd.append(rank_data) self.add_section( name="Top taxa", anchor="kraken-topfive", description="The number of reads falling into the top 5 taxa across different ranks.", helptext=""" To make this plot, the percentage of each sample assigned to a given taxon is summed across all samples. The counts for these top five taxa are then plotted for each of the 9 different taxa ranks. The unclassified count is always shown across all taxa ranks. The total number of reads is approximated by dividing the number of `unclassified` reads by the percentage of the library that they account for. Note that this is only an approximation, and that kraken percentages don't always add to exactly 100%. The category _"Other"_ shows the difference between the above total read count and the sum of the read counts in the top 5 taxa shown + unclassified. This should cover all taxa _not_ in the top 5, +/- any rounding errors. Note that any taxon that does not exactly fit a taxon rank (e.g. `-` or `G2`) is ignored. """, plot=bargraph.plot(pd, cats, pconfig), )
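A minimal sketch of the per-sample bookkeeping above, with invented counts: whatever is not in the top-5 taxa or unclassified is lumped into "other", floored at zero in case the approximated total undercounts.

total, top5_shown, unclassified = 1000000, 880000, 90000  # hypothetical read counts
other = total - (top5_shown + unclassified)               # 30000
other = max(other, 0)                                     # guard against negative counts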
def parse_reports(self): """Find RSeQC read_distribution reports and parse their data""" # Set up vars self.read_dist = dict() first_regexes = { "total_reads": r"Total Reads\s+(\d+)\s*", "total_tags": r"Total Tags\s+(\d+)\s*", "total_assigned_tags": r"Total Assigned Tags\s+(\d+)\s*", } second_regexes = { "cds_exons": r"CDS_Exons\s+(\d+)\s+(\d+)\s+([\d\.]+)\s*", "5_utr_exons": r"5'UTR_Exons\s+(\d+)\s+(\d+)\s+([\d\.]+)\s*", "3_utr_exons": r"3'UTR_Exons\s+(\d+)\s+(\d+)\s+([\d\.]+)\s*", "introns": r"Introns\s+(\d+)\s+(\d+)\s+([\d\.]+)\s*", "tss_up_1kb": r"TSS_up_1kb\s+(\d+)\s+(\d+)\s+([\d\.]+)\s*", "tss_up_5kb": r"TSS_up_5kb\s+(\d+)\s+(\d+)\s+([\d\.]+)\s*", "tss_up_10kb": r"TSS_up_10kb\s+(\d+)\s+(\d+)\s+([\d\.]+)\s*", "tes_down_1kb": r"TES_down_1kb\s+(\d+)\s+(\d+)\s+([\d\.]+)\s*", "tes_down_5kb": r"TES_down_5kb\s+(\d+)\s+(\d+)\s+([\d\.]+)\s*", "tes_down_10kb": r"TES_down_10kb\s+(\d+)\s+(\d+)\s+([\d\.]+)\s*", } # Go through files and parse data using regexes for f in self.find_log_files("rseqc/read_distribution"): d = dict() for k, r in first_regexes.items(): r_search = re.search(r, f["f"], re.MULTILINE) if r_search: d[k] = int(r_search.group(1)) for k, r in second_regexes.items(): r_search = re.search(r, f["f"], re.MULTILINE) if r_search: d["{}_total_bases".format(k)] = int(r_search.group(1)) d["{}_tag_count".format(k)] = int(r_search.group(2)) d["{}_tags_kb".format(k)] = float(r_search.group(3)) if "total_tags" in d and "total_assigned_tags" in d: d["other_intergenic_tag_count"] = d["total_tags"] - d["total_assigned_tags"] # Calculate some percentages for parsed file if "total_tags" in d: t = float(d["total_tags"]) pcts = dict() for k in d: if k.endswith("_tag_count"): pk = "{}_tag_pct".format(k[:-10]) pcts[pk] = (float(d[k]) / t) * 100.0 d.update(pcts) if len(d) > 0: if f["s_name"] in self.read_dist: log.debug("Duplicate sample name found! Overwriting: {}".format(f["s_name"])) self.add_data_source(f, section="read_distribution") self.read_dist[f["s_name"]] = d # Filter to strip out ignored sample names self.read_dist = self.ignore_samples(self.read_dist) if len(self.read_dist) > 0: # Write to file self.write_data_file(self.read_dist, "multiqc_rseqc_read_distribution") # Plot bar graph of groups keys = OrderedDict() keys["cds_exons_tag_count"] = {"name": "CDS_Exons"} keys["5_utr_exons_tag_count"] = {"name": "5'UTR_Exons"} keys["3_utr_exons_tag_count"] = {"name": "3'UTR_Exons"} keys["introns_tag_count"] = {"name": "Introns"} keys["tss_up_1kb_tag_count"] = {"name": "TSS_up_1kb"} keys["tss_up_5kb_tag_count"] = {"name": "TSS_up_5kb"} keys["tss_up_10kb_tag_count"] = {"name": "TSS_up_10kb"} keys["tes_down_1kb_tag_count"] = {"name": "TES_down_1kb"} keys["tes_down_5kb_tag_count"] = {"name": "TES_down_5kb"} keys["tes_down_10kb_tag_count"] = {"name": "TES_down_10kb"} keys["other_intergenic_tag_count"] = {"name": "Other_intergenic"} # Config for the plot pconfig = { "id": "rseqc_read_distribution_plot", "title": "RSeQC: Read Distribution", "ylab": "# Tags", "cpswitch_counts_label": "Number of Tags", "cpswitch_c_active": False, } self.add_section( name="Read Distribution", anchor="rseqc-read_distribution", description= '<a href="http://rseqc.sourceforge.net/#read-distribution-py" target="_blank">Read Distribution</a>' " calculates how mapped reads are distributed over genome features.", plot=bargraph.plot(self.read_dist, keys, pconfig), ) # Return number of samples found return len(self.read_dist)
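A sketch of the tag-percentage derivation above, with invented counts: every *_tag_count key gains a matching *_tag_pct key expressed relative to the total tag count.

d = {"total_tags": 50000, "cds_exons_tag_count": 30000}
pcts = {k[:-10] + "_tag_pct": d[k] / float(d["total_tags"]) * 100.0
        for k in d if k.endswith("_tag_count")}
d.update(pcts)  # d["cds_exons_tag_pct"] == 60.0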
def parse_reports(self): """ Find Qualimap RNASeq reports and parse their data """ self.qualimap_rnaseq_genome_results = dict() regexes = { 'reads_aligned': r"read(?:s| pairs) aligned\s*=\s*([\d,]+)", 'total_alignments': r"total alignments\s*=\s*([\d,]+)", 'non_unique_alignments': r"non-unique alignments\s*=\s*([\d,]+)", 'reads_aligned_genes': r"aligned to genes\s*=\s*([\d,]+)", 'ambiguous_alignments': r"ambiguous alignments\s*=\s*([\d,]+)", 'not_aligned': r"not aligned\s*=\s*([\d,]+)", '5_3_bias': r"5'-3' bias\s*=\s*(\d+\.\d+)", 'reads_aligned_exonic': r"exonic\s*=\s*([\d,]+)", 'reads_aligned_intronic': r"intronic\s*=\s*([\d,]+)", 'reads_aligned_intergenic': r"intergenic\s*=\s*([\d,]+)", 'reads_aligned_overlapping_exon': r"overlapping exon\s*=\s*([\d,]+)", } for f in self.find_log_files('qualimap/rnaseq/rnaseq_results'): d = dict() # Get the sample name s_name_regex = re.search(r"bam file\s*=\s*(.+)", f['f'], re.MULTILINE) if s_name_regex: d['bam_file'] = s_name_regex.group(1) s_name = self.clean_s_name(d['bam_file'], f['root']) else: log.warning("Couldn't find an input filename in genome_results file {}/{}".format(f['root'], f['fn'])) continue # Check for and 'fix' European style decimal places / thousand separators comma_regex = re.search(r"exonic\s*=\s*[\d\.]+ \(\d{1,3},\d+%\)", f['f'], re.MULTILINE) if comma_regex: log.debug("Trying to fix European comma style syntax in Qualimap report {}/{}".format(f['root'], f['fn'])) f['f'] = f['f'].replace('.', '') f['f'] = f['f'].replace(',', '.') # Go through all numeric regexes for k, r in regexes.items(): r_search = re.search(r, f['f'], re.MULTILINE) if r_search: try: d[k] = float(r_search.group(1).replace(',', '')) except UnicodeEncodeError: # Qualimap reports infinity (\u221e) when 3' bias denominator is zero pass except ValueError: d[k] = r_search.group(1) # Add to general stats table for k in ['5_3_bias', 'reads_aligned']: try: self.general_stats_data[s_name][k] = d[k] except KeyError: pass # Save results if s_name in self.qualimap_rnaseq_genome_results: log.debug("Duplicate genome results sample name found! Overwriting: {}".format(s_name)) self.qualimap_rnaseq_genome_results[s_name] = d self.add_data_source(f, s_name=s_name, section='rna_genome_results') #### Coverage profile self.qualimap_rnaseq_cov_hist = dict() for f in self.find_log_files('qualimap/rnaseq/coverage', filehandles=True): s_name = self.get_s_name(f) d = dict() for l in f['f']: if l.startswith('#'): continue coverage, count = l.split(None, 1) coverage = int(round(float(coverage))) count = float(count) d[coverage] = count if len(d) == 0: log.debug("Couldn't parse contents of coverage histogram file {}".format(f['fn'])) continue # Save results if s_name in self.qualimap_rnaseq_cov_hist: log.debug("Duplicate coverage histogram sample name found! 
Overwriting: {}" .format(s_name)) self.qualimap_rnaseq_cov_hist[s_name] = d self.add_data_source(f, s_name=s_name, section='rna_coverage_histogram') # Filter to strip out ignored sample names self.qualimap_rnaseq_genome_results = self.ignore_samples( self.qualimap_rnaseq_genome_results) self.qualimap_rnaseq_cov_hist = self.ignore_samples( self.qualimap_rnaseq_cov_hist) #### Plots # Genomic Origin Bar Graph # NB: Ignore 'Overlapping Exon' in report - these make the numbers add up to > 100% if len(self.qualimap_rnaseq_genome_results) > 0: gorigin_cats = OrderedDict() gorigin_cats['reads_aligned_exonic'] = {'name': 'Exonic'} gorigin_cats['reads_aligned_intronic'] = {'name': 'Intronic'} gorigin_cats['reads_aligned_intergenic'] = {'name': 'Intergenic'} gorigin_pconfig = { 'id': 'qualimap_genomic_origin', 'title': 'Qualimap RNAseq: Genomic Origin', 'ylab': 'Number of reads', 'cpswitch_c_active': False } genomic_origin_helptext = ''' There are currently three main approaches to map reads to transcripts in an RNA-seq experiment: mapping reads to a reference genome to identify expressed transcripts that are annotated (and discover those that are unknown), mapping reads to a reference transcriptome, and <i>de novo</i> assembly of transcript sequences (<a href="https://doi.org/10.1186/s13059-016-0881-8" target="_blank">Conesa et al. 2016</a>). For RNA-seq QC analysis, QualiMap can be used to assess alignments produced by the first of these approaches. For input, it requires a GTF annotation file along with a reference genome, which can be used to reconstruct the exon structure of known transcripts. This allows mapped reads to be grouped by whether they originate in an exonic region (for QualiMap, this may include 5′ and 3′ UTR regions as well as protein-coding exons), an intron, or an intergenic region (see the <a href="http://qualimap.bioinfo.cipf.es/doc_html/index.html" target="_blank">Qualimap 2 documentation</a>). The inferred genomic origins of RNA-seq reads are presented here as a bar graph showing either the number or percentage of mapped reads in each read dataset that have been assigned to each type of genomic region. This graph can be used to assess the proportion of useful reads in an RNA-seq experiment. That proportion can be reduced by the presence of intron sequences, especially if depletion of ribosomal RNA was used during sample preparation (<a href="https://doi.org/10.1038/nrg3642" target="_blank">Sims et al. 2014</a>). It can also be reduced by off-target transcripts, which are detected in greater numbers at the sequencing depths needed to detect poorly-expressed transcripts (<a href="https://doi.org/10.1101/gr.124321.111" target="_blank">Tarazona et al. 2011</a>).''' self.add_section( name='Genomic origin of reads', anchor='qualimap-reads-genomic-origin', description= 'Classification of mapped reads as originating in exonic, intronic or intergenic regions. 
These can be displayed as either the number or percentage of mapped reads.', helptext=genomic_origin_helptext, plot=bargraph.plot(self.qualimap_rnaseq_genome_results, gorigin_cats, gorigin_pconfig)) if len(self.qualimap_rnaseq_cov_hist) > 0: coverage_profile_helptext = ''' There are currently three main approaches to map reads to transcripts in an RNA-seq experiment: mapping reads to a reference genome to identify expressed transcripts that are annotated (and discover those that are unknown), mapping reads to a reference transcriptome, and <i>de novo</i> assembly of transcript sequences (<a href="https://doi.org/10.1186/s13059-016-0881-8" target="_blank">Conesa et al. 2016</a>). For RNA-seq QC analysis, QualiMap can be used to assess alignments produced by the first of these approaches. For input, it requires a GTF annotation file along with a reference genome, which can be used to reconstruct the exon structure of known transcripts. QualiMap uses this information to calculate the depth of coverage along the length of each annotated transcript. For a set of reads mapped to a transcript, the depth of coverage at a given base position is the number of high-quality reads that map to the transcript at that position (<a href="https://doi.org/10.1038/nrg3642" target="_blank">Sims et al. 2014</a>). QualiMap calculates coverage depth at every base position of each annotated transcript. To enable meaningful comparison between transcripts, base positions are rescaled to relative positions expressed as percentage distance along each transcript (*0%, 1%, …, 99%*). For the set of transcripts with at least one mapped read, QualiMap plots the cumulative mapped-read depth (y-axis) at each relative transcript position (x-axis). This plot shows the gene coverage profile across all mapped transcripts for each read dataset. It provides a visual way to assess positional biases, such as an accumulation of mapped reads at the 3′ end of transcripts, which may indicate poor RNA quality in the original sample (<a href="https://doi.org/10.1186/s13059-016-0881-8" target="_blank">Conesa et al. 2016</a>).''' self.add_section( name='Gene Coverage Profile', anchor='qualimap-genome-fraction-coverage', description= 'Mean distribution of coverage depth across the length of all mapped transcripts.', helptext=coverage_profile_helptext, plot=linegraph.plot( self.qualimap_rnaseq_cov_hist, { 'id': 'qualimap_gene_coverage_profile', 'title': 'Qualimap RNAseq: Coverage Profile Along Genes (total)', 'ylab': 'Coverage', 'xlab': 'Transcript Position (%)', 'ymin': 0, 'xmin': 0, 'xmax': 100, 'tt_label': '<b>{point.x}%</b>: {point.y:.0f}', })) #### General Stats self.general_stats_headers['5_3_bias'] = { 'title': "5'-3' bias", 'format': '{:,.2f}', } self.general_stats_headers['reads_aligned'] = { 'title': '{} Aligned'.format(config.read_count_prefix), 'description': 'Reads Aligned ({})'.format(config.read_count_desc), 'min': 0, 'scale': 'RdBu', 'shared_key': 'read_count', 'modify': lambda x: x * config.read_count_multiplier } # Return the number of reports we found return len(self.qualimap_rnaseq_genome_results.keys())
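A worked example of the locale fix applied in the parser above, on an invented European-style report line: dots (thousand separators) are stripped first, then commas become decimal points, so the numeric regexes can parse the values.

raw = "exonic = 1.234.567 (84,2%)"              # hypothetical European-style line
fixed = raw.replace(".", "").replace(",", ".")  # "exonic = 1234567 (84.2%)"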
def plot_bargraph(self, data, cats=None, pconfig=None): """ Deprecated function. Forwards to the new location. """ from multiqc.plots import bargraph return bargraph.plot(data, cats, pconfig if pconfig is not None else {})
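A short migration note for callers of this deprecated wrapper: it simply forwards its arguments, so callers can import the new module directly.

from multiqc.plots import bargraph
# plot = bargraph.plot(data, cats, pconfig)  # same arguments as the deprecated plot_bargraph()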