Example #1
0
 def bcbio_mirna_stats(self):
     """Parse bcbio seqbuster logs and build miRNA / isomiR bar plots.

     Reads 'bcbio/seqbuster' log files, collects "mirs_*" and "iso_*"
     counts per sample, writes the parsed data to report files and, when
     data was found, stores bargraph plots on self.mirs / self.iso.
     """
     # NOTE: removed unused local `bcbio_data = list()` from the original
     fns = self.find_log_files('bcbio/seqbuster')
     mirs_data = defaultdict(dict)
     mirs_key = OrderedDict()
     iso_data = defaultdict(dict)
     iso_key = OrderedDict()
     for f in fns:
         s_name = self.clean_s_name(f['fn'], root=None)
         with open(os.path.join(f['root'], f['fn'])) as in_handle:
             for line in in_handle:
                 # Lines look like "<metric_name> <count>"
                 cols = line.strip().split()
                 if line.startswith("mirs_"):
                     mirs_key[cols[0]] = {'name': cols[0].replace("_", " ")}
                     mirs_data[s_name][cols[0]] = int(cols[1])
                 if line.startswith("iso_"):
                     iso_key[cols[0]] = {'name': cols[0].replace("_", " ")}
                     iso_data[s_name][cols[0]] = int(cols[1])
     self.write_data_file(mirs_data, "seqbuster_mirs")
     self.write_data_file(iso_data, "seqbuster_isomirs")
     if mirs_data:
         cnfg = {'ylab': '# of miRNAs'}
         cnfg['title'] = "Number of miRNAs with changes"
         self.mirs = bargraph.plot(mirs_data, mirs_key, cnfg)
     if iso_data:
         cnfg = {'ylab': '# of isomiRs'}
         cnfg['title'] = "Number of isomiRs with changes"
         self.iso = bargraph.plot(iso_data, iso_key, cnfg)
Example #2
0
    def bowtie2_alignment_plot (self):
        """ Make the HighCharts HTML to plot the alignment rates """

        # If any single-mate counts were halved during parsing, warn the
        # user about it in the PE plot description
        half_warning = ''
        for s_name in self.bowtie2_data:
            if 'paired_aligned_mate_one_halved' in self.bowtie2_data[s_name] or 'paired_aligned_mate_multi_halved' in self.bowtie2_data[s_name] or 'paired_aligned_mate_none_halved' in self.bowtie2_data[s_name]:
                half_warning = '<em>Please note that single mate alignment counts are halved to tally with pair counts properly.</em>'
        description_text = 'This plot shows the number of reads aligning to the reference in different ways.'

        # Config for the plot
        config = {
            'ylab': '# Reads',
            'cpswitch_counts_label': 'Number of Reads'
        }

        # Two plots, don't mix SE with PE
        if self.num_se > 0:
            sekeys = OrderedDict()
            sekeys['unpaired_aligned_one'] = { 'color': '#20568f', 'name': 'SE mapped uniquely' }
            sekeys['unpaired_aligned_multi'] = { 'color': '#f7a35c', 'name': 'SE multimapped' }
            sekeys['unpaired_aligned_none'] = { 'color': '#981919', 'name': 'SE not aligned' }
            config['id'] = 'bowtie2_se_plot'
            config['title'] = 'Bowtie 2: SE Alignment Scores'
            self.add_section(
                description = description_text,
                helptext = '''
                There are 3 possible types of alignment:
                * **SE mapped uniquely**: Read has only one occurrence in the reference genome.
                * **SE multimapped**: Read has multiple occurrences.
                * **SE not aligned**: Read has no occurrence.
                ''',
                plot = bargraph.plot(self.bowtie2_data, sekeys, config)
            )

        if self.num_pe > 0:
            pekeys = OrderedDict()
            pekeys['paired_aligned_one'] = { 'color': '#20568f', 'name': 'PE mapped uniquely' }
            pekeys['paired_aligned_discord_one'] = { 'color': '#5c94ca', 'name': 'PE mapped discordantly uniquely' }
            pekeys['paired_aligned_mate_one_halved'] = { 'color': '#95ceff', 'name': 'PE one mate mapped uniquely' }
            pekeys['paired_aligned_multi'] = { 'color': '#f7a35c', 'name': 'PE multimapped' }
            pekeys['paired_aligned_discord_multi'] = { 'color': '#dce333', 'name': 'PE discordantly multimapped' }
            pekeys['paired_aligned_mate_multi_halved'] = { 'color': '#ffeb75', 'name': 'PE one mate multimapped' }
            pekeys['paired_aligned_mate_none_halved'] = { 'color': '#981919', 'name': 'PE neither mate aligned' }
            config['id'] = 'bowtie2_pe_plot'
            config['title'] = 'Bowtie 2: PE Alignment Scores'
            # NOTE: 7 categories are plotted above; the original helptext
            # said "6" and omitted the discordantly multimapped bullet
            self.add_section(
                description = "<br>".join([description_text,half_warning]),
                helptext = '''
                There are 7 possible types of alignment:
                * **PE mapped uniquely**: Pair has only one occurrence in the reference genome.
                * **PE mapped discordantly uniquely**: Pair has only one occurrence but not in proper pair.
                * **PE one mate mapped uniquely**: One read of a pair has one occurrence.
                * **PE multimapped**: Pair has multiple occurrences.
                * **PE discordantly multimapped**: Pair has multiple occurrences but not in proper pair.
                * **PE one mate multimapped**: One read of a pair has multiple occurrences.
                * **PE neither mate aligned**: Pair has no occurrence.
                ''',
                plot = bargraph.plot(self.bowtie2_data, pekeys, config)
            )
Example #3
0
    def hisat2_alignment_plot (self):
        """ Make the HighCharts HTML to plot the alignment rates """

        # Split the data into SE and PE
        sedata = {}
        pedata = {}
        for s_name, data in self.hisat2_data.items():
            if 'paired_total' in data:
                # Work on a copy so we don't mutate self.hisat2_data —
                # otherwise calling this method twice would halve the
                # mate counts twice
                data = dict(data)
                # Save half 'pairs' of mate counts
                m_keys = ['unpaired_total', 'unpaired_aligned_none', 'unpaired_aligned_one', 'unpaired_aligned_multi']
                for k in m_keys:
                    if k in data:
                        data[k] = float(data[k]) / 2.0
                pedata[s_name] = data
            else:
                sedata[s_name] = data

        # Two plots, don't mix SE with PE
        if len(sedata) > 0:
            sekeys = OrderedDict()
            sekeys['unpaired_aligned_one'] = { 'color': '#20568f', 'name': 'SE mapped uniquely' }
            sekeys['unpaired_aligned_multi'] = { 'color': '#f7a35c', 'name': 'SE multimapped' }
            sekeys['unpaired_aligned_none'] = { 'color': '#981919', 'name': 'SE not aligned' }
            pconfig = {
                'id': 'hisat2_se_plot',
                'title': 'HISAT2: SE Alignment Scores',
                'ylab': '# Reads',
                'cpswitch_counts_label': 'Number of Reads'
            }
            self.add_section(
                plot = bargraph.plot(sedata, sekeys, pconfig)
            )

        if len(pedata) > 0:
            pekeys = OrderedDict()
            pekeys['paired_aligned_one'] = { 'color': '#20568f', 'name': 'PE mapped uniquely' }
            pekeys['paired_aligned_discord_one'] = { 'color': '#5c94ca', 'name': 'PE mapped discordantly uniquely' }
            pekeys['unpaired_aligned_one'] = { 'color': '#95ceff', 'name': 'PE one mate mapped uniquely' }
            pekeys['paired_aligned_multi'] = { 'color': '#f7a35c', 'name': 'PE multimapped' }
            pekeys['unpaired_aligned_multi'] = { 'color': '#ffeb75', 'name': 'PE one mate multimapped' }
            pekeys['unpaired_aligned_none'] = { 'color': '#981919', 'name': 'PE neither mate aligned' }
            pconfig = {
                'id': 'hisat2_pe_plot',
                'title': 'HISAT2: PE Alignment Scores',
                'ylab': '# Reads',
                'cpswitch_counts_label': 'Number of Reads'
            }
            self.add_section(
                description = '<em>Please note that single mate alignment counts are halved to tally with pair counts properly.</em>',
                plot = bargraph.plot(pedata, pekeys, pconfig)
            )
Example #4
0
    def bbt_simple_plot(self):
        """ Makes a simple bar plot with summed alignment counts for
        each species, stacked. """

        # Sum the alignment counts per organism, subtracting shared hits
        data = OrderedDict()
        cats = OrderedDict()
        for s_name, orgs in self.bbt_data.items():
            sample_counts = OrderedDict()
            for org, counts in orgs.items():
                sample_counts[org] = counts['hits'] - counts['shared']
                # Register each real organism as a category once, with the
                # FASTA extension stripped from its display name
                if org in cats or org in ('multiMatch', 'noMatch'):
                    continue
                lowered = org.lower()
                if lowered.endswith('.fasta'):
                    cname = org[:-6]
                elif lowered.endswith('.fa'):
                    cname = org[:-3]
                else:
                    cname = org
                cats[org] = { 'name': cname }
            data[s_name] = sample_counts

        pconfig = {
            'id': 'biobloom_tools',
            'title': 'BioBloom Tools: Alignment counts per species',
            'ylab': 'Number of hits',
            'hide_zero_cats': False
        }
        # Fixed categories for the special match classes, added last
        cats['multiMatch'] = { 'name': 'Multiple Genomes', 'color': '#820000' }
        cats['noMatch'] = { 'name': 'No Match', 'color': '#cccccc' }

        return bargraph.plot(data, cats, pconfig)
Example #5
0
 def summary_plot(data):
     """Barplot of combined pairs.

     NOTE: the original assigned `cats = OrderedDict()` and then
     immediately overwrote it with a dict literal — the dead assignment
     has been removed (dict literals preserve insertion order on
     Python 3.7+).
     """
     cats = {
         'inniepairs': {
             'name': 'Combined innie pairs',
             'color': '#191970'
         },
         'outiepairs': {
             'name': 'Combined outie pairs',
             'color': '#00A08A'
         },
         'uncombopairs': {
             'name': 'Uncombined pairs',
             'color': '#cd1076'
         },
         'discardpairs': {
             'name': 'Discarded pairs',
             'color': '#ffd700'
         }
     }
     splotconfig = {'id': 'flash_combo_stats_plot',
                    'title': 'FLASh: Read combination statistics',
                    'ylab': 'Number of read pairs',
                    'hide_zero_cats': False }
     return bargraph.plot(data, cats, splotconfig)
Example #6
0
    def tag_info_chart (self):

        """ Make the taginfo.txt plot """

        ## TODO: human chrs on hg19. How will this work with GRCh genome or other, non human, genomes?
        # nice if they are ordered by size
        # BUGFIX: list.append() mutates in place and returns None, so the
        # original `[...].append([...])` set both variables to None (and
        # would have nested the list anyway). Use concatenation instead.
        ucsc = ["chr" + str(i) for i in range(1, 23)] + ["chrX", "chrY", "chrM"]
        ensembl = list(range(1, 23)) + ["X", "Y", "MT"]
        pconfig = {
            'id': 'tagInfo',
            'title': 'Homer: Tag Info Distribution',
            'ylab': 'Tags',
            'cpswitch_counts_label': 'Number of Tags'
        }

        ## check if chromosomes starts with "chr" (UCSC) or "#" (ensembl)
        sample1 = next(iter(self.tagdir_data['taginfo_total']))
        chrFormat = next(iter(self.tagdir_data['taginfo_total'][sample1]))

        if ("chr" in chrFormat):
            chrs = ucsc
        else:
            chrs = ensembl

        return bargraph.plot(self.tagdir_data['taginfo_total'], chrs, pconfig)
Example #7
0
    def theta2_purities_chart (self):
        """ Make the plot showing alignment rates """

        # Category order matters for the stacked bar plot, so build an
        # OrderedDict from an explicit (key, display-name) list
        subclone_labels = [
            ('proportion_germline', 'Germline'),
            ('proportion_tumour_1', 'Tumour Subclone 1'),
            ('proportion_tumour_2', 'Tumour Subclone 2'),
            ('proportion_tumour_3', 'Tumour Subclone 3'),
            ('proportion_tumour_4', 'Tumour Subclone 4'),
            ('proportion_tumour_5', 'Tumour Subclone 5'),
            ('proportion_tumour_gt5', 'Tumour Subclones > 5'),
        ]
        keys = OrderedDict((k, {'name': label}) for k, label in subclone_labels)

        # Config for the plot
        pconfig = {
            'id': 'theta2_purity_plot',
            'title': 'THetA2: Tumour Subclone Purities',
            'cpswitch': False,
            'ymin': 0,
            'ymax': 100,
            'ylab': '% Purity',
            'tt_suffix': '%'
        }

        return bargraph.plot(self.theta2_data, keys, pconfig)
Example #8
0
    def __init__(self, c_id, mod):
        """Build a custom-content module section from a parsed config dict.

        `c_id` is the content ID; `mod` holds 'config' and 'data' for one
        custom-content block. Dispatches on config['plot_type'].
        """
        modname = mod['config'].get('section_name', c_id.replace('_', ' ').title())
        if modname == '' or modname is None:
            modname = 'Custom Content'

        # Initialise the parent object
        super(MultiqcModule, self).__init__(
            name = modname,
            anchor = mod['config'].get('section_anchor', c_id),
            href = mod['config'].get('section_href'),
            info = mod['config'].get('description')
        )

        pconfig = mod['config'].get('pconfig', {})
        if pconfig.get('title') is None:
            pconfig['title'] = modname

        # Table
        if mod['config'].get('plot_type') == 'table':
            pconfig['sortRows'] = pconfig.get('sortRows', False)
            headers = mod['config'].get('headers')
            self.add_section( plot = table.plot(mod['data'], headers, pconfig) )
            self.write_data_file( mod['data'], "multiqc_{}".format(modname.lower().replace(' ', '_')) )

        # Bar plot
        elif mod['config'].get('plot_type') == 'bargraph':
            self.add_section( plot = bargraph.plot(mod['data'], mod['config'].get('categories'), pconfig) )

        # Line plot
        elif mod['config'].get('plot_type') == 'linegraph':
            self.add_section( plot = linegraph.plot(mod['data'], pconfig) )

        # Scatter plot
        elif mod['config'].get('plot_type') == 'scatter':
            self.add_section( plot = scatter.plot(mod['data'], pconfig) )

        # Heatmap
        elif mod['config'].get('plot_type') == 'heatmap':
            self.add_section( plot = heatmap.plot(mod['data'], mod['config'].get('xcats'), mod['config'].get('ycats'), pconfig) )

        # Beeswarm plot
        elif mod['config'].get('plot_type') == 'beeswarm':
            self.add_section( plot = beeswarm.plot(mod['data'], pconfig) )

        # Raw HTML, or a raw image file as HTML — both are passed through as-is
        elif mod['config'].get('plot_type') in ('html', 'image'):
            self.add_section( content = mod['data'] )

        # Not supplied
        elif mod['config'].get('plot_type') is None:
            log.warning("Plot type not found for content ID '{}'".format(c_id))

        # Not recognised
        else:
            log.warning("Error - custom content plot type '{}' not recognised for content ID {}".format(mod['config'].get('plot_type'), c_id))
Example #9
0
    def star_genecount_chart (self):
        """ Make a plot for the ReadsPerGene output """

        # Specify the order of the different possible categories
        keys = OrderedDict()
        keys['N_genes'] =        { 'color': '#2f7ed8', 'name': 'Overlapping Genes' }
        keys['N_noFeature'] =    { 'color': '#0d233a', 'name': 'No Feature' }
        keys['N_ambiguous'] =    { 'color': '#492970', 'name': 'Ambiguous Features' }
        keys['N_multimapping'] = { 'color': '#f28f43', 'name': 'Multimapping' }
        keys['N_unmapped'] =     { 'color': '#7f0000', 'name': 'Unmapped' }

        # Config for the plot
        pconfig = {
            'id': 'star_gene_counts',
            'title': 'STAR: Gene Counts',
            'ylab': '# Reads',
            'cpswitch_counts_label': 'Number of Reads',
            'data_labels': ['Unstranded','Same Stranded','Reverse Stranded']
        }
        datasets = [
            self.star_genecounts_unstranded,
            self.star_genecounts_first_strand,
            self.star_genecounts_second_strand
        ]
        # One category dict per dataset — the original passed four for
        # three datasets; use [keys] * len(datasets) so they always match
        return bargraph.plot(datasets, [keys] * len(datasets), pconfig)
Example #10
0
    def hicup_truncating_chart (self):
        """ Generate the HiCUP Truncated reads plot """

        # Categories in display order
        keys = OrderedDict([
            ('Not_Truncated_Reads', {'color': '#2f7ed8', 'name': 'Not Truncated'}),
            ('Truncated_Read', {'color': '#0d233a', 'name': 'Truncated'}),
        ])

        # Each sample becomes two plot entries, one per read of the pair;
        # the per-read counts live under '<category>_1' / '<category>_2'
        data = {}
        for s_name, counts in self.hicup_data.items():
            for read in ('1', '2'):
                entry = {}
                for k in keys:
                    entry[k] = counts['{}_{}'.format(k, read)]
                data['{} Read {}'.format(s_name, read)] = entry

        # Config for the plot
        config = {
            'id': 'hicup_truncated_reads_plot',
            'title': 'HiCUP: Truncated Reads',
            'ylab': '# Reads',
            'cpswitch_counts_label': 'Number of Reads'
        }

        return bargraph.plot(data, keys, config)
Example #11
0
    def mirtrace_contamination_check(self):
        """ Generate the miRTrace Contamination Check"""

        # A library of 24 colors. Should be enough for this plot
        color_lib = ['rgb(166,206,227)', 'rgb(31,120,180)', 'rgb(178,223,138)', 'rgb(51,160,44)', 'rgb(251,154,153)', 'rgb(227,26,28)', 'rgb(253,191,111)', 'rgb(255,127,0)', 'rgb(202,178,214)', 'rgb(106,61,154)', 'rgb(255,255,153)', 'rgb(177,89,40)', 'rgb(141,211,199)', 'rgb(255,255,179)', 'rgb(190,186,218)', 'rgb(251,128,114)', 'rgb(128,177,211)', 'rgb(253,180,98)', 'rgb(179,222,105)', 'rgb(252,205,229)', 'rgb(217,217,217)', 'rgb(188,128,189)', 'rgb(204,235,197)', 'rgb(255,237,111)']

        # Assign one colour per clade, cycling through the palette if
        # there are more clades than colours. Clade order is taken from
        # the first sample.
        first_sample = next(iter(self.contamination_data))
        keys = OrderedDict()
        for idx, clade in enumerate(self.contamination_data[first_sample]):
            keys[clade] = { 'color': color_lib[idx % len(color_lib)], 'name': clade }

        # Config for the plot
        config = {
            'cpswitch_c_active': False,
            'id': 'mirtrace_contamination_check_plot',
            'title': 'miRTrace: Contamination Check',
            'ylab': '# miRNA detected',
            'cpswitch_counts_label': 'Number of detected miRNA'
        }

        return bargraph.plot(self.contamination_data, keys, config)
Example #12
0
    def hicup_alignment_chart (self):
        """ Generate the HiCUP Aligned reads plot """

        # Categories in display order
        keys = OrderedDict([
            ('Unique_Alignments_Read', {'color': '#2f7ed8', 'name': 'Unique Alignments'}),
            ('Multiple_Alignments_Read', {'color': '#492970', 'name': 'Multiple Alignments'}),
            ('Failed_To_Align_Read', {'color': '#0d233a', 'name': 'Failed To Align'}),
            ('Too_Short_To_Map_Read', {'color': '#f28f43', 'name': 'Too short to map'}),
        ])

        # Each sample becomes two plot entries, one per read of the pair;
        # the per-read counts live under '<category>_1' / '<category>_2'
        data = {}
        for s_name, counts in self.hicup_data.items():
            for read in ('1', '2'):
                entry = {}
                for k in keys:
                    entry[k] = counts['{}_{}'.format(k, read)]
                data['{} Read {}'.format(s_name, read)] = entry

        # Config for the plot
        config = {
            'id': 'hicup_mapping_stats_plot',
            'title': 'HiCUP: Mapping Statistics',
            'ylab': '# Reads',
            'cpswitch_counts_label': 'Number of Reads'
        }

        return bargraph.plot(data, keys, config)
Example #13
0
    def rsem_mapped_reads_plot(self):
        """ Make the rsem assignment rates plot """

        # Plot categories: (key, colour, display name), in stack order
        cat_defs = [
            ('Unique', '#437bb1', 'Aligned uniquely to a gene'),
            ('Multi', '#e63491', 'Aligned to multiple genes'),
            ('Filtered', '#b1084c', 'Filtered due to too many alignments'),
            ('Unalignable', '#7f0000', 'Unalignable reads'),
        ]
        keys = OrderedDict(
            (k, {'color': color, 'name': name}) for k, color, name in cat_defs
        )

        # Config for the plot
        config = {
            'id': 'rsem_assignment_plot',
            'title': 'RSEM: Mapped reads',
            'ylab': '# Reads',
            'cpswitch_counts_label': 'Number of Reads',
            'hide_zero_cats': False
        }

        self.add_section(
            name = 'Mapped Reads',
            anchor = 'rsem_mapped_reads',
            description = 'A breakdown of how all reads were aligned for each sample.',
            plot = bargraph.plot(self.rsem_mapped_data, keys, config)
        )
Example #14
0
    def bowtie_alignment_plot (self):
        """ Make the HighCharts HTML to plot the alignment rates """

        # Specify the order of the different possible categories
        keys = OrderedDict()
        keys['reads_aligned'] = { 'color': '#8bbc21', 'name': 'Aligned' }
        keys['multimapped'] =   { 'color': '#2f7ed8', 'name': 'Multimapped' }
        keys['not_aligned'] =   { 'color': '#0d233a', 'name': 'Not aligned' }

        # Config for the plot
        config = {
            'id': 'bowtie1_alignment',
            'title': 'Bowtie 1: Alignment Scores',
            'ylab': '# Reads',
            'cpswitch_counts_label': 'Number of Reads'
        }

        # Helptext spelling fixed ("occurence" -> "occurrence")
        self.add_section(
            description = 'This plot shows the number of reads aligning to the reference in different ways.',
            helptext = '''
            There are 3 possible types of alignment:
            * **Aligned**: Read has only one occurrence in the reference genome.
            * **Multimapped**: Read has multiple occurrences.
            * **Not aligned**: Read has no occurrence.
            ''',
            plot = bargraph.plot(self.bowtie_data, keys, config)
        )
Example #15
0
    def slamdunkUtrRatesPlot (self):
        """ Generate the UTR rates plot """

        # One category per conversion type, coloured from self.plot_cols
        conversions = ['T>C', 'A>T', 'A>G', 'A>C', 'T>A', 'T>G', 'G>A', 'G>T', 'G>C', 'C>A', 'C>T', 'C>G']
        cats = OrderedDict(
            (conv, {'color': self.plot_cols[i]}) for i, conv in enumerate(conversions)
        )

        pconfig = {
            'id': 'slamdunk_utrratesplot',
            'title': 'Slamdunk: Overall conversion rates per UTR',
            'cpswitch': False,
            'cpswitch_c_active': False,
            'ylab': 'Number of conversions',
            'stacking': 'normal',
            'tt_decimals': 2,
            'tt_suffix': '%',
            'tt_percentages': False,
            'hide_zero_cats': False
        }

        self.add_section (
            name = 'Conversion rates per UTR',
            anchor = 'slamdunk_utr_rates',
            description = """This plot shows the individual conversion rates for all UTRs
                        (see the <a href="http://t-neumann.github.io/slamdunk/docs.html#utrrates" target="_blank">slamdunk docs</a>).""",
            plot = bargraph.plot(self.utrates_data, cats, pconfig)
        )
Example #16
0
    def hicpro_capture_chart (self):
        """ Generate Capture Hi-C plot"""

        keys = OrderedDict()
        keys['valid_pairs_on_target_cap_cap'] = { 'color': '#0039e6', 'name': 'Capture-Capture interactions' }
        keys['valid_pairs_on_target_cap_rep']  = { 'color': '#809fff', 'name': 'Capture-Reporter interactions' }
        keys['valid_pairs_off_target'] = { 'color': '#cccccc', 'name': 'Off-target valid pairs' }

        # Skip the plot entirely if no sample has any capture metrics.
        # (The original summed single-element lists — `sum([1 if ... else 0])`
        # — which worked but was a no-op wrapper.)
        num_samples = sum(
            k in self.hicpro_data[s_name]
            for s_name in self.hicpro_data
            for k in keys
        )
        if num_samples == 0:
            return False

        # Config for the plot
        config = {
            'id': 'hicpro_cap_plot',
            'title': 'HiC-Pro: Capture Statistics',
            'ylab': '# Pairs',
            'cpswitch_counts_label': 'Number of Pairs'
        }

        return bargraph.plot(self.hicpro_data, keys, config)
Example #17
0
    def hicpro_as_chart (self):
        """ Generate Allele-specific plot"""

        keys = OrderedDict()
        keys['Valid_pairs_from_ref_genome_(1-1)'] = { 'color': '#e6550d', 'name': 'Genome1 specific read pairs (1-1)' }
        keys['Valid_pairs_from_ref_genome_with_one_unassigned_mate_(0-1/1-0)'] = { 'color': '#fdae6b', 'name': 'Genome1 with one unassigned mate (0-1/1-0)' }
        keys['Valid_pairs_from_alt_genome_(2-2)']  = { 'color': '#756bb1', 'name': 'Genome2 specific read pairs (2-2)' }
        keys['Valid_pairs_from_alt_genome_with_one_unassigned_mate_(0-2/2-0)'] = { 'color': '#bcbddc', 'name': 'Genome2 with one unassigned mate (0-2/2-0)' }
        # Display label fixed: "(1-2/2/1)" -> "(1-2/2-1)" to match the key
        keys['Valid_pairs_from_alt_and_ref_genome_(1-2/2-1)'] = { 'color': '#a6611a', 'name': 'Trans homologuous read pairs (1-2/2-1)' }
        keys['Valid_pairs_with_both_unassigned_mated_(0-0)'] = { 'color': '#cccccc', 'name': 'Unassigned read pairs' }
        keys['Valid_pairs_with_at_least_one_conflicting_mate_(3-)'] = { 'color': '#a9a2a2', 'name': 'Conflicting read pairs' }

        # Skip the plot if the allele-specific analysis was not run, i.e.
        # no sample contains any of the expected metrics. (The original
        # summed single-element lists, which was a no-op wrapper.)
        num_samples = sum(
            k in self.hicpro_data[s_name]
            for s_name in self.hicpro_data
            for k in keys
        )
        if num_samples == 0:
            return False

        # Config for the plot
        config = {
            'id': 'hicpro_asan_plot',
            'title': 'HiC-Pro: Allele-specific Statistics',
            'ylab': '# Pairs',
            'cpswitch_counts_label': 'Number of Pairs'
        }

        return bargraph.plot(self.hicpro_data, keys, config)
Example #18
0
    def hicpro_mapping_chart (self):
        """ Generate the HiC-Pro Aligned reads plot """

        # Specify the order of the different possible categories
        keys = OrderedDict()
        keys['Full_Alignments_Read']   = { 'color': '#005ce6', 'name': 'Full reads Alignments' }
        keys['Trimmed_Alignments_Read'] = { 'color': '#3385ff', 'name': 'Trimmed reads Alignments' }
        keys['Failed_To_Align_Read']     = { 'color': '#a9a2a2', 'name': 'Failed To Align' }

        # One dataset per read of the pair
        data = [{},{}]
        for s_name in self.hicpro_data:
            for r in [1,2]:
                data[r-1]['{} [R{}]'.format(s_name, r)] = {
                    'Full_Alignments_Read': self.hicpro_data[s_name]['global_R{}'.format(r)],
                    'Trimmed_Alignments_Read': self.hicpro_data[s_name]['local_R{}'.format(r)],
                    # Failed = total minus mapped
                    'Failed_To_Align_Read': int(self.hicpro_data[s_name]['total_R{}'.format(r)]) - int(self.hicpro_data[s_name]['mapped_R{}'.format(r)])
                }

        # Config for the plot.
        # NOTE: the original dict had 'ylab' twice; only the last value
        # survived, so the duplicate has been removed. The per-dataset
        # 'ylab' in data_labels overrides it anyway.
        config = {
            'id': 'hicpro_mapping_stats_plot',
            'title': 'HiC-Pro: Mapping Statistics',
            'ylab': '# Reads: Read 1',
            'data_labels': [
                {'name': 'Read 1', 'ylab': '# Reads: Read 1'},
                {'name': 'Read 2', 'ylab': '# Reads: Read 2'}
            ]
        }

        return bargraph.plot(data, [keys, keys], config)
Example #19
0
    def transcript_associated_plot (self):
        """ Plot a bargraph showing the Transcript-associated reads  """

        # Bar graph categories, in display order
        keys = OrderedDict([
            ('Exonic Rate', {'name': 'Exonic', 'color': '#2f7ed8'}),
            ('Intronic Rate', {'name': 'Intronic', 'color': '#8bbc21'}),
            ('Intergenic Rate', {'name': 'Intergenic', 'color': '#0d233a'}),
        ])

        # Config for the plot: rates are fractions, so fix the axis to [0, 1]
        pconfig = {
            'id': 'rna_seqc_position_plot',
            'title': 'RNA-SeQC: Transcript-associated reads',
            'ylab': 'Ratio of Reads',
            'cpswitch': False,
            'ymax': 1,
            'ymin': 0,
            'tt_decimals': 3,
            'cpswitch_c_active': False
        }
        self.add_section (
            name = 'Transcript-associated reads',
            anchor = 'Transcript_associated',
            helptext = 'All of the above rates are per mapped read. Exonic Rate is the fraction mapping within exons. '
                       'Intronic Rate is the fraction mapping within introns. '
                       'Intergenic Rate is the fraction mapping in the genomic space between genes. ',
            plot = bargraph.plot(self.rna_seqc_metrics, keys, pconfig)
        )
Example #20
0
    def parse_samtools_rmdup(self):
        """ Find Samtools rmdup logs and parse their data """

        self.samtools_rmdup = dict()
        # Example line:
        # [bam_rmdupse_core] 26602816 / 103563641 = 0.2569 in library '   '
        # Raw string (avoids invalid-escape warnings) and compiled once,
        # since the same pattern is applied to every line of every file
        dups_re = re.compile(r"\[bam_rmdups?e?_core\] (\d+) / (\d+) = (\d+\.\d+) in library '(.*)'")
        for f in self.find_log_files('samtools/rmdup', filehandles=True):
            s_name = f['s_name']
            for l in f['f']:
                match = dups_re.search(l)
                if match:
                    # A non-empty library name takes precedence as the sample name
                    library_name = match.group(4).strip()
                    if library_name != '':
                        s_name = library_name
                    if s_name in self.samtools_rmdup:
                        log.debug("Duplicate sample name found in {}! Overwriting: {}".format(f['fn'], s_name))
                    self.add_data_source(f, s_name)
                    n_dups = int(match.group(1))
                    n_tot = int(match.group(2))
                    self.samtools_rmdup[s_name] = {
                        'n_dups': n_dups,
                        'n_tot': n_tot,
                        'n_unique': n_tot - n_dups,
                        'pct_dups': float(match.group(3)) * 100,
                    }

        # Filter to strip out ignored sample names
        self.samtools_rmdup = self.ignore_samples(self.samtools_rmdup)

        if len(self.samtools_rmdup) > 0:
            # Write parsed report data to a file
            self.write_data_file(self.samtools_rmdup, 'multiqc_samtools_rmdup')

            # Make a bar plot showing duplicates
            keys = OrderedDict()
            keys['n_unique'] = {'name': 'Non-duplicated reads'}
            keys['n_dups'] = {'name': 'Duplicated reads'}
            pconfig = {
                'id': 'samtools_rmdup_plot',
                'title': 'Samtools rmdup: Duplicate alignments',
                'yDecimals': False
            }
            self.add_section (
                name = 'Duplicates removed',
                anchor = 'samtools-rmdup',
                plot = bargraph.plot(self.samtools_rmdup, keys, pconfig)
            )

            # Add a column to the General Stats table
            stats_headers = OrderedDict()
            stats_headers['pct_dups'] = {
                'title': '% Dups',
                'description': 'Percent of duplicate alignments',
                'min': 0,
                'max': 100,
                'suffix': '%',
                'scale': 'OrRd'
            }
            self.general_stats_addcols(self.samtools_rmdup, stats_headers, 'Samtools rmdup')

        return len(self.samtools_rmdup)
Example #21
0
    def chart_align_strand(self):
        """Plot the distribution of mapping strand vs bisulfite-conversion strand,
        as separate datasets for read 1 and read 2."""

        # Collect per-sample strand counts for each read of the pair
        pd1 = {}
        pd2 = {}
        for sid, dd in self.mdata['align_strand'].items():
            pd1[sid] = dd['read1']
            pd2[sid] = dd['read2']

        # Category definitions shared by both datasets; build a fresh
        # OrderedDict per dataset (as the original did with two literals)
        strand_defs = [
            ('++', '++: Waston-Aligned, Waston-Bisulfite Conversion', '#F53855'),
            ('+-', '+-: Waston-Aligned, Crick-Bisulfite Conversion', '#E37B40'),
            ('-+', '-+: Crick-Aligned, Waston-Bisulfite Conversion', '#46B29D'),
            ('--', '--: Crick-Aligned, Crick-Bisulfite Conversion', '#324D5C'),
        ]

        def _strand_cats():
            return OrderedDict(
                (strand, {'name': name, 'color': color})
                for strand, name, color in strand_defs
            )

        pconfig = {
            'id': 'biscuit_strands',
            'title': 'BISCUIT: Mapping Strand Distribution',
            'ylab': 'Number of Reads',
            'cpswitch_c_active': True,
            'cpswitch_counts_label': '# Reads',
            'data_labels': [
                {'name': 'Read 1', },
                {'name': 'Read 2', }]
        }

        self.add_section(
            name='Mapping Strand Distribution',
            anchor='biscuit-strands',
            description = "This plot shows the distribution of strand of mapping and strand of bisulfite conversion.",
            helptext="Most bisulfite libraries has read 1 goes to parent `++` or `--` and read 2 goes to daughter/synthesized `+-` or `-+`. PBAT or most single-cell/low input libraries typically don't observe this rule.",
            plot = bargraph.plot([pd1, pd2], [_strand_cats(), _strand_cats()], pconfig)
        )
Example #22
0
    def macs_filtered_reads_plot(self):
        """Bar plot of remaining vs filtered fragments for each MACS2 control/treatment sample."""
        pdata = dict()
        required = [
            'control_fragments_total',
            'control_fragments_after_filtering',
            'treatment_fragments_total',
            'treatment_fragments_after_filtering',
        ]
        for s_name, d in self.macs_data.items():
            # Only plot samples that report all four fragment counts
            if not all(c in d for c in required):
                continue
            for group in ('control', 'treatment'):
                total = d['{}_fragments_total'.format(group)]
                kept = d['{}_fragments_after_filtering'.format(group)]
                pdata['{}: {}'.format(s_name, group.capitalize())] = {
                    'fragments_filtered': total - kept,
                    'fragments_not_filtered': kept,
                }

        # Specify the order of the different possible categories
        cats = OrderedDict()
        cats['fragments_not_filtered'] = {'color': '#437BB1', 'name': 'Remaining fragments'}
        cats['fragments_filtered'] = {'color': '#B1084C', 'name': 'Filtered fragments'}

        # Config for the plot
        pconfig = {
            'id': 'macs2_filtered',
            'title': 'MACS2: Filtered Fragments',
            'ylab': '# Fragments',
            'cpswitch_counts_label': 'Number of Fragments',
            'hide_zero_cats': False
        }

        self.add_section(
            plot=bargraph.plot(pdata, cats, pconfig)
        )
Example #23
0
    def adapter_removal_retained_chart(self):
        """Bar plot of retained vs discarded reads from Adapter Removal logs."""

        pconfig = {
            'title': 'Adapter Removal: Discarded Reads',
            'id': 'ar_retained_plot',
            'ylab': '# Reads',
            'hide_zero_cats': False,
            'cpswitch_counts_label': 'Number of Reads'
        }

        # Category order matters for the stacked bars; pair-specific and
        # collapsed-read categories only appear when such data was seen.
        paired = self.__any_paired
        cats_pec = OrderedDict()
        if paired:
            cats_pec['retained_reads'] = {'name': 'Retained Read Pairs'}
        cats_pec['singleton_m1'] = {'name': 'Singleton R1'}
        if paired:
            cats_pec['singleton_m2'] = {'name': 'Singleton R2'}
            if self.__any_collapsed:
                cats_pec['full-length_cp'] = {'name': 'Full-length Collapsed Pairs'}
                cats_pec['truncated_cp'] = {'name': 'Truncated Collapsed Pairs'}
        cats_pec['discarded_m1'] = {'name': 'Discarded R1'}
        if paired:
            cats_pec['discarded_m2'] = {'name': 'Discarded R2'}

        self.add_section(
            name='Retained and Discarded Paired-End Collapsed',
            anchor='adapter_removal_retained_plot',
            description='The number of retained and discarded reads.',
            plot=bargraph.plot(self.adapter_removal_data, cats_pec, pconfig)
        )
Example #24
0
    def qorts_splice_loci_barplot (self):
        """ Make the HighCharts HTML to plot the qorts splice loci """
        # Specify the order of the different possible categories
        keys = [
            'SpliceLoci_Known_ManyReads',
            'SpliceLoci_Known_FewReads',
            'SpliceLoci_Known_NoReads',
            'SpliceLoci_Novel_ManyReads',
            'SpliceLoci_Novel_FewReads',
        ]
        cats = OrderedDict()
        for k in keys:
            # 'SpliceLoci_Known_ManyReads' -> 'Known: ManyReads' -> 'Known: Many Reads'
            name = k.replace('SpliceLoci_', '').replace('_', ': ')
            # Raw strings: '\g' in a plain literal is an invalid escape
            # sequence (DeprecationWarning, a SyntaxError in future Pythons)
            name = re.sub(r"([a-z])([A-Z])", r"\g<1> \g<2>", name)
            cats[k] = {'name': name}

        # Config for the plot
        pconfig = {
            'id': 'qorts_splice_loci',
            'title': 'QoRTs: Splice Loci',
            'ylab': '# Splice Loci',
            'cpswitch_counts_label': 'Number of Splice Loci',
            'hide_zero_cats': False
        }

        self.add_section(
            name = "Splice Loci",
            description = "This plot shows the number of splice junction loci of each type that appear in the sample's reads.",
            helptext = '''
            The [QoRTs vignette](http://hartleys.github.io/QoRTs/doc/QoRTs-vignette.pdf) describes the categories in this plot as follows:
            
            * **Known**: The splice junction locus is found in the supplied transcript annotation gtf file.
            * **Novel**: The splice junction locus is NOT found in the supplied transcript annotation gtf file.
            * **Known: Few reads**: The locus is known, and is only covered by 1-3 read-pairs.
            * **Known: Many reads**: The locus is known, and is covered by 4 or more read-pairs.
            * **Novel: Few reads**: The locus is novel, and is only covered by 1-3 read-pairs.
            * **Novel: Many reads**: The locus is novel, and is covered by 4 or more read-pairs
            
            _What it means and what to look for:_
            
            This plot can be used to detect a number of anomalies. For example:
            whether mapping or sequencing artifacts caused a disproportionate discovery of novel splice junctions in
            one sample or batch. It can also be used as an indicator of the comprehensiveness the genome annotation.
            Replicates that are obvious outliers may have sequencing/technical issues causing false detection of splice
            junctions.
            
            Abnormalities in the splice junction rates are generally a symptom of larger issues which will generally be
            picked up by other metrics. Numerous factors can reduce the efficacy by which aligners map across splice
            junctions, and as such these plots become very important if the intended downstream analyses include
            transcript assembly, transcript deconvolution, differential splicing, or any other form of analysis that in
            some way involves the splice junctions themselves. These plots can be used to assess whether other minor
            abnormalities observed in the other plots are of sufficient severity to impact splice junction mapping and
            thus potentially compromise such analyses.
            ''',
            plot = bargraph.plot(self.qorts_data, cats, pconfig)
        )
Example #25
0
    def chart_align_mapq(self):
        """Add two BISCUIT report sections: a mapping summary bar plot and a
        mapping-quality distribution line plot.

        Reads ``self.mdata['align_mapq']``: per-sample dicts mapping a mapQ
        value (as a string) or the literal key 'unmapped' to a read count.
        """

        # fraction of optimally mapped reads
        pd = {}
        for sid, dd in self.mdata['align_mapq'].items():
            # NOTE(review): 'UAligned' starts at 1 rather than 0 -- this
            # inflates the unaligned count by one read per sample. Presumably
            # intentional (e.g. to avoid an all-zero category), but confirm.
            pd[sid] = {'OAligned':0, 'SAligned':0, 'UAligned':1}
            for mapq, cnt in dd.items():
                if mapq == 'unmapped':
                    pd[sid]['UAligned'] += int(cnt)
                elif int(mapq) >= 40:
                    # mapQ >= 40 counts as "optimally" aligned
                    pd[sid]['OAligned'] += int(cnt)
                else:
                    pd[sid]['SAligned'] += int(cnt)

        self.add_section(
            name = 'Mapping Summary',
            anchor = 'biscuit-mapping',
            description = 'This shows the fraction of optimally aligned reads, which is defined by mapQ >= 40.',
            helptext = 'A good library should have high fraction of reads optimally aligned. Suboptimally aligned reads include both nonunique alignments and imperfect alignments.',
            plot = bargraph.plot(pd, OrderedDict([
                ('OAligned', {'name':'Optimally Aligned Reads'}),
                ('SAligned', {'name':'Suboptimally Aligned Reads'}),
                ('UAligned', {'name':'Unaligned Reads'})
            ]), {'id':'biscuit_mapping_summary',
                 'title':'BISCUIT: Mapping Summary',
                 'ylab':'Number of Reads',
                 'cpswitch_counts_label': '# Reads'
            })
        )

        # Mapping quality together in one plot
        # Per-sample total of *mapped* reads, used to express each mapQ bin
        # as a percentage below.
        total = {}
        for sid, dd in self.mdata['align_mapq'].items():
            total[sid] = sum([int(cnt) for _, cnt in dd.items() if _ != "unmapped"])

        # Percentage of mapped reads at each mapQ value 0..60 (0 when absent)
        pd_mapping = {}
        for sid, dd in self.mdata['align_mapq'].items():
            mapqcnts = []
            for mapq in range(61):
                if str(mapq) in dd:
                    mapqcnts.append(float(dd[str(mapq)])/total[sid]*100)
                else:
                    mapqcnts.append(0)
            pd_mapping[sid] = dict(zip(range(61), mapqcnts))

        self.add_section(
            name = 'Mapping Quality Distribution',
            anchor = 'biscuit-mapq',
            description = "This plot shows the distribution of primary mapping quality.",
            plot = linegraph.plot(pd_mapping,
                {'id':'biscuit_mapping',
                 'title': 'BISCUIT: Mapping Information', 
                 'ymin': 0, 'yLabelFormat': '{value}%', 
                 'tt_label': '<strong>Q{point.x}:</strong> {point.y:.2f}% of reads',
                 'name':'Mapping Quality', 'ylab': '% Primary Mapped Reads','xlab': 'Mapping Quality'}))
Example #26
0
    def read_count_plot(self):
        """Stacked bar plot of total read counts, split into unique/duplicate where available."""
        pconfig = {
            'id': 'fastqc_sequence_counts_plot',
            'title': 'FastQC: Sequence Counts',
            'ylab': 'Number of reads',
            'cpswitch_counts_label': 'Number of reads',
            'hide_zero_cats': False
        }
        plot_data = dict()
        seen_dups = False
        seen_total_only = False
        for s_name, fq in self.fastqc_data.items():
            stats = fq['basic_statistics']
            plot_data[s_name] = dict()
            try:
                dup_frac = (100.0 - float(stats['total_deduplicated_percentage'])) / 100.0
                n_dup = int(dup_frac * stats['Total Sequences'])
                plot_data[s_name]['Duplicate Reads'] = n_dup
                plot_data[s_name]['Unique Reads'] = stats['Total Sequences'] - n_dup
                seen_dups = True
            except KeyError:
                # Older versions of FastQC don't have duplicate reads
                plot_data[s_name] = { 'Total Sequences': stats['Total Sequences'] }
                seen_total_only = True

        pcats = list()
        duptext = ''
        if seen_total_only:
            pcats.append('Total Sequences')
        if seen_dups:
            pcats.extend(['Unique Reads', 'Duplicate Reads'])
            duptext = ' Duplicate read counts are an estimate only.'
        if seen_total_only and not seen_dups:
            # Single category: legend and percentage switch add nothing
            pconfig['use_legend'] = False
            pconfig['cpswitch'] = False

        self.add_section(
            name='Sequence Counts',
            anchor='fastqc_sequence_counts',
            description='Sequence counts for each sample.' + duptext,
            helptext='''
            This plot show the total number of reads, broken down into unique and duplicate
            if possible (only more recent versions of FastQC give duplicate info).

            You can read more about duplicate calculation in the
            [FastQC documentation](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/3%20Analysis%20Modules/8%20Duplicate%20Sequences.html).
            A small part has been copied here for convenience:

            _Only sequences which first appear in the first 100,000 sequences
            in each file are analysed. This should be enough to get a good impression
            for the duplication levels in the whole file. Each sequence is tracked to
            the end of the file to give a representative count of the overall duplication level._

            _The duplication detection requires an exact sequence match over the whole length of
            the sequence. Any reads over 75bp in length are truncated to 50bp for this analysis._
            ''',
            plot=bargraph.plot(plot_data, pcats, pconfig)
        )
Example #27
0
    def add_barplot(self):
        """Generate the Samblaster duplicate/non-duplicate bar plot section."""
        categories = OrderedDict([
            ('n_nondups', {'name': 'Non-duplicates'}),
            ('n_dups', {'name': 'Duplicates'}),
        ])
        plot_config = {
            'id': 'samblaster_duplicates',
            'title': 'Samblaster: Number of duplicate reads',
        }
        self.add_section(plot=bargraph.plot(self.samblaster_data, categories, plot_config))
Example #28
0
    def featureCounts_chart(self):
        """Build and return the featureCounts assignment-rates bar plot."""
        plot_config = {
            'id': 'featureCounts_assignment_plot',
            'title': 'featureCounts: Assignments',
            'ylab': '# Reads',
            'cpswitch_counts_label': 'Number of Reads',
        }
        # Return the bar graph so the caller can place it in a report section
        return bargraph.plot(self.featurecounts_data, self.featurecounts_keys, plot_config)
Example #29
0
    def qorts_alignment_barplot (self):
        """ Alignment statistics bar plot """
        # Specify the order of the different possible categories
        keys = [
            'ReadPairs_UniqueGene_CDS',
            'ReadPairs_UniqueGene_UTR',
            'ReadPairs_AmbigGene',
            'ReadPairs_NoGene_Intron',
            'ReadPairs_NoGene_OneKbFromGene',
            'ReadPairs_NoGene_TenKbFromGene',
            'ReadPairs_NoGene_MiddleOfNowhere'
        ]
        cats = OrderedDict()
        for k in keys:
            # 'ReadPairs_NoGene_Intron' -> 'NoGene: Intron' -> 'No Gene: Intron'
            name = k.replace('ReadPairs_', '').replace('_', ': ')
            # Raw strings: '\g' in a plain literal is an invalid escape
            # sequence (DeprecationWarning, a SyntaxError in future Pythons)
            name = re.sub(r"([a-z])([A-Z])", r"\g<1> \g<2>", name)
            cats[k] = {'name': name}

        # Config for the plot
        pconfig = {
            'id': 'qorts_alignments',
            'title': 'QoRTs: Alignment Locations',
            'ylab': '# Read Pairs',
            'cpswitch_counts_label': 'Number of Read Pairs',
            'hide_zero_cats': False
        }

        self.add_section(
            name = "Alignments",
            description = "This plot displays the rate for which the sample's read-pairs are assigned to the different categories.",
            helptext = '''
            The [QoRTs vignette](http://hartleys.github.io/QoRTs/doc/QoRTs-vignette.pdf) describes the categories in this plot as follows:
            
            * **Unique Gene**: The read-pair overlaps with the exonic segments of one and only one gene. For many
              downstream analyses tools, such as DESeq, DESeq2 and EdgeR, only read-pairs in this category
              are used.
            * **Ambig Gene**: The read-pair overlaps with the exons of more than one gene.
            * **No Gene: Intronic**: The read-pair does not overlap with the exons of any annotated gene, but appears
              in a region that is bridged by an annotated splice junction.
            * **No Gene: One kb From Gene**: The read-pair does not overlap with the exons of any annotated gene, but is
              within 1 kilobase from the nearest annotated gene.
            * **No Gene: Ten kb From Gene**: The read-pair does not overlap with the exons of any annotated gene, but
              is within 10 kilobases from the nearest annotated gene.
            * **No Gene: Middle Of Nowhere**: The read-pair does not overlap with the exons of any annotated gene,
              and is more than 10 kilobases from the nearest annotated gene.
            
            _What it means and what to look for:_
            
            Outliers in these plots can indicate biological variations or the presence of large mapping problems.
            They may also suggest the presence of large, highly-expressed, unannotated transcripts or genes.
            ''',
            plot = bargraph.plot(self.qorts_data, cats, pconfig)
        )
Example #30
0
    def overrepresented_sequences (self):
        """Sum the percentages of overrepresented sequences and display them in a bar plot"""

        data = dict()
        for s_name in self.fastqc_data:
            data[s_name] = dict()
            try:
                # Collect the percentages once, then take max/sum
                pcts = [float(d['percentage']) for d in self.fastqc_data[s_name]['overrepresented_sequences']]
                max_pcnt = max(pcts)
                total_pcnt = sum(pcts)
                data[s_name]['total_overrepresented'] = total_pcnt
                data[s_name]['top_overrepresented'] = max_pcnt
                data[s_name]['remaining_overrepresented'] = total_pcnt - max_pcnt
            except KeyError:
                if self.fastqc_data[s_name]['statuses']['overrepresented_sequences'] == 'pass':
                    data[s_name]['total_overrepresented'] = 0
                    data[s_name]['top_overrepresented'] = 0
                    data[s_name]['remaining_overrepresented'] = 0
                else:
                    # Sample left as an empty dict; handled below with .get()
                    log.debug("Couldn't find data for {}, invalid Key".format(s_name))

        cats = OrderedDict()
        cats['top_overrepresented'] = { 'name': 'Top over-represented sequence' }
        cats['remaining_overrepresented'] = { 'name': 'Sum of remaining over-represented sequences' }

        # Config for the plot
        # NOTE(review): the plot id contains a typo ('sequencesi') -- kept
        # as-is since changing it would alter the report element id.
        pconfig = {
            'id': 'fastqc_overrepresented_sequencesi_plot',
            'title': 'FastQC: Overrepresented sequences',
            'ymin': 0,
            'yCeiling': 100,
            'yMinRange': 20,
            'tt_decimals': 2,
            'tt_suffix': '%',
            'tt_percentages': False,
            'ylab_format': '{value}%',
            'cpswitch': False,
            'ylab': 'Percentage of Total Sequences'
        }

        # Check if any samples have more than 1% overrepresented sequences, else don't make plot.
        # Bug fix: samples that failed to parse leave an empty dict in `data`
        # (KeyError on the old bare lookup), and an empty `data` made the
        # bare max() raise ValueError.
        if not data or max(x.get('total_overrepresented', 0) for x in data.values()) < 1:
            plot_html = '<div class="alert alert-info">{} samples had less than 1% of reads made up of overrepresented sequences</div>'.format(len(data))
        else:
            plot_html = bargraph.plot(data, cats, pconfig)

        self.add_section (
            name = 'Overrepresented sequences',
            anchor = 'fastqc_overrepresented_sequences',
            description = 'The total amount of overrepresented sequences found in each library. ' +
                    'See the <a href="http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/3%20Analysis%20Modules/9%20Overrepresented%20Sequences.html" target="_bkank">FastQC help for further information</a>.',
            plot  = plot_html
        )
Example #31
0
    def reads_by_quality_plot(self):
        """Make the HighCharts HTML to plot the reads by quality"""

        def _get_total_reads(data_dict):
            # Return (total read count, stat type) for the first stat type
            # present in this sample's data, or (None, None) if absent.
            for stat_type in self._stat_types:
                total_key = f"Number of reads_{stat_type}"
                if total_key in data_dict:
                    return data_dict[total_key], stat_type
            return None, None

        bar_data = {}
        stat_type = "unrecognized"
        # Order of keys, from >Q5 to >Q15
        _range_names = {
            "&gt;Q5": "&lt;Q5",
            "&gt;Q7": "Q5-7",
            "&gt;Q10": "Q7-10",
            "&gt;Q12": "Q10-12",
            "&gt;Q15": "Q12-15",
            "rest": "&gt;Q15",
        }
        for s_name, data_dict in self.nanostat_data.items():
            reads_total, stat_type = _get_total_reads(data_dict)
            # Prefer "seq summary" data over "aligned" when a sample has both.
            # Bug fix: these log messages were missing the f-string prefix,
            # so '{s_name}' was logged literally.
            if s_name in bar_data and stat_type == "aligned":
                log.debug(
                    f"Sample '{s_name}' duplicated in the quality plot - ignoring aligned data"
                )
                continue
            elif s_name in bar_data and stat_type == "seq summary":
                log.debug(
                    f"Sample '{s_name}' duplicated in the quality plot - overwriting with seq summary data"
                )
            bar_data[s_name] = {}

            # Convert cumulative ">Qx" counts into disjoint quality bins by
            # successive subtraction; the "rest" bin is >Q15 directly.
            prev_reads = reads_total
            for k, range_name in _range_names.items():
                if k != "rest":
                    data_key = f"{k}_{stat_type}"
                    reads_gt = data_dict[data_key]

                    bar_data[s_name][range_name] = prev_reads - reads_gt

                    if bar_data[s_name][range_name] < 0:
                        log.error(
                            f"Error on {s_name} {range_name} {data_key} . Negative number of reads"
                        )
                    prev_reads = reads_gt
                else:
                    data_key = f"&gt;Q15_{stat_type}"
                    bar_data[s_name][range_name] = data_dict[data_key]

        # Categories in reverse order so the highest quality stacks first
        cats = OrderedDict()
        keys = reversed(list(_range_names.values()))
        colours = mqc_colour.mqc_colour_scale("RdYlGn-rev", 0,
                                              len(_range_names))
        for idx, k in enumerate(keys):
            cats[k] = {
                "name": "Reads " + k,
                "color": colours.get_colour(idx, lighten=1)
            }

        # Config for the plot
        config = {
            "id": "nanostat_quality_dist",
            "title": "NanoStat: Reads by quality",
            "ylab": "# Reads",
            "cpswitch_counts_label": "Number of Reads",
        }

        # Add the report section
        self.add_section(
            name="Reads by quality",
            anchor="nanostat_read_qualities",
            description=
            "Read counts categorised by read quality (phred score).",
            helptext="""
                Sequencing machines assign each generated read a quality score using the
                [Phred scale](https://en.wikipedia.org/wiki/Phred_quality_score).
                The phred score represents the liklelyhood that a given read contains errors.
                So, high quality reads have a high score.

                Data may come from NanoPlot reports generated with sequencing summary files or alignment stats.
                If a sample has data from both, the sequencing summary is preferred.
            """,
            plot=bargraph.plot(bar_data, cats, config),
        )
Example #32
0
    def quast_predicted_genes_barplot(self):
        """
        Make a bar plot showing the number and length of predicted genes
        for each assembly.

        Returns the bar graph, or None when no thresholds were found.
        """

        # Prep the data
        # Extract the gene-length thresholds given to quast with
        # "--gene-thresholds" from the report column headers.
        prefix = '# predicted genes (>= '
        suffix = ' bp)'
        all_thresholds = sorted({
            int(key[len(prefix):-len(suffix)])
            for d in self.quast_data.values()
            for key in d
            if key.startswith(prefix)
        })

        data = {}
        ourpat = '>= {}{} bp'
        theirpat = prefix + "{}" + suffix
        # The thresholds are derived from all samples, so they are the same
        # for every sample -- previously they were needlessly recomputed
        # inside this loop (accidental O(n^2)). With fewer than two
        # thresholds there are no ranges, so no per-sample data is produced.
        if len(all_thresholds) >= 2:
            for s_name, d in self.quast_data.items():
                p = dict()
                try:
                    # Top (open-ended) bucket: count at the highest threshold
                    p = {
                        ourpat.format(all_thresholds[-1], ""):
                        d[theirpat.format(all_thresholds[-1])]
                    }
                    # Each [low, high) bucket is the difference of cumulative counts
                    for low, high in zip(all_thresholds[:-1], all_thresholds[1:]):
                        p[ourpat.format(
                            low, -high
                        )] = d[theirpat.format(low)] - d[theirpat.format(high)]

                    # Sanity check: buckets should sum to the total gene count
                    assert sum(p.values()) == d[theirpat.format(0)]
                except AssertionError:
                    log.warning(
                        "Predicted gene counts didn't add up properly for \"{}\"".
                        format(s_name))
                except KeyError:
                    log.warning(
                        "Not all predicted gene thresholds available for \"{}\"".
                        format(s_name))
                data[s_name] = p

        cats = [
            ourpat.format(low, -high if high else "")
            for low, high in zip(all_thresholds, all_thresholds[1:] + [None])
        ]

        if len(cats) > 0:
            return bargraph.plot(data, cats)
        else:
            return None
Example #33
0
    def add_kraken(self):
        """Add a bar plot summarising the kraken taxonomy per sample.

        Groups counts by (super)kingdom; anything not in the recognised
        list is accumulated under 'others'.
        """
        data = {}

        # First, we figure out all possible names
        kingdoms = set([
            x for k in self.sequana_data.keys()
            for x in self.sequana_data[k].keys()
        ])

        # Recognised (super)kingdom names; everything else goes to "others"
        colors = [
            'Archaea', 'Bacteria', 'Eukaryota', 'Viruses', 'Metazoa', 'Fungi',
            "Unclassified", "Classified"
        ]

        for sample_name in self.sequana_data.keys():
            # Ensure every kingdom key exists so the lookups below are safe
            for kingdom in sorted(kingdoms):
                if kingdom not in self.sequana_data[sample_name]:
                    self.sequana_data[sample_name][kingdom] = 0

            data[sample_name] = {"others": 0}
            for kingdom in sorted(kingdoms):

                if kingdom not in colors:
                    # here we add together non-superkingdom + other artifical
                    # sequences
                    data[sample_name]["others"] += \
                         self._set_nan_to_zero(self.sequana_data[sample_name][kingdom])
                else:
                    data[sample_name][kingdom.lower()] = \
                         self._set_nan_to_zero(self.sequana_data[sample_name][kingdom])
            data[sample_name]['unclassified'] = \
                self._set_nan_to_zero(self.sequana_data[sample_name]['Unclassified'])

        pconfig = {
            "title": "Taxonomy by kingdom",
            #"percentages": True,
            "cpswitch": False,
            "min": 0,
            "max": 100,
            "format": '{0:.2f}',
            "logswitch": False,
        }

        keys = OrderedDict()
        # superkingdom:
        # Bug fix: key was 'archea', which never matched the data key
        # 'archaea' produced by 'Archaea'.lower() above, so Archaea counts
        # were silently dropped from the plot.
        keys['archaea'] = {'color': 'orange', 'name': 'Archaea'}
        keys['bacteria'] = {'color': '#b1084c', 'name': 'Bacteria'}
        keys['eukaryota'] = {'color': 'green', 'name': 'Eukaryota'}
        keys['viruses'] = {'color': '#437bb1', 'name': 'Viruses'}
        # kingdom:
        keys['metazoa'] = {'color': 'green', 'name': 'Metazoa'}
        keys['fungi'] = {'color': 'purple', 'name': 'Fungi'}
        # others
        keys['unclassified'] = {'color': 'grey', 'name': 'Unclassified'}
        keys['others'] = {'color': 'blue', 'name': 'Others'}
        # subkingdom
        #keys['viridiplantae'] = {'color': 'yellow', 'name': 'Viridiplantae'}
        #keys['dikarya'] = {'color': 'brown', 'name': 'dikarya'}

        self.add_section(
            name='Taxonomy by kingdom',
            anchor='taxonomy',
            description=
            'The following barplots summarizes the kraken analysis for each sample. ',
            helptext="",
            plot=bargraph.plot(data, keys, pconfig))
def parse_reports(self):
    """Find Sentieon AlignmentSummaryMetrics reports and parse their data.

    Populates ``self.sentieon_alignment_metrics`` (sample name -> metrics
    dict), writes the parsed data to a MultiQC data file, adds a
    "% Aligned" column to the general stats table and draws a bar plot of
    aligned vs. unaligned read counts.

    Returns:
        int: the number of samples with parsed metrics (reported to the
        parent module so it can decide whether anything was found).
    """

    # Set up vars
    self.sentieon_alignment_metrics = dict()

    # Go through logs and find Metrics
    for f in self.find_log_files("sentieon/alignment_metrics",
                                 filehandles=True):
        parsed_data = dict()
        s_name = None
        keys = None
        for l in f["f"]:
            # New log starting
            if s_name is None and "AlignmentStat" in l:
                keys = None
                # Pull sample name from filename
                s_name = os.path.basename(f["s_name"])
                s_name = self.clean_s_name(s_name, f["root"])
                parsed_data[s_name] = dict()

            if s_name is not None:
                if "AlignmentStat" in l and "#SentieonCommandLine" in l:
                    # The tab-separated header row follows the command line
                    keys = f["f"].readline().strip("\n").split("\t")
                elif keys:
                    vals = l.strip("\n").split("\t")
                    if len(vals) == len(keys):
                        # Ignore the FIRST_OF_PAIR / SECOND_OF_PAIR data
                        # to simplify things
                        if vals[0] == "PAIR" or vals[0] == "UNPAIRED":
                            for i, k in enumerate(keys):
                                try:
                                    parsed_data[s_name][k] = float(vals[i])
                                except ValueError:
                                    parsed_data[s_name][k] = vals[i]
                    else:
                        # Row length mismatch: table has ended
                        s_name = None
                        keys = None

        # Remove empty dictionaries
        for s_name in list(parsed_data.keys()):
            if len(parsed_data[s_name]) == 0:
                parsed_data.pop(s_name, None)

        # Manipulate sample names if multiple baits found
        for s_name in parsed_data.keys():
            if s_name in self.sentieon_alignment_metrics:
                log.debug("Duplicate sample name found in {}!\
                          Overwriting: {}".format(f["fn"], s_name))
            self.add_data_source(f, s_name, section="AlignmentSummaryMetrics")
            self.sentieon_alignment_metrics[s_name] = parsed_data[s_name]

    # Filter to strip out ignored sample names
    self.sentieon_alignment_metrics = self.ignore_samples(
        self.sentieon_alignment_metrics)

    if len(self.sentieon_alignment_metrics) > 0:

        # Write parsed data to a file
        self.write_data_file(self.sentieon_alignment_metrics,
                             "multiqc_sentieon_AlignmentSummaryMetrics")

        # Add to general stats table
        self.general_stats_headers["PCT_PF_READS_ALIGNED"] = {
            "title": "% Aligned",
            "description": "Percent of aligned reads",
            "max": 100,
            "min": 0,
            "suffix": "%",
            "format": "{:,.0f}",
            "scale": "RdYlGn",
            "modify": lambda x: self.multiply_hundred(x),
        }
        for s_name in self.sentieon_alignment_metrics:
            if s_name not in self.general_stats_data:
                self.general_stats_data[s_name] = dict()
            self.general_stats_data[s_name].update(
                self.sentieon_alignment_metrics[s_name])

        # Make the bar plot of alignment read count
        pdata = dict()
        for s_name in self.sentieon_alignment_metrics.keys():
            pdata[s_name] = dict()
            metrics = self.sentieon_alignment_metrics[s_name]
            # Sentieon reports both reads for PE data.
            # Divide it by two as most people will expect # clusters
            if metrics["CATEGORY"] == "PAIR":
                pdata[s_name]["total_reads"] = metrics["TOTAL_READS"] / 2
                pdata[s_name]["aligned_reads"] = \
                    metrics["PF_READS_ALIGNED"] / 2
            else:
                pdata[s_name]["total_reads"] = metrics["TOTAL_READS"]
                pdata[s_name]["aligned_reads"] = metrics["PF_READS_ALIGNED"]
            # Bug fix: previously this was only computed in the `else`
            # branch, so paired-end samples never got an unaligned count
            # and the "Unaligned Reads" bar was missing for them.
            pdata[s_name]["unaligned_reads"] = (
                pdata[s_name]["total_reads"] - pdata[s_name]["aligned_reads"])

        keys = OrderedDict()
        keys["aligned_reads"] = {"name": "Aligned Reads"}
        keys["unaligned_reads"] = {"name": "Unaligned Reads"}

        # Config for the plot
        pconfig = {
            "id": "sentieon_aligned_reads",
            "title": "Sentieon: Aligned Reads",
            "ylab": "# Reads",
            "cpswitch_counts_label": "Number of Reads",
        }

        self.add_section(
            name="Alignment Summary",
            anchor="sentieon-alignmentsummary",
            description="Please note that Sentieon's read counts are divided \
                 by two for paired-end data.",
            plot=bargraph.plot(pdata, keys, pconfig),
        )

    # Return the number of detected samples to the parent module
    return len(self.sentieon_alignment_metrics)
Example #35
0
    def __init__(self):
        """Set up the bcl2fastq module: find logs, aggregate counts, build report sections.

        Parses every bcl2fastq Stats JSON found, aggregates cluster counts
        by lane, by sample, and by sample+lane, then emits a lane statistics
        table and three bar-graph sections. Raises UserWarning when no input
        files are found (MultiQC's signal to skip the module).
        """
        # Initialise the parent object
        super(MultiqcModule, self).__init__(
            name='bcl2fastq',
            anchor='bcl2fastq',
            href="https://support.illumina.com/sequencing/sequencing_software/bcl2fastq-conversion-software.html",
            info="can be used to both demultiplex data and convert BCL files"
                 " to FASTQ file formats for downstream analysis."
        )

        # Gather data from all json files
        self.bcl2fastq_data = dict()
        for myfile in self.find_log_files('bcl2fastq'):
            self.parse_file_as_json(myfile)

        # Collect counts by lane and sample (+source_files)
        # split_data_by_lane_and_sample() fills the three dicts below from
        # self.bcl2fastq_data, so it must run after the parse loop above.
        self.bcl2fastq_bylane = dict()
        self.bcl2fastq_bysample = dict()
        self.bcl2fastq_bysample_lane = dict()
        self.source_files = dict()
        self.split_data_by_lane_and_sample()

        # Filter to strip out ignored sample names
        self.bcl2fastq_bylane = self.ignore_samples(self.bcl2fastq_bylane)
        self.bcl2fastq_bysample = self.ignore_samples(self.bcl2fastq_bysample)
        self.bcl2fastq_bysample_lane = self.ignore_samples(self.bcl2fastq_bysample_lane)

        # Return with Warning if no files are found
        if len(self.bcl2fastq_bylane) == 0 and len(self.bcl2fastq_bysample) == 0:
            raise UserWarning

        # Print source files
        for s in self.source_files.keys():
            self.add_data_source(
                s_name=s,
                source=",".join(list(set(self.source_files[s]))),
                module='bcl2fastq',
                section='bcl2fastq-bysample'
            )

        # Add sample counts to general stats table
        self.add_general_stats()
        # Lane keys are stringified because write_data_file expects
        # string-keyed dicts.
        self.write_data_file(
            {str(k): self.bcl2fastq_bylane[k] for k in self.bcl2fastq_bylane.keys()},
            'multiqc_bcl2fastq_bylane'
        )
        self.write_data_file(self.bcl2fastq_bysample, 'multiqc_bcl2fastq_bysample')

        # Add section for summary stats per flow cell
        self.add_section (
            name = 'Lane Statistics',
            anchor = 'bcl2fastq-lanestats',
            description = 'Statistics about each lane for each flowcell',
            plot = self.lane_stats_table()
        )

        # Add section for counts by lane
        # `cats` is reused below for the first dataset of the by-sample plot.
        cats = OrderedDict()
        cats["perfect"] = {'name': 'Perfect Index Reads'}
        cats["imperfect"] = {'name': 'Mismatched Index Reads'}
        cats["undetermined"] = {'name': 'Undetermined Reads'}
        self.add_section (
            name = 'Clusters by lane',
            anchor = 'bcl2fastq-bylane',
            description = 'Number of reads per lane (with number of perfect index reads).',
            helptext = """Perfect index reads are those that do not have a single mismatch.
                All samples of a lane are combined. Undetermined reads are treated as a third category.""",
            plot = bargraph.plot(
                self.get_bar_data_from_counts(self.bcl2fastq_bylane),
                cats,
                {
                    'id': 'bcl2fastq_lane_counts',
                    'title': 'bcl2fastq: Clusters by lane',
                    'ylab': 'Number of clusters',
                    'hide_zero_cats': False
                }
            )
        )

        # Add section for counts by sample
        # get cats for per-lane tab
        lcats = set()
        for s_name in self.bcl2fastq_bysample_lane:
            lcats.update(self.bcl2fastq_bysample_lane[s_name].keys())
        lcats = sorted(list(lcats))
        # NOTE(review): the helptext below says undetermined reads are both
        # "ignored" and "treated as a separate sample" — it is a runtime
        # string so left untouched here, but the wording should be confirmed
        # and fixed upstream.
        self.add_section (
            name = 'Clusters by sample',
            anchor = 'bcl2fastq-bysample',
            description = 'Number of reads per sample.',
            helptext = """Perfect index reads are those that do not have a single mismatch.
                All samples are aggregated across lanes combinned. Undetermined reads are ignored.
                Undetermined reads are treated as a separate sample.""",
            plot = bargraph.plot(
                [
                    self.get_bar_data_from_counts(self.bcl2fastq_bysample),
                    self.bcl2fastq_bysample_lane
                ],
                [cats, lcats],
                {
                    'id': 'bcl2fastq_sample_counts',
                    'title': 'bcl2fastq: Clusters by sample',
                    'hide_zero_cats': False,
                    'ylab': 'Number of clusters',
                    'data_labels': ['Index mismatches', 'Counts per lane']
                }
            )
        )

        # Add section with undetermined barcodes
        # No category dict is passed (None): categories are taken from the
        # data itself, since barcode sequences vary per run.
        self.add_section(
            name = "Undetermined barcodes by lane",
            anchor = "undetermine_by_lane",
            description = "Count of the top twenty most abundant undetermined barcodes by lanes",
            plot = bargraph.plot(
                self.get_bar_data_from_undetermined(self.bcl2fastq_bylane),
                None,
                {
                    'id': 'bcl2fastq_undetermined',
                    'title': 'bcl2fastq: Undetermined barcodes by lane',
                    'ylab': 'Count',
                    'tt_percentages': False,
                    'use_legend': True,
                    'tt_suffix': 'reads'
                }
            )
        )
Example #36
0
    def bargraph(self, json, bps):
        """Build the QWindowTrim trimmed-basepairs bar graph.

        Args:
            json: dict mapping sample name -> stats dict containing the
                Qt_Left_Trimmed_* / Qt_Right_Trimmed_* counts for R1, R2
                and SE reads.
            bps: total number of trimmed basepairs across all samples.

        Returns:
            A bargraph plot object with three datasets (Read 1, Read 2,
            Single End), or an informational HTML string when there are
            too many samples to plot or nothing was trimmed.
        """

        # config dict for bar graph
        config = {
            "title":
            "HTStream: QWindowTrim Trimmed Basepairs Bargraph",
            'id':
            "htstream_qwindowtrimmer_bargraph",
            'ylab':
            "Samples",
            'cpswitch_c_active':
            False,
            'data_labels': [{
                'name': "Read 1"
            }, {
                'name': "Read 2"
            }, {
                'name': "Single End"
            }]
        }

        if len(json.keys()) > 150:
            html = '<div class="alert alert-info"> Too many samples for bargraph. </div>'
            return html

        # returns nothing if no reads were trimmed.
        # Checked before building the per-sample dicts so we don't do the
        # work only to throw it away.
        if bps == 0:
            html = '<div class="alert alert-info"> No basepairs were trimmed from any sample. </div>'
            return html

        r1_data = {}
        r2_data = {}
        se_data = {}

        for key in json:

            r1_data[key] = {
                "LT_R1": json[key]["Qt_Left_Trimmed_R1"],
                "RT_R1": json[key]["Qt_Right_Trimmed_R1"]
            }

            r2_data[key] = {
                "LT_R2": json[key]["Qt_Left_Trimmed_R2"],
                "RT_R2": json[key]["Qt_Right_Trimmed_R2"]
            }

            se_data[key] = {
                "LT_SE": json[key]["Qt_Left_Trimmed_SE"],
                "RT_SE": json[key]["Qt_Right_Trimmed_SE"]
            }

        # Category labels ("Trimmmed" typo fixed to "Trimmed")
        cats = [OrderedDict(), OrderedDict(), OrderedDict()]
        cats[0]["LT_R1"] = {'name': 'Left Trimmed'}
        cats[0]["RT_R1"] = {'name': 'Right Trimmed'}
        cats[1]["LT_R2"] = {'name': 'Left Trimmed'}
        cats[1]["RT_R2"] = {'name': 'Right Trimmed'}
        cats[2]["LT_SE"] = {'name': 'Left Trimmed'}
        cats[2]["RT_SE"] = {'name': 'Right Trimmed'}

        return bargraph.plot([r1_data, r2_data, se_data], cats, config)
Example #37
0
    def __init__(self):
        """Set up the Supernova module: table headers, log parsing, report sections.

        Parses report.txt, summary.json, histogram_molecules.json and
        histogram_kmer_count.json files produced by Supernova, merges them
        (summary.json values supersede report.txt), and emits an assembly
        statistics table, N50 bar charts, and optional molecule-length and
        k-mer-count line graphs. Raises UserWarning when no reports are found.
        """
        super(MultiqcModule, self).__init__(
            name='Supernova',
            anchor='supernova',
            href="https://www.10xgenomics.com/",
            info="is a de novo genome assembler 10X Genomics linked-reads.")

        # Headers for the supernova Table
        self.headers = OrderedDict()
        self.headers['Asm size'] = {
            'description':
            'assembly size (in megabases) ;only scaffolds >= 10 kb',
            'modify': lambda x: x / 1000000.0,
            'suffix': 'Mb',
            'scale': 'YlGn'
        }
        self.headers['# Long scaffs'] = {
            'description': 'number of scaffolds >= 10 kb',
            'scale': 'YlGn',
            'format': '{:,.0f}',
        }
        self.headers['Scaff N50'] = {
            'description': 'N50 scaffold size (in kilobases)',
            'modify': lambda x: x / 1000.0,
            'suffix': 'Kb',
            'scale': 'RdYlGn'
        }
        self.headers['Phase N50'] = {
            'description': 'N50 phase block size (in kilobases)',
            'modify': lambda x: x / 1000.0,
            'suffix': 'Kb',
            'scale': 'RdYlGn',
            'hidden': True
        }
        self.headers['Contig N50'] = {
            'description': 'N50 contig size (in kilobases)',
            'modify': lambda x: x / 1000.0,
            'suffix': 'Kb',
            'scale': 'RdYlGn',
            'hidden': True
        }
        self.headers['Edge N50'] = {
            'description': 'N50 edge size (in kilobases)',
            'modify': lambda x: x / 1000.0,
            'suffix': 'Kb',
            'scale': 'RdYlGn',
            'hidden': True
        }
        self.headers['Mol size'] = {
            'description':
            'weighted mean molecule size (in kilobases); ideal 50-100',
            'modify': lambda x: x / 1000.0,
            'suffix': 'Kb',
            'scale': 'BuGn'
        }
        self.headers['Read len'] = {
            'description':
            'mean read length (in bases) after trimming; ideal 140',
            'suffix': 'b',
            'scale': 'PuBu',
            'format': '{:,.0f}',
            'hidden': True
        }
        self.headers['# Reads'] = {
            'description':
            'number of reads (in millions); ideal 800M-1200M for human',
            'modify': lambda x: x / 1000000.0,
            'suffix': 'M',
            'scale': 'PuBu',
        }
        self.headers['Coverage'] = {
            'description':
            'effective read coverage; ideal ~42 for nominal 56x cov',
            'suffix': 'x',
            'scale': 'PuBu'
        }
        self.headers['% Dup'] = {
            'description': 'fraction of reads that are duplicates',
            'suffix': '%',
            'scale': 'OrRd',
        }
        self.headers['% R2 Q30'] = {
            'description': 'fraction of Q30 bases in read 2; ideal 75-85%',
            'suffix': '%',
            'scale': 'OrRd',
        }
        self.headers['Insert size'] = {
            'description': 'median insert size (in bases); ideal 0.35-0.40 Kb',
            'suffix': 'b',
            'scale': 'OrRd',
            'format': '{:,.0f}',
            'hidden': True
        }
        self.headers['% proper'] = {
            'description': 'fraction of proper read pairs; ideal >= 75%',
            'suffix': '%',
            'scale': 'OrRd',
            'hidden': True
        }
        self.headers['Het dist'] = {
            'description':
            'mean distance between heterozygous SNPs (in kilobases)',
            'modify': lambda x: x / 1000.0,
            'suffix': 'Kb',
            'scale': 'BuGn',
        }
        self.headers['% missing BC'] = {
            'description': 'fraction of reads that are not barcoded',
            'suffix': '%',
            'scale': 'BuGn',
        }
        self.headers['Barcode N50'] = {
            'description': 'N50 reads per barcode (in bases)',
            'suffix': 'b',
            'scale': 'BuGn',
            'format': '{:,.0f}',
        }
        self.headers['% Phased'] = {
            'description': 'nonduplicate and phased reads; ideal 45-50%',
            'suffix': '%',
            'scale': 'BuGn',
            'hidden': True
        }

        # Per-sample parsed results, keyed by cleaned sample name
        reports = OrderedDict()
        summaries = OrderedDict()
        molecules = OrderedDict()
        kmers = OrderedDict()
        # Maps log directory -> sample ID, so the histogram JSON files
        # (which carry no sample ID) can be attributed to a sample below
        root_summary = {}

        ### Parse the input log files
        # report.txt files
        for f in self.find_log_files('supernova/report'):
            log.debug("Found report in: {}".format(f['root']))
            sid, data = self.parse_report(f['f'])
            s_name = self.clean_s_name(sid, f['root'])
            if s_name in reports.keys():
                log.debug(
                    "Duplicate sample name found! Overwriting: {}".format(
                        s_name))
            reports[s_name] = data
            self.add_data_source(f, s_name=s_name, section='supernova-table')

        # summary.json files
        for f in self.find_log_files('supernova/summary'):
            log.debug("Found summary.json in: {}".format(f['root']))
            try:
                sid, data = self.parse_summary(f['f'])
            except ValueError:
                log.debug("Error parsing JSON file in {}".format(f['root']))
                continue
            except RuntimeError:
                log.debug("Could not find sample_id in JSON file in {}".format(
                    f['root']))
                continue

            s_name = self.clean_s_name(sid, f['root'])
            if s_name in summaries.keys():
                log.debug(
                    "Duplicate sample name found! Overwriting: {}".format(
                        s_name))
            summaries[s_name] = data
            self.add_data_source(f, s_name=s_name, section='supernova-table')
            # The plot json files do not contain sample IDs, sadly. So we need to store it somewhere.
            root_summary[f['root']] = sid

        # histogram_molecules.json files
        # Only parsed when the directory was seen in a summary.json pass,
        # since that is the only way to recover the sample ID
        for f in self.find_log_files('supernova/molecules'):
            log.debug("Found histogram_molecules.json in: {}".format(
                f['root']))
            try:
                if f['root'] in root_summary.keys():
                    data = self.parse_histogram(f['f'])
                    sid = root_summary[f['root']]
                    s_name = self.clean_s_name(sid, f['root'])
                    molecules[s_name] = data
                    self.add_data_source(f,
                                         s_name=s_name,
                                         section='supernova-molecules')
            except RuntimeError:
                log.debug("Could not parse JSON file in {}".format(f['root']))
                continue

        # histogram_kmer_count.json files
        for f in self.find_log_files('supernova/kmers'):
            log.debug("Found histogram_kmer_count.json in: {}".format(
                f['root']))
            try:
                if f['root'] in root_summary.keys():
                    data = self.parse_histogram(f['f'], 400)
                    sid = root_summary[f['root']]
                    s_name = self.clean_s_name(sid, f['root'])
                    kmers[s_name] = data
                    self.add_data_source(f,
                                         s_name=s_name,
                                         section='supernova-kmers')
            except RuntimeError:
                log.debug("Could not parse JSON file in {}".format(f['root']))
                continue

        # Data from summary.json supersedes data from report.txt
        for sample_id, sum_data in summaries.items():
            if sample_id in reports.keys():
                log.debug(
                    "Found summary data for sample {} which supersedes report data"
                    .format(sample_id))
                reports[sample_id] = sum_data
        # Ignore cmd-line specified samples
        reports = self.ignore_samples(reports)
        molecules = self.ignore_samples(molecules)
        kmers = self.ignore_samples(kmers)

        if len(reports) == 0:
            log.debug("Could not find any reports in {}".format(
                config.analysis_dir))
            raise UserWarning
        else:
            log.info("Found {} reports".format(len(reports.keys())))

        ### Write the report
        self.write_data_file(reports, 'multiqc_supernova')

        config_table = {'id': 'supernova_table', 'namespace': 'supernova'}
        self.add_section (
            name = 'Assembly statistics',
            anchor = 'supernova-table',
            description = 'Statistics gathered from the summary report(s) of Supernova. Note! ' \
                    'There are more columns available but they are hidden by default.',
            helptext = 'As a bare minimum these numbers are generated from the file report.txt, ' \
                    'found in the folder `sampleID/outs/`. If available the stats in the report ' \
                    'file will be superseded by the higher precision numbers found in the file ' \
                    '`sampleID/outs/assembly/stats/summary.json`',
            plot = table.plot(reports, self.headers, config_table)
        )

        # N50 barcharts
        # Four datasets share the same `reports` data; each cats dict
        # selects a different N50 column per tab
        n50_cats = [{
            'Scaff N50': {
                'name': 'Scaffold N50',
                'color': '#66c2a5'
            }
        }, {
            'Contig N50': {
                'name': 'Contig N50',
                'color': '#fc8d62'
            }
        }, {
            'Edge N50': {
                'name': 'Edge N50',
                'color': '#8da0cb'
            }
        }, {
            'Phase N50': {
                'name': 'Phase block N50',
                'color': '#e78ac3'
            }
        }]
        config_n50 = {
            'id':
            'supernova_n50',
            'title':
            'Supernova N50 statistics',
            'cpswitch':
            False,
            'data_labels':
            ['Scaffold N50', 'Contig N50', 'Edge N50', 'Phase block N50']
        }
        self.add_section (
            name = 'N50 statistics',
            anchor = 'supernova-n50',
            description = 'Assembly N50 values - the shortest sequence length at 50% of the genome when sorted by size (see [wikipedia](https://en.wikipedia.org/wiki/N50,_L50,_and_related_statistics#N50)).',
            helptext = "Note that assembly size and N50 values are computed after removing scaffolds &le; 10 kb and do not count `N`s: \n\n" \
                    "* **Scaffold N50** - N50 size of scaffolds in bases, \n" \
                    "* **Contig N50** - N50 size of contigs in bases, \n" \
                    "* **Edge N50** - N50 size of raw graph assembly edges in bases, \n" \
                    "* **Phase block N50** - N50 size of phase blocks in bases. \n\n" \
                    '[(source)](https://support.10xgenomics.com/de-novo-assembly/software/pipelines/latest/output/asm-stats)',
            plot = bargraph.plot([reports,reports,reports,reports], n50_cats, config_n50)
        )

        # Conditional sections
        if len(molecules) > 0:
            # Remove the long tail
            max_x = self.trim_tail(molecules, 100000)
            # Add molecules plot
            config_molecules = {
                'id': 'supernova_molecules',
                'title': 'Supernova Molecule Lengths',
                'xlab': 'Inferred molecule length (bp)',
                'ylab': '# molecules',
                'smooth_points': 300,
                'smooth_points_sumcounts': True,
                'xmax': max_x
            }
            self.add_section (
                name = 'Molecule Lengths',
                anchor = 'supernova-molecules',
                description = 'Shows the inferred molecule lengths of the input 10X library.',
                helptext = 'Inferred in the `patch` step of the Supernova pipeline. It is worth ' \
                        'keeping in mind that the mean molecule length from the report is a length-weighted mean. ' \
                        'See the [source code](https://github.com/10XGenomics/supernova/search?q=lw_mean_mol_len&type=) ' \
                        'for how this value is calculated.',
                plot = linegraph.plot(molecules, config_molecules)
            )
        if len(kmers) > 0:
            # Remove the long tail
            max_x = self.trim_tail(kmers, 50)

            # Add kmers plot
            config_kmers = {
                'id': 'supernova_kmers',
                'title': 'Supernova Kmer Counts',
                'xlab': 'Filtered kmer multiplicity',
                'ylab': 'Counts',
                'smooth_points_sumcounts': False,
                'xmax': max_x
            }
            self.add_section (
                name = 'K-mer counts',
                anchor = 'supernova-kmers',
                description = 'Shows the k-mer frequencies of the input data to Supernova (after filtering).',
                helptext = 'This data is generated from k-merizing the input read data, where the sequences are ' \
                        'transformed in to the set of all possible sub-sequences of a fixed length of `K` (Supernova uses `K=48`). ' \
                        'The plot shows on the x-axis the multiplicity (i.e. how many times are they repeated) of these k-mers ' \
                        'and the y-axis the number of k-mers at this level of multiplicity. ' \
                        'A careful reading of this plot can give some insights into the levels of heterozygosity and repeats ' \
                        'in the genome that was sequenced and indications if the sequencing experiment was successful.',
                plot = linegraph.plot(kmers, config_kmers)
            )
Example #38
0
    def bustools_section(self):
        """Add bargraphs showing the mean UMIs per barcode and percentages in whitelist.

        Emits three report sections from ``self.bustools_data``: the full
        summary table, a bar plot of mean UMIs per barcode, and a bar plot
        of the percentage of reads/barcodes found in the whitelist.
        """
        # add the summary table
        tconfig = {
            "namespace": "Bustools",
            "id": "bustools_summary",
            "table_title": "Bustools Summary Table"
        }
        self.add_section(
            name="Summary table",
            anchor="bustools-inspect",
            description=
            "This is a table of the complete output of bustools inspect. Note that some columns are hidden by default (click <em>Configure Columns</em> to show).",
            plot=table.plot(self.bustools_data, self.headers, tconfig),
        )

        # also make some nice barplots
        # barplot for mean umis per sample
        mean_umis = {
            sample: {
                "UMIs per barcode": values["meanUMIsPerBarcode"]
            }
            for sample, values in self.bustools_data.items()
        }

        self.add_section(
            name="Mean number of UMIs per barcode",
            anchor="bustools-umis",
            description=
            "Average number of UMIs (unique molecular identifiers) per barcode",
            helptext=
            "Each unique barcode represents a cell and each Unique Molecular Identifier (UMI) represents "
            "a unique transcript molecule. By counting the mean number of UMIs per barcode, you "
            "effectively calculate the average number of unique transcripts per cell.",
            plot=bargraph.plot(
                mean_umis,
                pconfig={
                    "id": "bus_umis",
                    "title":
                    "Bustools: Mean number of UMIs per barcode per sample",
                    "cpswitch": False,
                    "tt_percentages": False,
                    "ylab": "Mean UMIs per barcode",
                },
            ),
        )

        # barplot for the percentage of reads and barcodes on the whitelist
        percentage_whitelist = {
            sample: {
                "Reads on whitelist": values["percentageReadsOnWhitelist"],
                "Barcodes on whitelist":
                values["percentageBarcodesOnWhitelist"],
            }
            for sample, values in self.bustools_data.items()
        }
        self.add_section(
            name="Percentage in whitelist",
            anchor="bustools-reads",
            description=
            "The whitelist is a list of unique barcodes used in your protocol, either provided or inferred from the data.",
            helptext=
            "Each unique barcode from the whitelist represents a cell. The percentage of "
            "reads with barcode / barcodes in the whitelist is a measure of percentage of reads that could "
            "be asigned to a cell.",
            plot=bargraph.plot(
                percentage_whitelist,
                pconfig={
                    "id": "bus_reads",
                    "title":
                    "Bustools: Barcodes / reads with barcodes in the whitelist",
                    "ymax": 100,
                    # Bug fix: this key was misspelled "ymix", so the
                    # intended axis floor of 0 was silently ignored.
                    "ymin": 0,
                    "cpswitch": False,
                    "tt_percentages": False,
                    "ylab":
                    "Percentage of barcodes / reads with barcodes in the whitelist",
                    "stacking": None,
                    "ylab_format": "{value}%",
                },
            ),
        )
Example #39
0
    def hisat2_alignment_plot(self):
        """Make the HighCharts HTML to plot the alignment rates"""

        # Sort samples into single-end and paired-end groups.
        # For PE samples, single-mate counts are halved in place so that
        # they tally with the pair counts.
        sedata = {}
        pedata = {}
        halved_keys = (
            "unpaired_total",
            "unpaired_aligned_none",
            "unpaired_aligned_one",
            "unpaired_aligned_multi",
        )
        for s_name, data in self.hisat2_data.items():
            if "paired_total" not in data:
                sedata[s_name] = data
                continue
            for key in halved_keys:
                if key in data:
                    data[key] = float(data[key]) / 2.0
            pedata[s_name] = data

        # Two plots, don't mix SE with PE
        if sedata:
            sekeys = OrderedDict()
            for key, color, label in (
                ("unpaired_aligned_one", "#20568f", "SE mapped uniquely"),
                ("unpaired_aligned_multi", "#f7a35c", "SE multimapped"),
                ("unpaired_aligned_none", "#981919", "SE not aligned"),
            ):
                sekeys[key] = {"color": color, "name": label}
            se_config = {
                "id": "hisat2_se_plot",
                "title": "HISAT2: SE Alignment Scores",
                "ylab": "# Reads",
                "cpswitch_counts_label": "Number of Reads",
            }
            self.add_section(plot=bargraph.plot(sedata, sekeys, se_config))

        if pedata:
            pekeys = OrderedDict()
            for key, color, label in (
                ("paired_aligned_one", "#20568f", "PE mapped uniquely"),
                ("paired_aligned_discord_one", "#5c94ca",
                 "PE mapped discordantly uniquely"),
                ("unpaired_aligned_one", "#95ceff",
                 "PE one mate mapped uniquely"),
                ("paired_aligned_multi", "#f7a35c", "PE multimapped"),
                ("unpaired_aligned_multi", "#ffeb75",
                 "PE one mate multimapped"),
                ("unpaired_aligned_none", "#981919",
                 "PE neither mate aligned"),
            ):
                pekeys[key] = {"color": color, "name": label}
            pe_config = {
                "id": "hisat2_pe_plot",
                "title": "HISAT2: PE Alignment Scores",
                "ylab": "# Reads",
                "cpswitch_counts_label": "Number of Reads",
            }
            self.add_section(
                description=
                "<em>Please note that single mate alignment counts are halved to tally with pair counts properly.</em>",
                plot=bargraph.plot(pedata, pekeys, pe_config),
            )
Example #40
0
def parse_reports(self):
    """Find Picard MarkDuplicates reports and parse their data.

    Fills ``self.picard_dupMetrics_data`` (cleaned sample name -> metrics
    dict), adds a '% Dups' column to the general stats table and creates
    the 'Mark Duplicates' bar graph section.

    Returns:
        int: number of samples with parsed MarkDuplicates data.
    """

    # Set up vars
    self.picard_dupMetrics_data = dict()

    # Go through logs and find Metrics
    for f in self.find_log_files('picard/markdups', filehandles=True):
        s_name = None
        for l in f['f']:
            # New log starting
            if 'markduplicates' in l.lower() and 'input' in l.lower():
                s_name = None

                # Pull sample name from the INPUT= argument of the command line
                fn_search = re.search(r"INPUT=(\[?[^\s]+\]?)",
                                      l,
                                      flags=re.IGNORECASE)
                if fn_search:
                    s_name = os.path.basename(fn_search.group(1).strip('[]'))
                    s_name = self.clean_s_name(s_name, f['root'])
                # When run with GATK this has a different format (--input)
                else:
                    fn_search = re.search(r"--input (\[?[^\s]+\]?)",
                                          l,
                                          flags=re.IGNORECASE)
                    if fn_search:
                        s_name = os.path.basename(
                            fn_search.group(1).strip('[]'))
                        s_name = self.clean_s_name(s_name, f['root'])

            if s_name is not None:
                if 'DuplicationMetrics' in l and '## METRICS CLASS' in l:
                    if s_name in self.picard_dupMetrics_data:
                        log.debug(
                            "Duplicate sample name found in {}! Overwriting: {}"
                            .format(f['fn'], s_name))
                    self.add_data_source(f,
                                         s_name,
                                         section='DuplicationMetrics')
                    self.picard_dupMetrics_data[s_name] = dict()
                    # The two lines after the METRICS CLASS header hold the
                    # tab-separated column names and their values
                    keys = f['f'].readline().rstrip("\n").split("\t")
                    vals = f['f'].readline().rstrip("\n").split("\t")
                    for i, k in enumerate(keys):
                        # Keep non-numeric fields (eg. LIBRARY) as strings
                        try:
                            self.picard_dupMetrics_data[s_name][k] = float(
                                vals[i])
                        except ValueError:
                            self.picard_dupMetrics_data[s_name][k] = vals[i]
                    # Check that this sample had some reads
                    if self.picard_dupMetrics_data[s_name].get('READ_PAIRS_EXAMINED', 0) == 0 and \
                       self.picard_dupMetrics_data[s_name].get('UNPAIRED_READS_EXAMINED', 0) == 0:
                        self.picard_dupMetrics_data.pop(s_name, None)
                        # Bugfix: log.warn() is deprecated, use log.warning()
                        log.warning(
                            "Skipping MarkDuplicates sample '{}' as log contained no reads"
                            .format(s_name))
                    s_name = None

    # Remove samples where nothing was parsed.
    # Hoisted out of the per-file loop above: it only needs to run once
    # after all files have been read (behaviour is unchanged).
    for s_name in list(self.picard_dupMetrics_data.keys()):
        if len(self.picard_dupMetrics_data[s_name]) == 0:
            self.picard_dupMetrics_data.pop(s_name, None)
            log.debug("Removing {} as no data parsed".format(s_name))

    # Filter to strip out ignored sample names
    self.picard_dupMetrics_data = self.ignore_samples(
        self.picard_dupMetrics_data)

    if len(self.picard_dupMetrics_data) > 0:

        # Write parsed data to a file
        self.write_data_file(self.picard_dupMetrics_data,
                             'multiqc_picard_dups')

        # Add to general stats table
        self.general_stats_headers['PERCENT_DUPLICATION'] = {
            'title': '% Dups',
            'description': 'MarkDuplicates - Percent Duplication',
            'max': 100,
            'min': 0,
            'suffix': '%',
            'scale': 'OrRd',
            'modify': lambda x: self.multiply_hundred(x)
        }
        for s_name in self.picard_dupMetrics_data:
            if s_name not in self.general_stats_data:
                self.general_stats_data[s_name] = dict()
            self.general_stats_data[s_name].update(
                self.picard_dupMetrics_data[s_name])

        # Derive unique / non-optical-duplicate counts for the bar plot
        # NOTE: I had a hard time getting these numbers to add up as expected.
        # If you think I've done something wrong, let me know! Please add an
        # issue here: https://github.com/ewels/MultiQC/issues
        for sn in self.picard_dupMetrics_data:
            d = self.picard_dupMetrics_data[sn]
            d['UNPAIRED_READ_UNIQUE'] = (d['UNPAIRED_READS_EXAMINED'] -
                                         d['UNPAIRED_READ_DUPLICATES'])
            d['READ_PAIR_NOT_OPTICAL_DUPLICATES'] = (
                d['READ_PAIR_DUPLICATES'] - d['READ_PAIR_OPTICAL_DUPLICATES'])
            d['READ_PAIR_UNIQUE'] = (d['READ_PAIRS_EXAMINED'] -
                                     d['READ_PAIR_DUPLICATES'])

        keys = OrderedDict()
        keys_r = [
            'READ_PAIR_UNIQUE', 'UNPAIRED_READ_UNIQUE',
            'READ_PAIR_NOT_OPTICAL_DUPLICATES', 'READ_PAIR_OPTICAL_DUPLICATES',
            'UNPAIRED_READ_DUPLICATES', 'UNMAPPED_READS'
        ]
        for k in keys_r:
            keys[k] = {'name': k.replace('_', ' ').title()}

        # Config for the plot
        pconfig = {
            'id': 'picard_deduplication',
            'title': 'Picard: Deduplication Stats',
            'ylab': '# Reads',
            'cpswitch_counts_label': 'Number of Reads',
            'cpswitch_c_active': False
        }

        self.add_section(name='Mark Duplicates',
                         anchor='picard-markduplicates',
                         plot=bargraph.plot(self.picard_dupMetrics_data, keys,
                                            pconfig))

    # Return the number of detected samples to the parent module
    return len(self.picard_dupMetrics_data)
Example #41
0
def parse_reports(self):
    """Find Sentieon AlignmentSummaryMetrics reports and parse their data.

    Fills ``self.sentieon_alignment_metrics`` (cleaned sample name ->
    metrics dict), adds a '% Aligned' column to the general stats table
    and creates the 'Alignment Summary' bar graph section.

    Returns:
        int: number of samples with parsed alignment metrics.
    """

    # Set up vars
    self.sentieon_alignment_metrics = dict()

    # Go through logs and find Metrics
    for f in self.find_log_files('sentieon/alignment_metrics',
                                 filehandles=True):
        parsed_data = dict()
        s_name = None
        keys = None
        for l in f['f']:
            # New log starting
            if s_name is None and 'AlignmentStat' in l:
                keys = None
                # Pull sample name from filename
                s_name = os.path.basename(f['s_name'])
                s_name = self.clean_s_name(s_name, f['root'])
                parsed_data[s_name] = dict()

            if s_name is not None:
                if 'AlignmentStat' in l and '#SentieonCommandLine' in l:
                    # Next line holds the tab-separated column headers
                    keys = f['f'].readline().strip("\n").split("\t")
                elif keys:
                    vals = l.strip("\n").split("\t")
                    if len(vals) == len(keys):
                        # Ignore the FIRST_OF_PAIR / SECOND_OF_PAIR data
                        # to simplify things
                        if vals[0] == 'PAIR' or vals[0] == 'UNPAIRED':
                            for i, k in enumerate(keys):
                                try:
                                    parsed_data[s_name][k] = float(vals[i])
                                except ValueError:
                                    parsed_data[s_name][k] = vals[i]
                    else:
                        # Row doesn't match the header - end of this table
                        s_name = None
                        keys = None

        # Remove empty dictionaries
        for s_name in list(parsed_data.keys()):
            if len(parsed_data[s_name]) == 0:
                parsed_data.pop(s_name, None)

        # Manipulate sample names if multiple baits found
        for s_name in parsed_data.keys():
            if s_name in self.sentieon_alignment_metrics:
                # Bugfix: the previous message used a backslash continuation
                # inside the string literal, embedding a long run of
                # indentation spaces in the logged text
                log.debug("Duplicate sample name found in {}! "
                          "Overwriting: {}".format(f['fn'], s_name))
            self.add_data_source(f, s_name, section='AlignmentSummaryMetrics')
            self.sentieon_alignment_metrics[s_name] = parsed_data[s_name]

    # Filter to strip out ignored sample names
    self.sentieon_alignment_metrics = self.ignore_samples(
        self.sentieon_alignment_metrics)

    if len(self.sentieon_alignment_metrics) > 0:

        # Write parsed data to a file
        self.write_data_file(self.sentieon_alignment_metrics,
                             'multiqc_sentieon_AlignmentSummaryMetrics')

        # Add to general stats table
        self.general_stats_headers['PCT_PF_READS_ALIGNED'] = {
            'title': '% Aligned',
            'description': 'Percent of aligned reads',
            'max': 100,
            'min': 0,
            'suffix': '%',
            'format': '{:,.0f}',
            'scale': 'RdYlGn',
            'modify': lambda x: self.multiply_hundred(x)
        }
        for s_name in self.sentieon_alignment_metrics:
            if s_name not in self.general_stats_data:
                self.general_stats_data[s_name] = dict()
            self.general_stats_data[s_name].update(
                self.sentieon_alignment_metrics[s_name])

        # Make the bar plot of alignment read count
        pdata = dict()
        for s_name in self.sentieon_alignment_metrics.keys():
            pdata[s_name] = dict()
            # Sentieon reports both reads for PE data.
            # Divide it by two as most people will expect # clusters
            if self.sentieon_alignment_metrics[s_name]['CATEGORY'] == 'PAIR':
                pdata[s_name]['total_reads'] = (
                    self.sentieon_alignment_metrics[s_name]['TOTAL_READS'] / 2)
                pdata[s_name]['aligned_reads'] = (
                    self.sentieon_alignment_metrics[s_name]['PF_READS_ALIGNED']
                    / 2)
            else:
                pdata[s_name]['total_reads'] = (
                    self.sentieon_alignment_metrics[s_name]['TOTAL_READS'])
                pdata[s_name]['aligned_reads'] = (
                    self.sentieon_alignment_metrics[s_name]['PF_READS_ALIGNED']
                )
            # Bugfix: compute unaligned reads for BOTH branches. Previously
            # this was only set in the unpaired branch, so the 'Unaligned
            # Reads' bar was silently missing for paired-end samples.
            pdata[s_name]['unaligned_reads'] = (
                pdata[s_name]['total_reads'] -
                pdata[s_name]['aligned_reads'])

        keys = OrderedDict()
        keys['aligned_reads'] = {'name': 'Aligned Reads'}
        keys['unaligned_reads'] = {'name': 'Unaligned Reads'}

        # Config for the plot
        pconfig = {
            'id': 'sentieon_aligned_reads',
            'title': 'Sentieon: Aligned Reads',
            'ylab': '# Reads',
            'cpswitch_counts_label': 'Number of Reads',
        }

        self.add_section(
            name='Alignment Summary',
            anchor='sentieon-alignmentsummary',
            description="Please note that Sentieon's read counts are divided \
                 by two for paired-end data.",
            plot=bargraph.plot(pdata, keys, pconfig))

    # Return the number of detected samples to the parent module
    return len(self.sentieon_alignment_metrics)
Example #42
0
    def __init__(self):
        """Supernova MultiqcModule constructor.

        Parses Supernova output files found by the framework
        (``report.txt``, ``summary.json``, ``histogram_molecules.json``,
        ``histogram_kmer_count.json``) and builds the assembly statistics
        table, the N50 bar charts and the molecule-length / k-mer count
        line plots. Raises ``UserWarning`` if no reports are found.
        """
        # NOTE(review): info text looks like it is missing a word -
        # presumably "assembler FOR 10X Genomics linked-reads"; confirm
        # before changing the user-facing string.
        super(MultiqcModule, self).__init__(
            name="Supernova",
            anchor="supernova",
            href="https://www.10xgenomics.com/",
            info="is a de novo genome assembler 10X Genomics linked-reads.",
        )

        # Headers for the supernova Table.
        # Each entry maps a metric name (as parsed from the reports) to its
        # table column config; many columns are hidden by default.
        self.headers = OrderedDict()
        self.headers["Asm size"] = {
            "description":
            "assembly size (in megabases) ;only scaffolds >= 10 kb",
            "modify": lambda x: x / 1000000.0,
            "suffix": "Mb",
            "scale": "YlGn",
        }
        self.headers["% missing 10Kb"] = {
            "rid": "pct_missing_10Kb",
            "description":
            "% of base assembly missing from scaffolds >= 10 kb",
            "suffix": "%",
            "scale": "YlGn",
        }
        self.headers["# Long scaffs"] = {
            "rid": "num_long_scaffs",
            "description": "number of scaffolds >= 10 kb",
            "scale": "YlGn",
            "format": "{:,.0f}",
            "hidden": True,
        }
        self.headers["Scaff N50"] = {
            "description": "N50 scaffold size (in kilobases)",
            "modify": lambda x: x / 1000.0,
            "suffix": "Kb",
            "scale": "RdYlGn",
        }
        self.headers["Phase N50"] = {
            "description": "N50 phase block size (in kilobases)",
            "modify": lambda x: x / 1000.0,
            "suffix": "Kb",
            "scale": "RdYlGn",
            "hidden": True,
        }
        self.headers["Contig N50"] = {
            "description": "N50 contig size (in kilobases)",
            "modify": lambda x: x / 1000.0,
            "suffix": "Kb",
            "scale": "RdYlGn",
            "hidden": True,
        }
        self.headers["Edge N50"] = {
            "description": "N50 edge size (in kilobases)",
            "modify": lambda x: x / 1000.0,
            "suffix": "Kb",
            "scale": "RdYlGn",
            "hidden": True,
        }
        # Input library / sequencing QC metrics
        self.headers["Mol size"] = {
            "description":
            "weighted mean molecule size (in kilobases); ideal 50-100",
            "modify": lambda x: x / 1000.0,
            "suffix": "Kb",
            "scale": "BuGn",
        }
        self.headers["Read len"] = {
            "description":
            "mean read length (in bases) after trimming; ideal 140",
            "suffix": "b",
            "scale": "PuBu",
            "format": "{:,.0f}",
            "hidden": True,
        }
        self.headers["# Reads"] = {
            "rid": "num_reads",
            "description":
            "number of reads (in millions); ideal 800M-1200M for human",
            "modify": lambda x: x / 1000000.0,
            "suffix": "M",
            "scale": "PuBu",
        }
        self.headers["Raw coverage"] = {
            "description": "raw coverage; ideal ~56",
            "suffix": "x",
            "scale": "PuBu",
            "hidden": True,
        }
        self.headers["Coverage"] = {
            "description":
            "effective read coverage; ideal ~42 for nominal 56x cov",
            "suffix": "x",
            "scale": "PuBu",
        }
        self.headers["% Dup"] = {
            "rid": "pct_Dup",
            "description": "fraction of reads that are duplicates",
            "suffix": "%",
            "scale": "OrRd",
        }
        self.headers["% R2 Q30"] = {
            "rid": "pct_R2_Q30",
            "description": "fraction of Q30 bases in read 2; ideal 75-85%",
            "suffix": "%",
            "scale": "OrRd",
        }
        self.headers["Insert size"] = {
            "description": "median insert size (in bases); ideal 0.35-0.40 Kb",
            "suffix": "b",
            "scale": "OrRd",
            "format": "{:,.0f}",
            "hidden": True,
        }
        self.headers["% proper"] = {
            "rid": "pct_proper",
            "description": "fraction of proper read pairs; ideal >= 75%",
            "suffix": "%",
            "scale": "OrRd",
            "hidden": True,
        }
        self.headers["BC usage"] = {
            "description": "fraction of barcodes used; between 0 and 1",
            "scale": "OrRd",
            "hidden": True,
        }
        # Genome property estimates
        self.headers["Est size"] = {
            "description": "estimated genome size",
            "modify": lambda x: x / 1000000.0,
            "suffix": "Mb",
            "scale": "YlGn",
            "hidden": True,
        }
        self.headers["% repeats"] = {
            "rid": "pct_repeats",
            "description": "Estimated repetitive fraction (of genome)",
            "scale": "YlGn",
            "suffix": "%",
            "hidden": True,
        }
        self.headers["% AT"] = {
            "rid": "pct_AT",
            "description": "high AT index (of genome)",
            "scale": "YlGn",
            "suffix": "%",
            "hidden": True,
        }
        self.headers["Het dist"] = {
            "description":
            "mean distance between heterozygous SNPs (in kilobases)",
            "modify": lambda x: x / 1000.0,
            "suffix": "Kb",
            "scale": "YlGn",
            "format": "{:,.0f}",
            "hidden": True,
        }
        self.headers["p10"] = {
            "description": "molecule count extending 10 kb on both sides",
            "scale": "BuGn",
            "hidden": True,
        }
        self.headers["% missing BC"] = {
            "rid": "pct_missing_BC",
            "description": "fraction of reads that are not barcoded",
            "suffix": "%",
            "scale": "BuGn",
        }
        self.headers["Barcode N50"] = {
            "description": "N50 reads per barcode (in bases)",
            "suffix": "b",
            "scale": "BuGn",
            "format": "{:,.0f}",
        }
        self.headers["% Phased"] = {
            "rid": "pct_Phased",
            "description": "nonduplicate and phased reads; ideal 45-50%",
            "suffix": "%",
            "scale": "BuGn",
            "hidden": True,
        }

        # Per-sample parsed data, keyed by cleaned sample name
        reports = OrderedDict()
        summaries = OrderedDict()
        molecules = OrderedDict()
        kmers = OrderedDict()
        # Maps a results directory to its sample ID, so that the histogram
        # JSON files (which carry no sample ID) can be matched to a sample
        root_summary = {}

        ### Parse the input log files
        # report.txt files
        for f in self.find_log_files("supernova/report"):
            log.debug("Found report in: {}".format(f["root"]))
            sid, data = self.parse_report(f["f"])
            s_name = self.clean_s_name(sid, f)
            if s_name in reports.keys():
                log.debug(
                    "Duplicate sample name found! Overwriting: {}".format(
                        s_name))
            reports[s_name] = data
            self.add_data_source(f, s_name=s_name, section="supernova-table")

        # summary.json files
        for f in self.find_log_files("supernova/summary"):
            log.debug("Found summary.json in: {}".format(f["root"]))
            try:
                sid, data = self.parse_summary(f["f"])
            except ValueError:
                log.debug("Error parsing JSON file in {}".format(f["root"]))
                continue
            except RuntimeError:
                log.debug("Could not find sample_id in JSON file in {}".format(
                    f["root"]))
                continue

            s_name = self.clean_s_name(sid, f)
            if s_name in summaries.keys():
                log.debug(
                    "Duplicate sample name found! Overwriting: {}".format(
                        s_name))
            summaries[s_name] = data
            self.add_data_source(f, s_name=s_name, section="supernova-table")
            # The plot json files do not contain sample IDs, sadly. So we need to store it somewhere.
            root_summary[f["root"]] = sid

        # histogram_molecules.json files
        # Only parsed when a matching summary.json was seen in the same
        # directory (that is where the sample ID comes from)
        for f in self.find_log_files("supernova/molecules"):
            log.debug("Found histogram_molecules.json in: {}".format(
                f["root"]))
            try:
                if f["root"] in root_summary.keys():
                    data = self.parse_histogram(f["f"])
                    sid = root_summary[f["root"]]
                    s_name = self.clean_s_name(sid, f)
                    molecules[s_name] = data
                    self.add_data_source(f,
                                         s_name=s_name,
                                         section="supernova-molecules")
            except RuntimeError:
                log.debug("Could not parse JSON file in {}".format(f["root"]))
                continue

        # histogram_kmer_count.json files
        for f in self.find_log_files("supernova/kmers"):
            log.debug("Found histogram_kmer_count.json in: {}".format(
                f["root"]))
            try:
                if f["root"] in root_summary.keys():
                    data = self.parse_histogram(f["f"], 400)
                    sid = root_summary[f["root"]]
                    s_name = self.clean_s_name(sid, f)
                    kmers[s_name] = data
                    self.add_data_source(f,
                                         s_name=s_name,
                                         section="supernova-kmers")
            except RuntimeError:
                log.debug("Could not parse JSON file in {}".format(f["root"]))
                continue

        # Data from summary.json supersedes data from report.txt
        for sample_id, sum_data in summaries.items():
            if sample_id in reports.keys():
                log.debug(
                    "Found summary data for sample {} which supersedes report data"
                    .format(sample_id))
                reports[sample_id] = sum_data
        # Ignore cmd-line specified samples
        reports = self.ignore_samples(reports)
        molecules = self.ignore_samples(molecules)
        kmers = self.ignore_samples(kmers)

        if len(reports) == 0:
            raise UserWarning
        else:
            log.info("Found {} reports".format(len(reports.keys())))

        ### Write the report
        self.write_data_file(reports, "multiqc_supernova")
        config_table = {"id": "supernova_table", "namespace": "supernova"}
        self.add_section(
            name="Assembly statistics",
            anchor="supernova-table",
            description=
            "Statistics gathered from the summary report(s) of Supernova. Note! "
            "There are more columns available but they are hidden by default.",
            helptext=
            "As a bare minimum these numbers are generated from the file report.txt, "
            "found in the folder `sampleID/outs/`. If available the stats in the report "
            "file will be superseded by the higher precision numbers found in the file "
            "`sampleID/outs/assembly/stats/summary.json`",
            plot=table.plot(reports, self.headers, config_table),
        )

        # N50 barcharts
        # One category dict per data_labels entry; `reports` is passed once
        # per chart below
        n50_cats = [
            {
                "Scaff N50": {
                    "name": "Scaffold N50",
                    "color": "#66c2a5"
                }
            },
            {
                "Contig N50": {
                    "name": "Contig N50",
                    "color": "#fc8d62"
                }
            },
            {
                "Edge N50": {
                    "name": "Edge N50",
                    "color": "#8da0cb"
                }
            },
            {
                "Phase N50": {
                    "name": "Phase block N50",
                    "color": "#e78ac3"
                }
            },
        ]
        config_n50 = {
            "id":
            "supernova_n50",
            "title":
            "Supernova: N50 statistics",
            "ylab":
            "Scaffold N50",
            "cpswitch":
            False,
            "data_labels":
            ["Scaffold N50", "Contig N50", "Edge N50", "Phase block N50"],
        }
        self.add_section(
            name="N50 statistics",
            anchor="supernova-n50",
            description=
            "Assembly N50 values - the shortest sequence length at 50% of the genome when sorted by size (see [wikipedia](https://en.wikipedia.org/wiki/N50,_L50,_and_related_statistics#N50)).",
            helptext=
            "Note that assembly size and N50 values are computed after removing scaffolds &le; 10 kb and do not count `N`s: \n\n"
            "* **Scaffold N50** - N50 size of scaffolds in bases, \n"
            "* **Contig N50** - N50 size of contigs in bases, \n"
            "* **Edge N50** - N50 size of raw graph assembly edges in bases, \n"
            "* **Phase block N50** - N50 size of phase blocks in bases. \n\n"
            "[(source)](https://support.10xgenomics.com/de-novo-assembly/software/pipelines/latest/output/asm-stats)",
            plot=bargraph.plot([reports, reports, reports, reports], n50_cats,
                               config_n50),
        )

        # Conditional sections
        if len(molecules) > 0:
            # Remove the long tail, or fail if this is a legacy empty json file
            try:
                max_x = self.trim_tail(molecules, 100000)
            except IndexError:
                log.debug(
                    "The histogram file is empty. Skipping molecule length section"
                )
                # NOTE(review): returning here also skips the k-mer section
                # below - confirm that is intended
                return
            # Add molecules plot
            config_molecules = {
                "id": "supernova_molecules",
                "title": "Supernova: Molecule Lengths",
                "xlab": "Inferred molecule length (bp)",
                "ylab": "# molecules",
                "smooth_points": 300,
                "smooth_points_sumcounts": True,
                "xmax": max_x,
            }
            self.add_section(
                name="Molecule Lengths",
                anchor="supernova-molecules",
                description=
                "Shows the inferred molecule lengths of the input 10X library.",
                helptext=
                "Inferred in the `patch` step of the Supernova pipeline. It is worth "
                "keeping in mind that the mean molecule length from the report is a length-weighted mean. "
                "See the [source code](https://github.com/10XGenomics/supernova/search?q=lw_mean_mol_len&type=) "
                "for how this value is calculated.",
                plot=linegraph.plot(molecules, config_molecules),
            )
        if len(kmers) > 0:
            # Remove the long tail, or fail if this is a legacy empty json file
            try:
                max_x = self.trim_tail(kmers, 50)
            except IndexError:
                log.debug(
                    "The histogram file is empty. Skipping kmers section")
                return
            # Add kmers plot
            config_kmers = {
                "id": "supernova_kmers",
                "title": "Supernova: Kmer Counts",
                "xlab": "Filtered kmer multiplicity",
                "ylab": "Counts",
                "smooth_points_sumcounts": False,
                "xmax": max_x,
            }
            self.add_section(
                name="K-mer counts",
                anchor="supernova-kmers",
                description=
                "Shows the k-mer frequencies of the input data to Supernova (after filtering).",
                helptext=
                "This data is generated from k-merizing the input read data, where the sequences are "
                "transformed in to the set of all possible sub-sequences of a fixed length of `K` (Supernova uses `K=48`). "
                "The plot shows on the x-axis the multiplicity (i.e. how many times are they repeated) of these k-mers "
                "and the y-axis the number of k-mers at this level of multiplicity. "
                "A careful reading of this plot can give some insights into the levels of heterozygosity and repeats "
                "in the genome that was sequenced and indications if the sequencing experiment was successful.",
                plot=linegraph.plot(kmers, config_kmers),
            )
Example #43
0
    def parse_bcftools_stats(self):
        """
        Find bcftools stats logs and parse their data
          Bcftools stats reports contain 'sets' of data, which can
          have multiple vcf files each (but usually don't). Here,
          we treat each 'set' as a MultiQC sample, taking the first
          input filename for each set as the name.
        """
        collapse_complementary = getattr(config, 'bcftools', {}).get('collapse_complementary_changes', False)
        if collapse_complementary:
            types = ['A>C', 'A>G', 'A>T', 'C>A', 'C>G', 'C>T']
        else:
            types = ['A>C', 'A>G', 'A>T', 'C>A', 'C>G', 'C>T',
                     'G>A', 'G>C', 'G>T', 'T>A', 'T>C', 'T>G']

        self.bcftools_stats = dict()
        self.bcftools_stats_indels = dict()
        self.bcftools_stats_vqc_snp = dict()
        self.bcftools_stats_vqc_transi = dict()
        self.bcftools_stats_vqc_transv = dict()
        self.bcftools_stats_vqc_indels = dict()
        depth_data = dict()
        for f in self.find_log_files('bcftools/stats'):
            s_names = list()
            for line in f['f'].splitlines():
                s = line.split("\t")
                # Get the sample names - one per 'set'
                if s[0] == "ID":
                    s_name = self.clean_s_name(s[2], f['root'])
                    s_names.append(s_name)
                    if s_name in self.bcftools_stats:
                        log.debug("Duplicate sample name found! Overwriting: {}".format(s_name))
                    self.add_data_source(f, s_name, section='stats')
                    self.bcftools_stats[s_name] = dict()
                    self.bcftools_stats_indels[s_name] = dict()
                    self.bcftools_stats_vqc_snp[s_name] = dict()
                    self.bcftools_stats_vqc_transi[s_name] = dict()
                    self.bcftools_stats_vqc_transv[s_name] = dict()
                    self.bcftools_stats_vqc_indels[s_name] = dict()
                    depth_data[s_name] = OrderedDict()
                    self.bcftools_stats_indels[s_name][0] = None # Avoid joining line across missing 0

                # Parse key stats
                if s[0] == "SN" and len(s_names) > 0:
                    s_name = s_names[int(s[1])]
                    field = s[2].strip()[:-1]
                    field = field.replace(' ', '_')
                    value = float(s[3].strip())
                    self.bcftools_stats[s_name][field] = value

                # Parse transitions/transversions stats
                if s[0] == "TSTV" and len(s_names) > 0:
                    s_name = s_names[int(s[1])]
                    fields = ['ts', 'tv', 'tstv', 'ts_1st_ALT', 'tv_1st_ALT', 'tstv_1st_ALT']
                    for i, f in enumerate(fields):
                        value = float(s[i+2].strip())

                        self.bcftools_stats[s_name][f] = value

                # Parse substitution types
                if s[0] == "ST" and len(s_names) > 0:
                    s_name = s_names[int(s[1])]

                    rc = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
                    change = s[2].strip()
                    if change not in types:
                        change = '>'.join(rc[n] for n in change.split('>'))

                    field = 'substitution_type_{}'.format(change)
                    value = float(s[3].strip())
                    if field not in self.bcftools_stats[s_name]:
                        self.bcftools_stats[s_name][field] = 0
                    self.bcftools_stats[s_name][field] += value

                # Indel length distributions
                if s[0] == "IDD" and len(s_names) > 0:
                    s_name = s_names[int(s[1])]
                    length = float(s[2].strip())
                    count = float(s[3].strip())
                    self.bcftools_stats_indels[s_name][length] = count

                # Per-sample counts
                if s[0] == "PSC" and len(s_names) > 0:
                    s_name = s_names[int(s[1])]
                    fields = ['variations_hom', 'variations_het']
                    for i, f in enumerate(fields):
                        self.bcftools_stats[s_name][f] = int(s[i + 4].strip())

                # Depth plots
                if s[0] == "DP" and len(s_names) > 0:
                    s_name = s_names[int(s[1])]
                    bin_name = s[2].strip()
                    percent_sites = float(s[-1].strip())
                    depth_data[s_name][bin_name] = percent_sites

                # Variant Qualities
                if s[0] == "QUAL" and len(s_names) > 0:
                    s_name = s_names[int(s[1])]
                    quality = float(s[2].strip())
                    self.bcftools_stats_vqc_snp[s_name][quality] = float(s[3].strip())
                    self.bcftools_stats_vqc_transi[s_name][quality] = float(s[4].strip())
                    self.bcftools_stats_vqc_transv[s_name][quality] = float(s[5].strip())
                    self.bcftools_stats_vqc_indels[s_name][quality] = float(s[6].strip())

        # Filter to strip out ignored sample names
        self.bcftools_stats = self.ignore_samples(self.bcftools_stats)

        if len(self.bcftools_stats) > 0:

            # Write parsed report data to a file
            self.write_data_file(self.bcftools_stats, 'multiqc_bcftools_stats')

            # Stats Table
            stats_headers = self.bcftools_stats_genstats_headers()
            if getattr(config, 'bcftools', {}).get('write_general_stats', True):
                self.general_stats_addcols(self.bcftools_stats, stats_headers, 'Bcftools Stats')
            if getattr(config, 'bcftools', {}).get('write_separate_table', False):
                self.add_section(
                    name='Bcftools Stats',
                    anchor='bcftools-stats',
                    plot=table.plot(self.bcftools_stats, stats_headers))

            # Make bargraph plot of substitution types
            keys = OrderedDict()
            for t in types:
                keys['substitution_type_{}'.format(t)] = {'name': t}
            pconfig = {
                'id': 'bcftools-stats-subtypes',
                'title': 'Bcftools Stats: Substitutions',
                'ylab': '# Substitutions',
                'cpswitch_counts_label': 'Number of Substitutions'
            }
            self.add_section (
                name = 'Variant Substitution Types',
                anchor = 'bcftools-stats',
                plot = bargraph.plot(self.bcftools_stats, keys, pconfig)
            )

            # Make histograms of variant quality
            if len(self.bcftools_stats_vqc_snp) > 0:
                pconfig = {
                    'id': 'bcftools_stats_vqc',
                    'title': 'Bcftools Stats: Variant Quality Count',
                    'ylab': 'Count',
                    'xlab': 'Quality',
                    'xDecimals': False,
                    'ymin': 0,
                    'smooth_points': 600,
                    # 'tt_label': '<b>{point.x} bp trimmed</b>: {point.y:.0f}',
                    'data_labels': [
                        {'name': 'Count SNP', 'ylab': 'Quality'},
                        {'name': 'Count Transitions', 'ylab': 'Quality'},
                        {'name': 'Count Transversions', 'ylab': 'Quality'},
                        {'name': 'Count Indels', 'ylab': 'Quality'}
                    ]
                }
                self.add_section (
                    name = 'Variant Quality',
                    anchor = 'bcftools-stats_variant_quality_plot',
                    plot = linegraph.plot (
                        [self.bcftools_stats_vqc_snp,
                        self.bcftools_stats_vqc_transi,
                        self.bcftools_stats_vqc_transv,
                        self.bcftools_stats_vqc_indels], pconfig)
                )

            # Make line graph of indel lengths
            if len(self.bcftools_stats_indels) > 0:
                pconfig = {
                    'id': 'bcftools_stats_indel-lengths',
                    'title': 'Bcftools Stats: Indel Distribution',
                    'ylab': 'Count',
                    'xlab': 'InDel Length (bp)',
                    'xDecimals': False,
                    'ymin': 0,
                }
                self.add_section (
                    name = 'Indel Distribution',
                    anchor = 'bcftools-stats_indel_plot',
                    plot = linegraph.plot(self.bcftools_stats_indels, pconfig)
                )
            # Make line graph of variants per depth
            if len(depth_data) > 0:
                pconfig = {
                    'id': 'bcftools_stats_depth',
                    'title': 'Bcftools Stats: Variant depths',
                    'ylab': 'Fraction of sites (%)',
                    'xlab': 'Variant depth',
                    'ymin': 0,
                    'ymax': 100,
                    'categories': True
                }
                self.add_section (
                    name = 'Variant depths',
                    anchor = 'bcftools-stats_depth_plot',
                    description = 'Read depth support distribution for called variants',
                    plot = linegraph.plot(depth_data, pconfig)
                )

        # Return the number of logs that were found
        return len(self.bcftools_stats)
Example #44
0
    def overrepresented_sequences(self):
        """Sum the percentages of overrepresented sequences and display them in a bar plot.

        For each sample, reports the percentage of reads made up by the single
        most over-represented sequence ('top') and the sum of all remaining
        over-represented sequences. Adds a bar plot section, or an info alert
        when every sample is below 1%.
        """

        data = dict()
        for s_name in self.fastqc_data:
            data[s_name] = dict()
            try:
                # Collect the percentages once and derive both stats from them
                pcnts = [
                    float(d['percentage'])
                    for d in self.fastqc_data[s_name]['overrepresented_sequences']
                ]
                # max(..., default=0) guards against an empty list, which would
                # otherwise raise an uncaught ValueError
                max_pcnt = max(pcnts, default=0)
                total_pcnt = sum(pcnts)
                data[s_name]['total_overrepresented'] = total_pcnt
                data[s_name]['top_overrepresented'] = max_pcnt
                data[s_name]['remaining_overrepresented'] = total_pcnt - max_pcnt
            except KeyError:
                # No overrepresented_sequences data for this sample. That is
                # expected when the FastQC module status is 'pass' (nothing was
                # over-represented); anything else means the report is missing data.
                if self.fastqc_data[s_name]['statuses']['overrepresented_sequences'] == 'pass':
                    data[s_name]['total_overrepresented'] = 0
                    data[s_name]['top_overrepresented'] = 0
                    data[s_name]['remaining_overrepresented'] = 0
                else:
                    log.debug("Couldn't find data for {}, invalid Key".format(
                        s_name))

        cats = OrderedDict()
        cats['top_overrepresented'] = {'name': 'Top over-represented sequence'}
        cats['remaining_overrepresented'] = {
            'name': 'Sum of remaining over-represented sequences'
        }

        # Config for the plot
        pconfig = {
            # Fixed typo in the plot ID ('sequencesi' -> 'sequences')
            'id': 'fastqc_overrepresented_sequences_plot',
            'title': 'FastQC: Overrepresented sequences',
            'ymin': 0,
            'yCeiling': 100,
            'yMinRange': 20,
            'tt_decimals': 2,
            'tt_suffix': '%',
            'tt_percentages': False,
            'ylab_format': '{value}%',
            'cpswitch': False,
            'ylab': 'Percentage of Total Sequences'
        }

        # Check if any samples have more than 1% overrepresented sequences, else don't make plot.
        # .get() with a default tolerates samples that hit the log.debug branch
        # above, and max(default=0) tolerates an empty data dict - both would
        # previously crash with KeyError / ValueError.
        max_total = max(
            (d.get('total_overrepresented', 0) for d in data.values()), default=0)
        if max_total < 1:
            plot_html = '<div class="alert alert-info">{} samples had less than 1% of reads made up of overrepresented sequences</div>'.format(
                len(data))
        else:
            plot_html = bargraph.plot(data, cats, pconfig)

        self.add_section(
            name='Overrepresented sequences',
            anchor='fastqc_overrepresented_sequences',
            description=(
                'The total amount of overrepresented sequences found in each library. '
                # Fixed typo: target="_bkank" -> "_blank"
                'See the <a href="http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/3%20Analysis%20Modules/9%20Overrepresented%20Sequences.html" target="_blank">FastQC help for further information</a>.'),
            plot=plot_html)
Example #45
0
    def parse_samtools_idxstats(self):
        """ Find Samtools idxstats logs and parse their data.

        Builds per-sample, per-chromosome mapped read counts, writes them to
        the report data file and adds two sections: an X/Y counts bar plot and
        a mapped-reads-per-contig line plot. Returns the number of samples.
        """

        # Parsed counts: {sample_name: {chromosome: mapped_read_count}}
        self.samtools_idxstats = dict()
        for f in self.find_log_files('samtools/idxstats'):
            parsed_data = parse_single_report(f['f'])
            if len(parsed_data) > 0:
                if f['s_name'] in self.samtools_idxstats:
                    log.debug(
                        "Duplicate sample name found! Overwriting: {}".format(
                            f['s_name']))
                self.add_data_source(f, section='idxstats')
                self.samtools_idxstats[f['s_name']] = parsed_data

        # Filter to strip out ignored sample names
        self.samtools_idxstats = self.ignore_samples(self.samtools_idxstats)

        if len(self.samtools_idxstats) > 0:

            # Write parsed report data to a file (restructure first)
            self.write_data_file(self.samtools_idxstats,
                                 'multiqc_samtools_idxstats')

            # Prep the data for the plots
            keys = list()  # chromosomes that pass the cutoff, in discovery order
            pdata = dict()  # raw counts per sample / chromosome
            pdata_norm = dict()  # counts normalised by each sample's total
            xy_counts = dict()  # per-sample X / Y chromosome counts
            # Count the total mapped reads for every chromosome
            chrs_mapped = defaultdict(lambda: 0)
            sample_mapped = defaultdict(lambda: 0)
            total_mapped = 0
            # Cutoff, can be customised in config
            cutoff = float(
                getattr(config, 'samtools_idxstats_fraction_cutoff', 0.001))
            if cutoff != 0.001:
                log.info('Setting idxstats cutoff to: {}%'.format(cutoff *
                                                                  100.0))
            # First pass: accumulate totals per chromosome, per sample and overall
            for s_name in self.samtools_idxstats:
                for chrom in self.samtools_idxstats[s_name]:
                    chrs_mapped[chrom] += self.samtools_idxstats[s_name][chrom]
                    sample_mapped[s_name] += self.samtools_idxstats[s_name][
                        chrom]
                    total_mapped += self.samtools_idxstats[s_name][chrom]
            # A chromosome is plotted if it holds at least this share of all
            # mapped reads, or is explicitly whitelisted in the config
            req_reads = float(total_mapped) * cutoff
            chr_always = getattr(config, 'samtools_idxstats_always', [])
            if len(chr_always) > 0:
                log.info('Trying to include these chromosomes in idxstats: {}'.
                         format(', '.join(chr_always)))
            chr_ignore = getattr(config, 'samtools_idxstats_ignore', [])
            if len(chr_ignore) > 0:
                log.info(
                    'Excluding these chromosomes from idxstats: {}'.format(
                        ', '.join(chr_ignore)))
            # Optional explicit names for the X / Y chromosomes; when falsy we
            # fall back to case-insensitive 'x'/'chrx' and 'y'/'chry' matching
            xchr = getattr(config, 'samtools_idxstats_xchr', False)
            if xchr:
                log.info('Using "{}" as X chromosome name'.format(xchr))
            ychr = getattr(config, 'samtools_idxstats_ychr', False)
            if ychr:
                log.info('Using "{}" as Y chromosome name'.format(ychr))
            # Go through again and collect all of the keys that have enough counts
            # Also get the X/Y counts if we find them
            for s_name in self.samtools_idxstats:
                x_count = False
                y_count = False
                for chrom in self.samtools_idxstats[s_name]:
                    if float(chrs_mapped[chrom]
                             ) > req_reads or chrom in chr_always:
                        if chrom not in chr_ignore and chrom not in keys:
                            keys.append(chrom)
                    # Collect X and Y counts if we have them
                    mapped = self.samtools_idxstats[s_name][chrom]
                    if xchr is not False:
                        if str(xchr) == str(chrom):
                            x_count = mapped
                    else:
                        if chrom.lower() == 'x' or chrom.lower() == 'chrx':
                            x_count = mapped
                    if ychr is not False:
                        if str(ychr) == str(chrom):
                            y_count = mapped
                    else:
                        if chrom.lower() == 'y' or chrom.lower() == 'chry':
                            y_count = mapped
                # Only save these counts if we have both x and y
                # NOTE(review): a genuine count of 0 is falsy just like the
                # False sentinel, so a sample with zero X- or Y-mapped reads is
                # silently dropped from the XY plot - confirm this is intended.
                if x_count and y_count:
                    xy_counts[s_name] = {'x': x_count, 'y': y_count}
            # Ok, one last time. We have the chromosomes that we want to plot,
            # now collect the counts
            for s_name in self.samtools_idxstats:
                pdata[s_name] = OrderedDict()
                pdata_norm[s_name] = OrderedDict()
                for k in keys:
                    try:
                        pdata[s_name][k] = self.samtools_idxstats[s_name][k]
                        pdata_norm[s_name][k] = float(
                            self.samtools_idxstats[s_name]
                            [k]) / sample_mapped[s_name]
                    except (KeyError, ZeroDivisionError):
                        # Chromosome missing for this sample (or sample has no
                        # mapped reads at all) - plot it as zero
                        pdata[s_name][k] = 0
                        pdata_norm[s_name][k] = 0

            # X/Y ratio plot
            if len(xy_counts) > 0:
                xy_keys = OrderedDict()
                xy_keys['x'] = {'name': xchr if xchr else 'Chromosome X'}
                xy_keys['y'] = {'name': ychr if ychr else 'Chromosome Y'}
                pconfig = {
                    'id': 'samtools-idxstats-xy-plot',
                    'title': 'Samtools idxstats: chrXY mapped reads',
                    'ylab': 'Percent of X+Y Reads',
                    'cpswitch_counts_label': 'Number of Reads',
                    'cpswitch_percent_label': 'Percent of X+Y Reads',
                    'cpswitch_c_active': False
                }
                self.add_section(name='XY counts',
                                 anchor='samtools-idxstats-xy-counts',
                                 plot=bargraph.plot(xy_counts, xy_keys,
                                                    pconfig))

            # Mapped reads per chr line plot
            # (two datasets: normalised fractions shown first, then raw counts)
            pconfig = {
                'id':
                'samtools-idxstats-mapped-reads-plot',
                'title':
                'Samtools idxstats: Mapped reads per contig',
                'ylab':
                '# mapped reads',
                'xlab':
                'Chromosome Name',
                'categories':
                True,
                'tt_label':
                '<strong>{point.category}:</strong> {point.y:.2f}',
                'data_labels': [{
                    'name': 'Normalised Counts',
                    'ylab': 'Fraction of total count'
                }, {
                    'name': 'Counts',
                    'ylab': '# mapped reads'
                }]
            }
            self.add_section(
                name='Mapped reads per contig',
                anchor='samtools-idxstats',
                description=
                'The <code>samtools idxstats</code> tool counts the number of mapped reads per chromosome / contig. '
                +
                'Chromosomes with &lt; {}% of the total aligned reads are omitted from this plot.'
                .format(cutoff * 100),
                plot=linegraph.plot([pdata_norm, pdata], pconfig))

        # Return the number of logs that were found
        return len(self.samtools_idxstats)
Example #46
0
    def __init__(self, c_id, mod):
        """Create a custom-content MultiQC module from a parsed config block.

        Args:
            c_id: ID string for this custom content block.
            mod: Dict with 'config' (section / plot settings) and 'data'
                (plot data, or a raw HTML string).
        """

        # Section name falls back to a title-cased ID, then a generic label
        modname = mod['config'].get('section_name',
                                    c_id.replace('_', ' ').title())
        if modname == '' or modname is None:
            modname = 'Custom Content'

        # Initialise the parent object
        super(MultiqcModule,
              self).__init__(name=modname,
                             anchor=mod['config'].get('section_anchor', c_id),
                             href=mod['config'].get('section_href'),
                             info=mod['config'].get('description'))

        pconfig = mod['config'].get('pconfig', {})
        if pconfig.get('title') is None:
            pconfig['title'] = modname

        # Dispatch on the requested plot type. Look it up once rather than
        # re-fetching it from the config in every branch.
        plot_type = mod['config'].get('plot_type')

        # Table
        if plot_type == 'table':
            pconfig['sortRows'] = pconfig.get('sortRows', False)
            headers = mod['config'].get('headers')
            self.add_section(plot=table.plot(mod['data'], headers, pconfig))
            self.write_data_file(
                mod['data'],
                "multiqc_{}".format(modname.lower().replace(' ', '_')))

        # Bar plot
        elif plot_type == 'bargraph':
            self.add_section(plot=bargraph.plot(
                mod['data'], mod['config'].get('categories'), pconfig))

        # Line plot
        elif plot_type == 'linegraph':
            self.add_section(plot=linegraph.plot(mod['data'], pconfig))

        # Scatter plot
        elif plot_type == 'scatter':
            self.add_section(plot=scatter.plot(mod['data'], pconfig))

        # Heatmap
        elif plot_type == 'heatmap':
            self.add_section(plot=heatmap.plot(mod['data'], mod['config'].get(
                'xcats'), mod['config'].get('ycats'), pconfig))

        # Beeswarm plot
        elif plot_type == 'beeswarm':
            self.add_section(plot=beeswarm.plot(mod['data'], pconfig))

        # Raw HTML
        elif plot_type == 'html':
            self.add_section(content=mod['data'])

        # Raw image file as html
        elif plot_type == 'image':
            self.add_section(content=mod['data'])

        # Not supplied - warn, but do not fail
        # (use 'is None', not '== None', per PEP 8 singleton comparison)
        elif plot_type is None:
            log.warning("Plot type not found for content ID '{}'".format(c_id))

        # Not recognised
        else:
            log.warning(
                "Error - custom content plot type '{}' not recognised for content ID {}"
                .format(plot_type, c_id))
def parse_reports(self):
    """ Find Picard TargetedPcrMetrics reports and parse their data """

    # Parsed metrics, keyed by sample name
    self.picard_pcrmetrics_data = dict()
    self.picard_pcrmetrics_samplestats = dict()

    # Walk through the logs, extracting one metrics table per sample
    for f in self.find_log_files('picard/pcr_metrics', filehandles=True):
        s_name = None
        for line in f['f']:
            # A new log starts: reset and pull the sample name from INPUT
            if 'TargetedPcrMetrics' in line and 'INPUT' in line:
                s_name = None
                m = re.search(r"INPUT(?:=|\s+)(\[?[^\s]+\]?)",
                              line,
                              flags=re.IGNORECASE)
                if m:
                    s_name = os.path.basename(m.group(1).strip('[]'))
                    s_name = self.clean_s_name(s_name, f['root'])

            # Ignore everything until we know which sample we're in
            if s_name is None:
                continue

            # Metrics table: a header line followed by one line of values
            if 'TargetedPcrMetrics' in line and '## METRICS CLASS' in line:
                keys = f['f'].readline().strip("\n").split("\t")
                vals = f['f'].readline().strip("\n").split("\t")
                if len(vals) != len(keys):
                    continue
                if s_name in self.picard_pcrmetrics_data:
                    log.debug(
                        "Duplicate sample name found in {}! Overwriting: {}"
                        .format(f['fn'], s_name))
                self.add_data_source(f,
                                     s_name,
                                     section='TargetedPcrMetrics')
                parsed = dict()
                for key, val in zip(keys, vals):
                    try:
                        value = float(val)
                        # Multiply percentages by 100
                        if key.startswith('PCT_'):
                            value *= 100.0
                        parsed[key] = value
                    except ValueError:
                        # Non-numeric field - keep the raw string
                        parsed[key] = val
                self.picard_pcrmetrics_data[s_name] = parsed

    # Filter to strip out ignored sample names
    self.picard_pcrmetrics_data = self.ignore_samples(
        self.picard_pcrmetrics_data)

    if len(self.picard_pcrmetrics_data) > 0:

        # Write parsed data to a file
        self.write_data_file(self.picard_pcrmetrics_data,
                             'multiqc_picard_pcrmetrics')

        # Add to general stats table
        self.general_stats_headers['PCT_AMPLIFIED_BASES'] = {
            'title': '% Amplified Bases',
            'description':
            'The fraction of aligned bases that mapped to or near an amplicon.',
            'min': 0,
            'max': 100,
            'suffix': '%',
            'scale': 'BrBG'
        }
        self.general_stats_headers['MEDIAN_TARGET_COVERAGE'] = {
            'title': 'Median Target Coverage',
            'description':
            'The median coverage of reads that mapped to target regions of an experiment.',
            'min': 0,
            'suffix': 'X',
            'scale': 'GnBu',
        }

        for s_name, metrics in self.picard_pcrmetrics_data.items():
            self.general_stats_data.setdefault(s_name, dict()).update(metrics)

        # Bar plot of on-/near-/off-amplicon bases
        keys = OrderedDict()
        keys['ON_AMPLICON_BASES'] = {'name': 'On-amplicon bases'}
        keys['NEAR_AMPLICON_BASES'] = {'name': 'Near-amplicon bases'}
        keys['OFF_AMPLICON_BASES'] = {
            'name': 'Off-amplicon bases',
            'color': '#f28f43'
        }

        # Config for the plot
        pconfig = {
            'id': 'picard_pcr_metrics_bases',
            'title': 'Picard: PCR Amplicon Bases',
            'ylab': '# Bases',
            'cpswitch_counts_label': '# Bases',
            'hide_zero_cats': False
        }

        self.add_section(
            name='PCR Amplicon Bases',
            anchor='picard-pcrmetrics-bases',
            description=
            'Metrics about reads obtained from targeted PCR experiments.',
            helptext='''
            This plot shows the number of bases aligned on or near to amplified regions of the genome.

            * `ON_AMPLICON_BASES`: The number of `PF_BASES_ALIGNED` that mapped to an amplified region of the genome.
            * `NEAR_AMPLICON_BASES`: The number of `PF_BASES_ALIGNED` that mapped to within a fixed interval of an amplified region, but not on a baited region.
            * `OFF_AMPLICON_BASES`: The number of `PF_BASES_ALIGNED` that mapped neither on or near an amplicon.

            For more information see the [Picard documentation](https://broadinstitute.github.io/picard/picard-metric-definitions.html#TargetedPcrMetrics).''',
            plot=bargraph.plot(self.picard_pcrmetrics_data, keys, pconfig))

    # Return the number of detected samples to the parent module
    return len(self.picard_pcrmetrics_data)
Example #48
0
    def __init__(self):
        """Set up the mosdepth module and add its report sections."""

        # Initialise the parent object
        super(MultiqcModule, self).__init__(
            name="mosdepth",
            anchor="mosdepth",
            href="https://github.com/brentp/mosdepth",
            info="performs fast BAM/CRAM depth calculation for WGS, exome, or targeted sequencing",
        )

        dist_by_sample, cov_by_sample, x_max, perchrom_by_sample = self.parse_cov_dist()

        # Filter out any samples from --ignore-samples
        dist_by_sample = self.ignore_samples(dist_by_sample)
        cov_by_sample = self.ignore_samples(cov_by_sample)
        perchrom_by_sample = self.ignore_samples(perchrom_by_sample)

        # Bail out if every dataset came back empty
        num_samples = max(len(dist_by_sample), len(cov_by_sample), len(perchrom_by_sample))
        if num_samples == 0:
            raise UserWarning
        log.info("Found {} reports".format(num_samples))

        # Cumulative coverage distribution
        if dist_by_sample:
            dist_pconfig = {
                "id": "mosdepth-coverage-dist-id",
                "title": "Mosdepth: Coverage Distribution",
                "xlab": "Coverage (X)",
                "ylab": "% bases in genome/regions covered by at least X reads",
                "ymax": 100,
                "xmax": x_max,
                "tt_label": "<b>{point.x}X</b>: {point.y:.2f}%",
                "smooth_points": 500,
            }
            self.add_section(
                name="Coverage distribution",
                anchor="mosdepth-coverage-dist",
                description="Distribution of the number of locations in the reference genome with a given depth of coverage",
                helptext=genome_fraction_helptext,
                plot=linegraph.plot(dist_by_sample, dist_pconfig),
            )

        # Per-depth coverage histogram
        if cov_by_sample:
            cov_pconfig = {
                "id": "mosdepth-coverage-plot-id",
                "title": "Mosdepth: Coverage Depth",
                "xlab": "Coverage (X)",
                "ylab": "% bases in genome/regions covered at X reads",
                "ymax": 100,
                "xmax": x_max,
                "tt_label": "<b>{point.x}X</b>: {point.y:.2f}%",
                "smooth_points": 500,
            }
            self.add_section(
                name="Coverage plot",
                anchor="mosdepth-coverage-cov",
                description="Number of locations in the reference genome with a given depth of coverage",
                helptext=coverage_histogram_helptext,
                plot=linegraph.plot(cov_by_sample, cov_pconfig),
            )

        # Average coverage per contig: a line graph for multiple contigs,
        # a bar graph when there is only one
        if perchrom_by_sample:
            num_contigs = max(len(contigs) for contigs in perchrom_by_sample.values())
            if num_contigs > 1:
                perchrom_plot = linegraph.plot(
                    perchrom_by_sample,
                    {
                        "id": "mosdepth-coverage-per-contig",
                        "title": "Mosdepth: Coverage per contig",
                        "xlab": "region",
                        "ylab": "average coverage",
                        "categories": True,
                        "tt_decimals": 1,
                        "tt_suffix": "x",
                        "smooth_points": 500,
                    },
                )
            else:
                perchrom_plot = bargraph.plot(
                    perchrom_by_sample,
                    pconfig={
                        "id": "mosdepth-coverage-per-contig",
                        "title": "Mosdepth: Coverage per contig",
                        "xlab": "Sample",
                        "ylab": "Average coverage",
                        "tt_suffix": "x",
                    },
                )

            self.add_section(
                name="Average coverage per contig",
                anchor="mosdepth-coverage-per-contig-id",
                description="Average coverage per contig or chromosome",
                plot=perchrom_plot,
            )

        # General Stats columns derived from the distribution data
        if dist_by_sample:
            threshs, hidden_threshs = get_cov_thresholds()
            self.genstats_cov_thresholds(dist_by_sample, threshs, hidden_threshs)
            self.genstats_mediancov(dist_by_sample)
Example #49
0
    def parse_samtools_rmdup(self):
        """Find Samtools rmdup logs and parse their data.

        Populates self.samtools_rmdup with per-sample duplicate counts, writes
        the parsed data to the report, adds a bar plot section and a General
        Stats column. Returns the number of samples found.
        """

        self.samtools_rmdup = dict()
        # Example line to match:
        # [bam_rmdupse_core] 26602816 / 103563641 = 0.2569 in library '   '
        # Raw string fixes the invalid-escape DeprecationWarnings (\[, \d, \.)
        # of the old plain-string pattern; compiled once here instead of being
        # re-evaluated for every file and searched uncompiled on every line.
        dups_regex = re.compile(
            r"\[bam_rmdups?e?_core\] (\d+) / (\d+) = (\d+\.\d+) in library '(.*)'")
        for f in self.find_log_files("samtools/rmdup", filehandles=True):
            s_name = f["s_name"]
            for l in f["f"]:
                match = dups_regex.search(l)
                if match:
                    # Prefer the library name from the log line when present
                    library_name = match.group(4).strip()
                    if library_name != "":
                        s_name = library_name
                    if s_name in self.samtools_rmdup:
                        log.debug(
                            "Duplicate sample name found in {}! Overwriting: {}"
                            .format(f["fn"], s_name))
                    self.add_data_source(f, s_name)
                    n_dups = int(match.group(1))
                    n_tot = int(match.group(2))
                    self.samtools_rmdup[s_name] = {
                        "n_dups": n_dups,
                        "n_tot": n_tot,
                        "n_unique": n_tot - n_dups,
                        "pct_dups": float(match.group(3)) * 100,
                    }

        # Filter to strip out ignored sample names
        self.samtools_rmdup = self.ignore_samples(self.samtools_rmdup)

        if len(self.samtools_rmdup) > 0:
            # Write parsed report data to a file
            self.write_data_file(self.samtools_rmdup, "multiqc_samtools_rmdup")

            # Make a bar plot showing duplicates
            keys = OrderedDict()
            keys["n_unique"] = {"name": "Non-duplicated reads"}
            keys["n_dups"] = {"name": "Duplicated reads"}
            pconfig = {
                "id": "samtools_rmdup_plot",
                "title": "Samtools rmdup: Duplicate alignments",
                "ylab": "Number of reads",
                "yDecimals": False,
            }
            self.add_section(
                name="Duplicates removed",
                anchor="samtools-rmdup",
                plot=bargraph.plot(self.samtools_rmdup, keys, pconfig),
            )

            # Add a column to the General Stats table
            stats_headers = OrderedDict()
            stats_headers["pct_dups"] = {
                "title": "% Dups",
                "description": "Percent of duplicate alignments",
                "min": 0,
                "max": 100,
                "suffix": "%",
                "scale": "OrRd",
            }
            self.general_stats_addcols(self.samtools_rmdup, stats_headers)

        return len(self.samtools_rmdup)
Example #50
0
    def add_cc_section(self, c_id, mod):
        """Add a report section for one parsed custom-content block.

        Args:
            c_id: ID string for this custom content block.
            mod: Dict with 'config' (section / plot settings) and 'data'
                (plot data, or a raw HTML string).
        """

        # Section name falls back to a title-cased ID, then a generic label
        section_name = mod["config"].get("section_name",
                                         c_id.replace("_", " ").title())
        if section_name == "" or section_name is None:
            section_name = "Custom Content"

        section_description = mod["config"].get("description", "")

        pconfig = mod["config"].get("pconfig", {})
        if pconfig.get("id") is None:
            pconfig["id"] = f"{c_id}-plot"
        if pconfig.get("title") is None:
            pconfig["title"] = section_name

        plot = None
        content = None

        # Save the data if it's not a html string
        if not isinstance(mod["data"], str):
            self.write_data_file(mod["data"],
                                 "multiqc_{}".format(pconfig["id"]))
            pconfig["save_data_file"] = False

        # Dispatch on the requested plot type. Look it up once rather than
        # re-fetching it from the config in every branch.
        plot_type = mod["config"].get("plot_type")

        # Table
        if plot_type == "table":
            pconfig["sortRows"] = pconfig.get("sortRows", False)
            headers = mod["config"].get("headers")
            plot = table.plot(mod["data"], headers, pconfig)

        # Bar plot (samples sorted by name for a stable plot order)
        elif plot_type == "bargraph":
            mod["data"] = {k: v for k, v in sorted(mod["data"].items())}
            plot = bargraph.plot(mod["data"], mod["config"].get("categories"),
                                 pconfig)

        # Line plot
        elif plot_type == "linegraph":
            plot = linegraph.plot(mod["data"], pconfig)

        # Scatter plot
        elif plot_type == "scatter":
            plot = scatter.plot(mod["data"], pconfig)

        # Heatmap
        elif plot_type == "heatmap":
            plot = heatmap.plot(mod["data"], mod["config"].get("xcats"),
                                mod["config"].get("ycats"), pconfig)

        # Beeswarm plot
        elif plot_type == "beeswarm":
            plot = beeswarm.plot(mod["data"], pconfig)

        # Raw HTML
        elif plot_type == "html":
            content = mod["data"]

        # Raw image file as html
        elif plot_type == "image":
            content = mod["data"]

        # Not supplied - warn, but still add the (plot-less) section below
        # (use 'is None', not '== None', per PEP 8 singleton comparison)
        elif plot_type is None:
            log.warning("Plot type not found for content ID '{}'".format(c_id))

        # Not recognised
        else:
            log.warning(
                "Error - custom content plot type '{}' not recognised for content ID {}"
                .format(plot_type, c_id))

        # Don't use exactly the same title / description text as the main module
        if section_name == self.name:
            section_name = None
        if section_description == self.info:
            section_description = ""

        self.add_section(name=section_name,
                         anchor=c_id,
                         description=section_description,
                         plot=plot,
                         content=content)
Example #51
0
 def __map_pair_dup_read_chart(self, data_by_sample):
     """Add a two-tab bar graph of per-read-group mapping/pairing/duplication.

     Samples whose category counts do not add up to the read-group total
     are logged and excluded from the plot.
     """
     filtered = dict()
     for sample_id, stats in data_by_sample.items():
         total = stats["Total reads in RG"]

         # Sanity check 1: pairing categories must sum to the total
         pairing_sum = (stats["Not properly paired reads (discordant)"] +
                        stats["Properly paired reads"] +
                        stats["Singleton reads (itself mapped; mate unmapped)"] +
                        stats["Unmapped reads"])
         if pairing_sum != total:
             log.warning(
                 "sum of unpaired/discordant/proppaired/unmapped reads not matching total, "
                 "skipping mapping/paired percentages plot for: {}".format(sample_id))
             continue

         # Sanity check 2: duplication categories must sum to the total
         dup_sum = (
             stats["Number of unique & mapped reads (excl. duplicate marked reads)"] +
             stats["Number of duplicate marked reads"] +
             stats["Unmapped reads"])
         if dup_sum != total:
             log.warning(
                 "sum of unique/duplicate/unmapped reads not matching total, "
                 "skipping mapping/duplicates percentages plot for: {}".format(sample_id))
             continue

         filtered[sample_id] = stats

     # One category map per data-label tab of the bar graph
     cat_sets = [
         {
             "Number of unique & mapped reads (excl. duplicate marked reads)": {
                 "color": "#437bb1",
                 "name": "Unique",
             },
             "Number of duplicate marked reads": {
                 "color": "#f5a742",
                 "name": "Duplicated",
             },
             "Unmapped reads": {
                 "color": "#b1084c",
                 "name": "Unmapped",
             },
         },
         {
             "Properly paired reads": {
                 "color": "#099109",
                 "name": "Paired, properly",
             },
             "Not properly paired reads (discordant)": {
                 "color": "#c27a0e",
                 "name": "Paired, discordant",
             },
             "Singleton reads (itself mapped; mate unmapped)": {
                 "color": "#912476",
                 "name": "Singleton",
             },
             "Unmapped reads": {
                 "color": "#b1084c",
                 "name": "Unmapped",
             },
         },
     ]
     pconfig = {
         "id": "mapping_dup_percentage_plot",
         "title": "Dragen: Mapped/paired/duplicated reads per read group",
         "ylab": "Reads",
         "cpswitch_counts_label": "Reads",
         "data_labels": [
             {
                 "name": "Unique vs duplicated vs unmapped",
                 "ylab": "Reads",
                 "cpswitch_counts_label": "Reads",
             },
             {
                 "name": "Paired vs. discordant vs. singleton",
                 "ylab": "Reads",
                 "cpswitch_counts_label": "Reads",
             },
         ],
     }
     self.add_section(
         name="Mapped / paired / duplicated",
         anchor="dragen-mapped-paired-duplicated",
         description="Distribution of reads based on pairing, duplication and mapping.",
         plot=bargraph.plot([filtered, filtered], cat_sets, pconfig),
     )
Example #52
0
def parse_reports(self):
    """Find Picard RnaSeqMetrics reports, parse their data and add report
    sections (general stats columns, base-assignment bar plot, normalized
    gene-coverage line plot).

    Returns:
        int: number of samples with parsed RnaSeqMetrics data.
    """

    # Set up vars
    self.picard_RnaSeqMetrics_data = dict()
    self.picard_RnaSeqMetrics_histogram = dict()

    # Go through logs and find Metrics
    for f in self.find_log_files('picard/rnaseqmetrics', filehandles=True):
        s_name = None
        in_hist = False
        for l in f['f']:
            # Catch the histogram values (pos <tab> coverage lines)
            if s_name is not None and in_hist:
                try:
                    sections = l.split("\t")
                    pos = int(sections[0])
                    coverage = float(sections[1])
                    self.picard_RnaSeqMetrics_histogram[s_name][pos] = coverage
                except ValueError:
                    # Reset in case we have more in this log file
                    s_name = None
                    in_hist = False

            # New log starting
            if 'rnaseqmetrics' in l.lower() and 'INPUT' in l:
                s_name = None
                # Pull sample name from input
                # Raw string: the non-raw form produced the invalid escape
                # sequence '\[' (a SyntaxWarning on modern Python)
                fn_search = re.search(r"INPUT=\[?([^\s]+)\]?", l)
                if fn_search:
                    s_name = os.path.basename(fn_search.group(1))
                    s_name = self.clean_s_name(s_name, f['root'])

            if s_name is not None:
                if 'rnaseqmetrics' in l.lower() and '## METRICS CLASS' in l:
                    if s_name in self.picard_RnaSeqMetrics_data:
                        log.debug("Duplicate sample name found in {}! Overwriting: {}".format(f['fn'], s_name))
                    self.picard_RnaSeqMetrics_data[s_name] = dict()
                    self.picard_RnaSeqMetrics_histogram[s_name] = dict()
                    self.add_data_source(f, s_name, section='RnaSeqMetrics')
                    # Header line then value line, tab separated
                    keys = f['f'].readline().strip("\n").split("\t")
                    vals = f['f'].readline().strip("\n").split("\t")
                    for i, k in enumerate(keys):
                        # Multiply percentages by 100
                        if k.startswith('PCT_'):
                            try:
                                vals[i] = float(vals[i]) * 100.0
                            except (ValueError, IndexError):
                                pass
                        # Save the key:value pairs
                        try:
                            self.picard_RnaSeqMetrics_data[s_name][k] = float(vals[i])
                        except ValueError:
                            # Non-numeric value: keep the raw string
                            self.picard_RnaSeqMetrics_data[s_name][k] = vals[i]
                        except IndexError:
                            pass # missing data
                    # Calculate some extra numbers
                    if 'PF_BASES' in keys and 'PF_ALIGNED_BASES' in keys:
                        self.picard_RnaSeqMetrics_data[s_name]['PF_NOT_ALIGNED_BASES'] = \
                            self.picard_RnaSeqMetrics_data[s_name]['PF_BASES'] - self.picard_RnaSeqMetrics_data[s_name]['PF_ALIGNED_BASES']

            # Histogram header marks the start of coverage data
            if s_name is not None and 'normalized_position	All_Reads.normalized_coverage' in l:
                self.picard_RnaSeqMetrics_histogram[s_name] = dict()
                in_hist = True

        # Prune any samples with no parsed data (re-checked after each file)
        for key in list(self.picard_RnaSeqMetrics_data.keys()):
            if len(self.picard_RnaSeqMetrics_data[key]) == 0:
                self.picard_RnaSeqMetrics_data.pop(key, None)
        for s_name in list(self.picard_RnaSeqMetrics_histogram.keys()):
            if len(self.picard_RnaSeqMetrics_histogram[s_name]) == 0:
                self.picard_RnaSeqMetrics_histogram.pop(s_name, None)
                log.debug("Ignoring '{}' histogram as no data parsed".format(s_name))

    # Filter to strip out ignored sample names
    self.picard_RnaSeqMetrics_data = self.ignore_samples(self.picard_RnaSeqMetrics_data)

    if len(self.picard_RnaSeqMetrics_data) > 0:

        # Write parsed data to a file
        self.write_data_file(self.picard_RnaSeqMetrics_data, 'multiqc_picard_RnaSeqMetrics')

        # Add to general stats table
        GenStatsHeaders = OrderedDict()
        GenStatsHeaders['PCT_RIBOSOMAL_BASES'] = {
            'title': '% rRNA',
            'description': 'Percent of aligned bases overlapping ribosomal RNA regions',
            'max': 100,
            'min': 0,
            'suffix': '%',
            'scale': 'Reds',
        }
        GenStatsHeaders['PCT_MRNA_BASES'] = {
            'title': '% mRNA',
            'description': 'Percent of aligned bases overlapping UTRs and coding regions of mRNA transcripts',
            'max': 100,
            'min': 0,
            'suffix': '%',
            'scale': 'Greens',
        }
        self.general_stats_addcols(self.picard_RnaSeqMetrics_data, GenStatsHeaders)

        # Bar plot of bases assignment
        bg_cats = OrderedDict()
        bg_cats['CODING_BASES'] = { 'name': 'Coding' }
        bg_cats['UTR_BASES'] = { 'name': 'UTR' }
        bg_cats['INTRONIC_BASES'] = { 'name': 'Intronic' }
        bg_cats['INTERGENIC_BASES'] = { 'name': 'Intergenic' }
        bg_cats['RIBOSOMAL_BASES'] = { 'name': 'Ribosomal' }
        bg_cats['PF_NOT_ALIGNED_BASES'] = { 'name': 'PF not aligned' }
        self.add_section (
            name = 'RnaSeqMetrics Assignment',
            anchor = 'picard-rna-assignment',
            description = 'Number of bases in primary alignments that align to regions in the reference genome.',
            plot = bargraph.plot(self.picard_RnaSeqMetrics_data, bg_cats)
        )

        # Section with histogram plot
        if len(self.picard_RnaSeqMetrics_histogram) > 0:
            # Plot the data and add section
            pconfig = {
                'smooth_points': 500,
                'smooth_points_sumcounts': [True, False],
                'id': 'picard_rna_coverage',
                'title': 'Normalized Coverage',
                'ylab': 'Coverage',
                'xlab': 'Percent through gene',
                'xDecimals': False,
                'tt_label': '<b>{point.x}%</b>: {point.y:.0f}',
                'ymin': 0,
            }
            self.add_section (
                name = 'Gene Coverage',
                anchor = 'picard-rna-coverage',
                plot = linegraph.plot(self.picard_RnaSeqMetrics_histogram, pconfig)
            )


    # Return the number of detected samples to the parent module
    return len(self.picard_RnaSeqMetrics_data)
Example #53
0
    def __init__(self):
        """Parse ngs-bits MappingQC qcML files and build the report.

        Populates self.qcdata (per-sample metrics) and self.qcml (metric
        metadata), then adds general-stats columns, an overview table and,
        when target-region metrics are present, coverage tables and plots.

        Raises:
            UserWarning: if no samples remain after parsing and filtering.
        """
        super(MultiqcModule, self).__init__(
            name='MappingQC',
            anchor='mappingqc',
            href="https://github.com/imgag/ngs-bits",
            info="calculates QC metrics based on mapped NGS reads.")

        # quality parameters from qcML with name, accession, description
        # NOTE(review): self.qcml appears to be populated as a side effect
        # of parse_qcml() below -- confirm against its definition.
        self.qcml = dict()
        # qc data for each sample
        self.qcdata = dict()
        # parse qcml files
        for f in self.find_log_files('mappingqc',
                                     filecontents=True,
                                     filehandles=False):
            self.add_data_source(f)
            s_name = self.clean_s_name(f['s_name'], f['root'])
            self.qcdata[s_name] = self.parse_qcml(f['f'])

        # ignore samples if requested
        self.qcdata = self.ignore_samples(self.qcdata)

        # warn if no samples found
        if len(self.qcdata) == 0:
            raise UserWarning

        # add bases usable key, derived from bases usable (MB)
        # (replaces the MB-scaled metric with an absolute base count)
        self.qcml.pop('bases usable (MB)')
        self.qcml['bases usable'] = dict()
        self.qcml['bases usable']['description'] = 'Bases sequenced in total.'
        for s, kv in self.qcdata.items():
            kv['bases usable'] = kv['bases usable (MB)'] * 1e6
            kv.pop('bases usable (MB)')

        # prepare table headers, use name and description from qcML
        headers = {
            qp_key: {
                'namespace': "MappingQC",
                'title': qp_key,
                'description': qp_entry['description'],
            }
            for qp_key, qp_entry in self.qcml.items()
        }

        # formatting for metrics expected in every run
        headers['trimmed base %'].update({
            'suffix': '%',
            'format': '{:,.2f}',
            'floor': 1,
            'scale': 'PuBu'
        })
        headers['clipped base %'].update({
            'suffix': '%',
            'format': '{:,.2f}',
            'floor': 1,
            'scale': 'PuRd'
        })
        headers['mapped read %'].update({
            'suffix': '%',
            'format': '{:,.2f}',
            'max': 100,
            'scale': 'Reds'
        })
        headers['bases usable'].update({
            'suffix': config.base_count_prefix,
            'format': '{:,.2f}',
            'modify': lambda x: x * config.base_count_multiplier,
            'scale': 'Greens'
        })
        # always available, even without target file
        headers['on-target read %'].update({
            'suffix': '%',
            'format': '{:,.2f}',
            'max': 100,
            'scale': 'Purples'
        })

        # only available if duplicates marked
        try:
            headers['duplicate read %'].update({
                'suffix': '%',
                'format': '{:,.2f}',
                'max': 100,
                'scale': 'YlOrRd'
            })
        except KeyError:
            pass

        # only available if paired-end
        try:
            headers['properly-paired read %'].update({
                'suffix': '%',
                'format': '{:,.2f}',
                'max': 100,
                'scale': 'GnBu'
            })
            headers['insert size'].update({
                'suffix': 'bp',
                'format': '{:,.2f}',
                'scale': 'RdYlGn'
            })
        except KeyError:
            pass

        # only available if human
        try:
            headers['SNV allele frequency deviation'].update({
                'suffix': '',
                'format': '{:,.2f}',
                'floor': 0,
                'ceiling': 10,
                'minRange': 10,
                'scale': 'Greys'
            })
        except KeyError:
            pass

        # only available if target file provided
        coverage_values = (10, 20, 30, 50, 100, 200, 500)
        try:
            headers['target region read depth'].update({
                'suffix': 'x',
                'format': '{:,.2f}'
            })
            for x in coverage_values:
                headers['target region {:d}x %'.format(x)]. \
                    update({'suffix': '%', 'format': '{:,.2f}', 'max': 100, 'scale': 'YlGn'})
        except KeyError:
            pass

        # general table: add read count and bases usable
        self.general_stats_addcols(
            self.qcdata,
            self.dict_ordered_subset(
                headers, ('bases usable', 'mapped read %', 'on-target read %',
                          'target region read depth')))

        # write full data set to file
        self.write_data_file(self.qcdata, 'multiqc_mappingqc')

        # table with general values
        self.add_section(
            name='Overview',
            anchor='mappingqc-general',
            description='',
            plot=table.plot(
                self.qcdata,
                self.dict_ordered_subset(
                    headers,
                    ('bases usable', 'on-target read %', 'mapped read %',
                     'properly-paired read %', 'trimmed base %',
                     'clipped base %', 'duplicate read %', 'insert size',
                     'SNV allele frequency deviation')),
                pconfig={'namespace': 'MappingQC'}))

        # coverage sections only make sense when a target file was provided
        if 'target region 10x %' in headers.keys():
            # table with coverage values
            self.add_section(
                name='Coverage',
                anchor='mappingqc-coverage',
                description='',
                plot=table.plot(
                    self.qcdata,
                    self.dict_ordered_subset(
                        headers,
                        ('target region read depth', 'target region 10x %',
                         'target region 20x %', 'target region 30x %',
                         'target region 50x %', 'target region 100x %',
                         'target region 200x %', 'target region 500x %')),
                    pconfig={'namespace': 'MappingQC'}))

            # bar plot with sequencing depth values
            self.add_section(
                name='Sequencing Depth',
                anchor='mappingqc-read-depth',
                description=self.make_description(['target region read depth'
                                                   ]),
                plot=bargraph.plot(
                    self.qcdata,
                    self.dict_ordered_subset(headers,
                                             ('target region read depth', )),
                    pconfig={
                        'namespace': 'MappingQC',
                        'id': 'mappingqc-read-depth-plot',
                        'title': 'MappingQC: Target Region Sequencing Depth',
                        'ylab': 'coverage',
                        'cpswitch': False,
                        'tt_decimals': 2,
                        'tt_suffix': 'x',
                        'tt_percentages': False
                    }))

            # bar plot with coverage values: one data-label tab per threshold
            self.add_section(
                name='Target Coverage',
                anchor='mappingqc-target-coverage',
                description='',
                plot=bargraph.plot([self.qcdata] * len(coverage_values),
                                   [{
                                       s: headers[s]
                                   } for s in [
                                       'target region {:d}x %'.format(x)
                                       for x in coverage_values
                                   ]],
                                   pconfig={
                                       'namespace':
                                       'MappingQC',
                                       'id':
                                       'mappingqc-target-coverage-plot',
                                       'title':
                                       'MappingQC: Target Coverage Percentage',
                                       'ylab':
                                       'target coverage percentage',
                                       'cpswitch':
                                       False,
                                       'data_labels': [
                                           '{:d}x coverage %'.format(x)
                                           for x in coverage_values
                                       ],
                                       'ymin':
                                       0,
                                       'ymax':
                                       100,
                                       'use_legend':
                                       False,
                                       'tt_decimals':
                                       2,
                                       'tt_suffix':
                                       '%',
                                       'tt_percentages':
                                       False
                                   }))
Example #54
0
def parse_reports(self):
    """ Find RSeQC junction_annotation reports and parse their data """

    self.junction_annotation_data = dict()

    # Regexes for the summary lines of junction_annotation.py output
    regexes = {
        "total_splicing_events": r"^Total splicing  Events:\s*(\d+)$",
        "known_splicing_events": r"^Known Splicing Events:\s*(\d+)$",
        "partial_novel_splicing_events":
        r"^Partial Novel Splicing Events:\s*(\d+)$",
        "novel_splicing_events": r"^Novel Splicing Events:\s*(\d+)$",
        "total_splicing_junctions": r"^Total splicing  Junctions:\s*(\d+)$",
        "known_splicing_junctions": r"^Known Splicing Junctions:\s*(\d+)$",
        "partial_novel_splicing_junctions":
        r"^Partial Novel Splicing Junctions:\s*(\d+)$",
        "novel_splicing_junctions": r"^Novel Splicing Junctions:\s*(\d+)$",
    }

    # Each (total key, dependent count keys) pair drives the pct derivation
    pct_groups = (
        ("total_splicing_events",
         ("known_splicing_events",
          "partial_novel_splicing_events",
          "novel_splicing_events")),
        ("total_splicing_junctions",
         ("known_splicing_junctions",
          "partial_novel_splicing_junctions",
          "novel_splicing_junctions")),
    )

    # Go through files and parse data using regexes
    for f in self.find_log_files("rseqc/junction_annotation"):
        parsed = dict()
        for key, regex in regexes.items():
            match = re.search(regex, f["f"], re.MULTILINE)
            if match:
                parsed[key] = int(match.group(1))

        # Derive percentages relative to the corresponding totals
        for total_key, count_keys in pct_groups:
            if total_key in parsed:
                total = float(parsed[total_key])
                for count_key in count_keys:
                    if count_key in parsed:
                        parsed[count_key + "_pct"] = (
                            float(parsed[count_key]) / total) * 100.0

        if len(parsed) > 0:
            if f["s_name"] in self.junction_annotation_data:
                log.debug(
                    "Duplicate sample name found! Overwriting: {}".format(
                        f["s_name"]))
            self.add_data_source(f, section="junction_annotation")
            self.junction_annotation_data[f["s_name"]] = parsed

    # Filter to strip out ignored sample names
    self.junction_annotation_data = self.ignore_samples(
        self.junction_annotation_data)

    # Nothing found: report zero samples to the parent module
    if len(self.junction_annotation_data) == 0:
        return 0

    # Write to file
    self.write_data_file(self.junction_annotation_data,
                         "multiqc_rseqc_junction_annotation")

    # Category maps: one for the junctions tab, one for the events tab
    junction_cats = OrderedDict()
    junction_cats["known_splicing_junctions"] = {
        "name": "Known Splicing Junctions"
    }
    junction_cats["partial_novel_splicing_junctions"] = {
        "name": "Partial Novel Splicing Junctions"
    }
    junction_cats["novel_splicing_junctions"] = {
        "name": "Novel Splicing Junctions"
    }
    event_cats = OrderedDict()
    event_cats["known_splicing_events"] = {"name": "Known Splicing Events"}
    event_cats["partial_novel_splicing_events"] = {
        "name": "Partial Novel Splicing Events"
    }
    event_cats["novel_splicing_events"] = {"name": "Novel Splicing Events"}

    pconfig = {
        "id": "rseqc_junction_annotation_junctions_plot",
        "title": "RSeQC: Splicing Junctions",
        "ylab": "% Junctions",
        "cpswitch_c_active": False,
        "data_labels": ["Junctions", "Events"],
    }
    self.add_section(
        name="Junction Annotation",
        anchor="rseqc_junction_annotation",
        description=
        '<a href="http://rseqc.sourceforge.net/#junction-annotation-py" target="_blank">Junction annotation</a>'
        " compares detected splice junctions to"
        " a reference gene model. An RNA read can be spliced 2"
        " or more times, each time is called a splicing event.",
        plot=bargraph.plot(
            [self.junction_annotation_data, self.junction_annotation_data],
            [junction_cats, event_cats], pconfig),
    )

    # Return number of samples found
    return len(self.junction_annotation_data)
Example #55
0
    def __init__(self):

        # Initialise the parent object
        super(MultiqcModule, self).__init__(
            name='Long Ranger',
            anchor='longranger',
            href="https://www.10xgenomics.com/",
            info=
            "A set of analysis pipelines that perform sample demultiplexing, "
            "barcode processing, alignment, quality control, variant calling, phasing, "
            "and structural variant calling.")

        self.headers = OrderedDict()
        self.headers['large_sv_calls'] = {
            'title': 'Large SVs',
            'description':
            'Large structural variants called by Longranger. Not including blacklisted regions.',
            'format': '{:,.0f}',
            'scale': 'PuRd'
        }
        self.headers['short_deletion_calls'] = {
            'title': 'Short dels',
            'description': 'Short deletions called by Longranger.',
            'format': '{:,.0f}',
            'scale': 'PuRd',
            'hidden': True
        }
        self.headers['genes_phased_lt_100kb'] = {
            'title': 'genes phased < 100kb',
            'description':
            'Percentage of genes shorter than 100kb with >1 heterozygous SNP that are phased into a single phase block.',
            'modify': lambda x: float(x) * 100.0,
            'suffix': '%',
            'scale': 'YlOrRd',
            'hidden': True
        }
        self.headers['longest_phase_block'] = {
            'title': 'Longest phased',
            'description': 'Size of the longest phase block, in base pairs',
            'scale': 'YlOrRd',
            'modify': lambda x: float(x) / 1000000.0,
            'suffix': 'Mbp',
            'hidden': True
        }
        self.headers['n50_phase_block'] = {
            'title': 'N50 phased',
            'description':
            'N50 length of the called phase blocks, in base pairs.',
            'modify': lambda x: float(x) / 1000000.0,
            'suffix': 'Mbp',
            'scale': 'YlOrRd',
            'hidden': True
        }
        self.headers['snps_phased'] = {
            'title': 'SNPs phased',
            'description': 'Percentage of called SNPs that were phased.',
            'modify': lambda x: float(x) * 100.0,
            'suffix': '%',
            'scale': 'PuRd',
            'hidden': True
        }
        self.headers['median_insert_size'] = {
            'title': 'Insert size',
            'description': 'Median insert size of aligned read pairs.',
            'format': '{:,.0f}',
            'suffix': 'bp',
            'scale': 'PuBu',
            'hidden': True
        }
        self.headers['on_target_bases'] = {
            'title': 'On target',
            'description':
            'Percentage of aligned bases mapped with the target regions in targeted mode. Only bases inside the intervals of target BED file are counted.',
            'suffix': '%',
            'modify': lambda x: 0 if x == "" else float(x) * 100.0,
            'scale': 'Greens'
        }
        self.headers['zero_coverage'] = {
            'title': 'Zero cov',
            'description':
            'Percentage of non-N bases in the genome with zero coverage.',
            'modify': lambda x: float(x) * 100.0,
            'suffix': '%',
            'max': 100.0,
            'min': 0.0,
            'scale': 'RdGy-rev'
        }
        self.headers['mean_depth'] = {
            'title': 'Depth',
            'description':
            'Mean read depth, including PCR duplicate reads. In WGS mode, this is measured across the genome; in targeted mode, this is the measure inside targeted regions.',
            'suffix': 'X',
            'scale': 'PuBu'
        }
        self.headers['pcr_duplication'] = {
            'title': 'PCR Dup',
            'description':
            'Percentage of reads marked as PCR duplicates. To be marked as PCR duplicates, reads must have the same mapping extents on the genome and the same 10x barcode.',
            'suffix': '%',
            'min': 15.0,
            'modify': lambda x: float(x) * 100.0,
            'scale': 'RdGy-rev',
            'hidden': True
        }
        self.headers['mapped_reads'] = {
            'title': 'Mapped',
            'modify': lambda x: float(x) * 100.0,
            'suffix': '%',
            'description':
            'Percentage of input reads that were mapped to the reference genome.',
            'scale': 'PuBu',
            'hidden': True
        }
        self.headers['number_reads'] = {
            'title': 'M Reads',
            'modify': lambda x: float(x) / 1000000.0,
            'description':
            'Total number of reads supplied to Long Ranger. (millions)',
            'scale': 'PuBu',
            'hidden': True
        }
        self.headers['molecule_length_mean'] = {
            'title': 'Mol size',
            'description':
            'The length-weighted mean input DNA length in base pairs.',
            'modify': lambda x: float(x) / 1000.0,
            'suffix': 'Kbp',
            'scale': 'YlGn'
        }
        self.headers['molecule_length_stddev'] = {
            'title': 'Mol stddev',
            'description':
            'The length-weighted standard deviation of the input DNA length distribution in base pairs.',
            'modify': lambda x: float(x) / 1000.0,
            'suffix': 'Kbp',
            'scale': 'YlGn',
            'hidden': True
        }
        self.headers['n50_linked_reads_per_molecule'] = {
            'title': 'N50 read per mol.',
            'description':
            'The N50 number of read-pairs per input DNA molecule. Half of read-pairs came from molecules with this many or greater read-pairs.',
            'scale': 'BuGn',
            'hidden': True
        }
        self.headers['r1_q30_bases_fract'] = {
            'title': '% R1 >= Q30',
            'description':
            'Percentage of bases in R1 with base quality >= 30.',
            'hidden': True,
            'suffix': '%',
            'modify': lambda x: float(x) * 100.0,
            'scale': 'Purples'
        }
        self.headers['r2_q30_bases_fract'] = {
            'title': '% R2 >= Q30',
            'description':
            'Percentage of bases in R2 with base quality >= 30.',
            'suffix': '%',
            'modify': lambda x: float(x) * 100.0,
            'scale': 'Purples',
            'hidden': True
        }
        self.headers['bc_on_whitelist'] = {
            'title': 'Valid BCs',
            'description':
            'The Percentage of reads that carried a valid 10x barcode sequence.',
            'modify': lambda x: float(x) * 100.0,
            'suffix': '%',
            'scale': 'BuPu',
            'hidden': True,
        }
        self.headers['bc_q30_bases_fract'] = {
            'title': 'BC Q30',
            'description':
            'Percentage of bases in the barcode with base quality >= 30.',
            'suffix': '%',
            'modify': lambda x: float(x) * 100.0,
            'scale': 'Purples',
            'hidden': True
        }
        self.headers['bc_mean_qscore'] = {
            'title': 'BC Qscore',
            'description': 'The mean base quality value on the barcode bases.',
            'scale': 'BuPu',
            'hidden': True
        }
        self.headers['mean_dna_per_gem'] = {
            'title': 'DNA per gem',
            'description':
            'The average number of base pairs of genomic DNA loaded into each GEM. This metric is based on the observed extents of read-pairs on each molecule.',
            'modify': lambda x: float(x) / 1000000.0,
            'suffix': 'Mbp',
            'scale': 'OrRd',
            'hidden': True
        }
        self.headers['gems_detected'] = {
            'title': 'M Gems',
            'description':
            'The number of Chromium GEMs that were collected and which generated a non-trivial number of read-pairs. (millions)',
            'modify': lambda x: float(x) / 1000000.0,
            'scale': 'OrRd',
        }
        self.headers['corrected_loaded_mass_ng'] = {
            'title': 'Loaded (corrected)',
            'description':
            'The estimated number of nanograms of DNA loaded into the input well of the Chromium chip. This metric is calculated by measuring the mean amount of DNA covered by input molecules in each GEM, then multiplying by the ratio of the chip input to the sample volume in each GEM.',
            'suffix': 'ng',
            'scale': 'RdYlGn'
        }
        self.headers['loaded_mass_ng'] = {
            'title': 'Loaded',
            'description':
            'This metric was found to overestimate the true loading by a factor of 1.6, due primarily to denaturation of the input DNA.',
            'suffix': 'ng',
            'scale': 'RdYlGn'
        }
        self.headers['instrument_ids'] = {
            'title': 'Instrument ID',
            'description':
            'The list of instrument IDs used to generate the input reads.',
            'scale': False,
            'hidden': True
        }
        self.headers['longranger_version'] = {
            'title': 'Long Ranger Version',
            'description':
            'The version of the Longranger software used to generate the results.',
            'scale': False
        }

        ### Parse the data
        self.longranger_data = dict()
        self.paths_dict = dict()
        for f in self.find_log_files('longranger/invocation'):
            sid = self.parse_invocation(f['f'])
            self.paths_dict[os.path.basename(f['root'])] = sid

        running_name = 1
        for f in self.find_log_files('longranger/summary'):
            data = self.parse_summary(f['f'])
            updir, _ = os.path.split(f['root'])
            base_updir = os.path.basename(updir)
            sid = 'longranger#{}'.format(running_name)
            if base_updir in self.paths_dict.keys():
                sid = self.paths_dict[base_updir]
            else:
                log.debug('Did not find _invocation file: {}'.format(f['fn']))
                running_name += 1

            self.longranger_data[sid] = data

        # Filter to strip out ignored sample names
        self.longranger_data = self.ignore_samples(self.longranger_data)

        if len(self.longranger_data) == 0:
            raise UserWarning
        log.info("Found {} reports".format(len(self.longranger_data.keys())))

        # Write parsed report data to a file
        self.write_data_file(self.longranger_data, 'multiqc_longranger')

        # Add a longranger versions column if not all the same
        longranger_versions = set(
            [d['longranger_version'] for d in self.longranger_data.values()])
        version_str = ''
        if len(longranger_versions) == 1:
            version_str = " All samples were processed using Longranger version {}".format(
                list(longranger_versions)[0])
            del (self.headers['longranger_version'])

        ### Write the table
        config_table = {'id': 'longranger_table', 'namespace': 'longranger'}
        self.add_section (
            name = 'Run stats',
            anchor = 'longranger-run-stats',
            description = 'Statistics gathered from Longranger reports. ' \
                    'There are more columns available but they are hidden by default.' + version_str,
            helptext = '''Parses the files `summary.csv` and `_invocation` found in the
                    output directory of Longranger. If `_invocation` is not found
                    the sample IDs will be missing and they will be given a running
                    number. E.g., `longranger#1` and `longranger#2`.''',
            plot = table.plot(self.longranger_data, self.headers, config_table)
        )

        ### Bar plot of phasing stats
        snps_phased_pct = {}
        genes_phased_pct = {}
        for s_name in self.longranger_data:
            snps_phased_pct[s_name] = {
                'snps_phased_pct':
                float(self.longranger_data[s_name]['snps_phased']) * 100.0
            }
            genes_phased_pct[s_name] = {
                'genes_phased_pct':
                float(self.longranger_data[s_name]['genes_phased_lt_100kb']) *
                100.0
            }
        phase_plot_cats = [OrderedDict(), OrderedDict(), OrderedDict()]
        phase_plot_cats[0]['longest_phase_block'] = {
            'name': 'Longest Phase Block'
        }
        phase_plot_cats[0]['n50_phase_block'] = {'name': 'N50 of Phase Blocks'}
        phase_plot_cats[1]['snps_phased_pct'] = {'name': '% SNPs Phased'}
        phase_plot_cats[2]['genes_phased_pct'] = {
            'name': '% Genes < 100kbp in a single phase block'
        }
        self.add_section(
            name='Phasing',
            anchor='longranger-phasing',
            description=
            'Phasing performance from Long Ranger. Genes are only considered if &le; 100kbp in length and with at least one heterozygous SNP.',
            helptext='''
                    * Longest phased
                        * Size of the longest phase block, in base pairs
                    * N50 phased
                        * N50 length of the called phase blocks, in base pairs.
                    * % SNPs phased
                        * Percentage of called SNPs that were phased.
                    * % Genes Phased
                        * Percentage of genes shorter than 100kb with >1 heterozygous SNP that are phased into a single phase block.
                    ''',
            plot=bargraph.plot(
                [self.longranger_data, snps_phased_pct, genes_phased_pct],
                phase_plot_cats, {
                    'id':
                    'longranger-phasing-plot',
                    'title':
                    'Long Ranger: Phasing Statistics',
                    'data_labels': [{
                        'name': 'N50 Phased',
                        'ylab': 'N50 of called phase blocks (bp)'
                    }, {
                        'name': '% SNPs Phased',
                        'ylab': '% SNPs Phased',
                        'ymax': 100
                    }, {
                        'name': '% Genes Phased',
                        'ylab': '% Genes Phased',
                        'ymax': 100
                    }],
                    'cpswitch':
                    False,
                    'stacking':
                    None,
                    'ylab':
                    'N50 of called phase blocks (bp)'
                }))

        ### Bar plot of mapping statistics
        mapping_counts_data = {}
        for s_name in self.longranger_data:
            mapped_reads = float(
                self.longranger_data[s_name]['number_reads']) * float(
                    self.longranger_data[s_name]['mapped_reads'])
            unmapped_reads = float(
                self.longranger_data[s_name]['number_reads']) - mapped_reads
            dup_reads = mapped_reads * float(
                self.longranger_data[s_name]['pcr_duplication'])
            unique_reads = mapped_reads - dup_reads
            mapping_counts_data[s_name] = {
                'unique_reads': unique_reads,
                'dup_reads': dup_reads,
                'unmapped_reads': unmapped_reads
            }
        mapping_counts_cats = OrderedDict()
        mapping_counts_cats['unique_reads'] = {
            'name': 'Uniquely Aligned Reads',
            'color': '#437bb1'
        }
        mapping_counts_cats['dup_reads'] = {
            'name': 'PCR Duplicate Aligned Reads',
            'color': '#7cb5ec'
        }
        mapping_counts_cats['unmapped_reads'] = {
            'name': 'Unaligned Reads',
            'color': '#7f0000'
        }
        self.add_section(
            name='Alignment',
            anchor='longranger-alignment',
            description=
            'Long Ranger alignment against the reference genome. To be marked as PCR duplicates, reads must have the same mapping extents on the genome and the same 10x barcode.',
            plot=bargraph.plot(
                mapping_counts_data, mapping_counts_cats, {
                    'id': 'longranger-alignment-plot',
                    'title': 'Long Ranger: Alignment Statistics',
                    'ylab': 'Reads Counts',
                    'cpswitch_counts_label': 'Read Counts',
                }))
Example #56
0
    def top_five_barplot(self):
        """Add a bar plot showing the top-5 from each taxa rank.

        For every rank code in ``self.t_ranks``, the five taxa with the
        highest percentage summed across all samples are plotted per sample,
        together with an "Unclassified" bar and an "Other" remainder
        (sample total read count minus everything already shown).
        """

        pd = []  # one {sample: {taxon: count}} dict per rank
        cats = list()  # matching category definitions per rank
        pconfig = {
            "id": "kraken-topfive-plot",
            "title": "Kraken 2: Top taxa",
            "ylab": "Number of fragments",
            "data_labels": list(self.t_ranks.values()),
        }

        for rank_code, rank_name in self.t_ranks.items():
            rank_cats = OrderedDict()
            rank_data = dict()

            # Loop through the summed tax percentages to get the top 5 across all samples
            try:
                sorted_pct = sorted(self.kraken_total_pct[rank_code].items(),
                                    key=lambda x: x[1],
                                    reverse=True)
            except KeyError:
                # Taxa rank not seen in any sample
                continue
            counts_shown = {}
            for classif, pct_sum in sorted_pct[:5]:
                rank_cats[classif] = {"name": classif}
                # Pull out counts for this rank + classif from each sample
                for s_name, d in self.kraken_raw_data.items():
                    if s_name not in rank_data:
                        rank_data[s_name] = dict()
                    if s_name not in counts_shown:
                        counts_shown[s_name] = 0
                    for row in d:
                        if row["rank_code"] == rank_code and row[
                                "classif"] == classif:
                            if classif not in rank_data[s_name]:
                                rank_data[s_name][classif] = 0
                            rank_data[s_name][classif] += row["counts_rooted"]
                            counts_shown[s_name] += row["counts_rooted"]

            # Add in unclassified reads and "other" - we presume from other species etc.
            for s_name, d in self.kraken_raw_data.items():
                # Make sure entries exist even when the top-5 loop added
                # nothing for this sample (e.g. an empty top-5 list) -
                # previously this could raise a KeyError.
                rank_data.setdefault(s_name, dict())
                counts_shown.setdefault(s_name, 0)
                for row in d:
                    if row["rank_code"] == "U":
                        rank_data[s_name]["U"] = row["counts_rooted"]
                        counts_shown[s_name] += row["counts_rooted"]
                rank_data[s_name]["other"] = (
                    self.kraken_sample_total_readcounts[s_name] -
                    counts_shown[s_name])

                # This should never happen... But it does sometimes if the total read count is a bit off
                if rank_data[s_name]["other"] < 0:
                    log.debug(
                        "Found negative 'other' count for {} ({}): {}".format(
                            s_name, self.t_ranks[rank_code],
                            rank_data[s_name]["other"]))
                    rank_data[s_name]["other"] = 0

            rank_cats["other"] = {"name": "Other", "color": "#cccccc"}
            rank_cats["U"] = {"name": "Unclassified", "color": "#d4949c"}

            cats.append(rank_cats)
            pd.append(rank_data)

        self.add_section(
            name="Top taxa",
            anchor="kraken-topfive",
            description=
            "The number of reads falling into the top 5 taxa across different ranks.",
            helptext="""
                To make this plot, the percentage of each sample assigned to a given taxa is summed across all samples.
                The counts for these top five taxa are then plotted for each of the 9 different taxa ranks.
                The unclassified count is always shown across all taxa ranks.

                The total number of reads is approximated by dividing the number of `unclassified` reads by the percentage of
                the library that they account for.
                Note that this is only an approximation, and that kraken percentages don't always add to exactly 100%.

                The category _"Other"_ shows the difference between the above total read count and the sum of the read counts
                in the top 5 taxa shown + unclassified. This should cover all taxa _not_ in the top 5, +/- any rounding errors.

                Note that any taxon that does not exactly fit a taxon rank (eg. `-` or `G2`) is ignored.
            """,
            plot=bargraph.plot(pd, cats, pconfig),
        )
Example #57
0
def parse_reports(self):
    """Find RSeQC read_distribution reports and parse their data.

    Populates ``self.read_dist``, writes the parsed data to file, adds a
    stacked bar graph section, and returns the number of samples found.
    """

    # Set up vars
    self.read_dist = dict()
    first_regexes = {
        "total_reads": r"Total Reads\s+(\d+)\s*",
        "total_tags": r"Total Tags\s+(\d+)\s*",
        "total_assigned_tags": r"Total Assigned Tags\s+(\d+)\s*",
    }
    second_regexes = {
        "cds_exons": r"CDS_Exons\s+(\d+)\s+(\d+)\s+([\d\.]+)\s*",
        "5_utr_exons": r"5'UTR_Exons\s+(\d+)\s+(\d+)\s+([\d\.]+)\s*",
        "3_utr_exons": r"3'UTR_Exons\s+(\d+)\s+(\d+)\s+([\d\.]+)\s*",
        "introns": r"Introns\s+(\d+)\s+(\d+)\s+([\d\.]+)\s*",
        "tss_up_1kb": r"TSS_up_1kb\s+(\d+)\s+(\d+)\s+([\d\.]+)\s*",
        "tss_up_5kb": r"TSS_up_5kb\s+(\d+)\s+(\d+)\s+([\d\.]+)\s*",
        "tss_up_10kb": r"TSS_up_10kb\s+(\d+)\s+(\d+)\s+([\d\.]+)\s*",
        "tes_down_1kb": r"TES_down_1kb\s+(\d+)\s+(\d+)\s+([\d\.]+)\s*",
        "tes_down_5kb": r"TES_down_5kb\s+(\d+)\s+(\d+)\s+([\d\.]+)\s*",
        "tes_down_10kb": r"TES_down_10kb\s+(\d+)\s+(\d+)\s+([\d\.]+)\s*",
    }

    # Go through files and parse data using regexes
    for f in self.find_log_files("rseqc/read_distribution"):
        d = dict()
        for k, r in first_regexes.items():
            r_search = re.search(r, f["f"], re.MULTILINE)
            if r_search:
                d[k] = int(r_search.group(1))
        for k, r in second_regexes.items():
            r_search = re.search(r, f["f"], re.MULTILINE)
            if r_search:
                d["{}_total_bases".format(k)] = int(r_search.group(1))
                d["{}_tag_count".format(k)] = int(r_search.group(2))
                d["{}_tags_kb".format(k)] = float(r_search.group(3))

        # Only derive the intergenic count when both totals parsed.
        # Previously this was unguarded and raised a KeyError on
        # malformed reports where the regexes did not match.
        if "total_tags" in d and "total_assigned_tags" in d:
            d["other_intergenic_tag_count"] = d["total_tags"] - d[
                "total_assigned_tags"]

        # Calculate some percentages for parsed file
        if "total_tags" in d:
            t = float(d["total_tags"])
            pcts = dict()
            for k in d:
                if k.endswith("_tag_count"):
                    pk = "{}_tag_pct".format(k[:-10])
                    pcts[pk] = (float(d[k]) / t) * 100.0
            d.update(pcts)

        if len(d) > 0:
            if f["s_name"] in self.read_dist:
                log.debug(
                    "Duplicate sample name found! Overwriting: {}".format(
                        f["s_name"]))
            self.add_data_source(f, section="read_distribution")
            self.read_dist[f["s_name"]] = d

    # Filter to strip out ignored sample names
    self.read_dist = self.ignore_samples(self.read_dist)

    if len(self.read_dist) > 0:

        # Write to file
        self.write_data_file(self.read_dist, "multiqc_rseqc_read_distribution")

        # Plot bar graph of groups
        keys = OrderedDict()
        keys["cds_exons_tag_count"] = {"name": "CDS_Exons"}
        keys["5_utr_exons_tag_count"] = {"name": "5'UTR_Exons"}
        keys["3_utr_exons_tag_count"] = {"name": "3'UTR_Exons"}
        keys["introns_tag_count"] = {"name": "Introns"}
        keys["tss_up_1kb_tag_count"] = {"name": "TSS_up_1kb"}
        keys["tss_up_5kb_tag_count"] = {"name": "TSS_up_5kb"}
        keys["tss_up_10kb_tag_count"] = {"name": "TSS_up_10kb"}
        keys["tes_down_1kb_tag_count"] = {"name": "TES_down_1kb"}
        keys["tes_down_5kb_tag_count"] = {"name": "TES_down_5kb"}
        keys["tes_down_10kb_tag_count"] = {"name": "TES_down_10kb"}
        keys["other_intergenic_tag_count"] = {"name": "Other_intergenic"}

        # Config for the plot
        pconfig = {
            "id": "rseqc_read_distribution_plot",
            "title": "RSeQC: Read Distribution",
            "ylab": "# Tags",
            "cpswitch_counts_label": "Number of Tags",
            "cpswitch_c_active": False,
        }

        self.add_section(
            name="Read Distribution",
            anchor="rseqc-read_distribution",
            description=
            '<a href="http://rseqc.sourceforge.net/#read-distribution-py" target="_blank">Read Distribution</a>'
            " calculates how mapped reads are distributed over genome features.",
            plot=bargraph.plot(self.read_dist, keys, pconfig),
        )

    # Return number of samples found
    return len(self.read_dist)
Example #58
0
def parse_reports(self):
    """Find Qualimap RNASeq reports and parse their data.

    Populates ``self.qualimap_rnaseq_genome_results`` and
    ``self.qualimap_rnaseq_cov_hist``, adds two report sections plus two
    general-stats columns, and returns the number of genome-results
    reports found.
    """

    self.qualimap_rnaseq_genome_results = dict()
    regexes = {
        'reads_aligned': r"read(?:s| pairs) aligned\s*=\s*([\d,]+)",
        'total_alignments': r"total alignments\s*=\s*([\d,]+)",
        'non_unique_alignments': r"non-unique alignments\s*=\s*([\d,]+)",
        'reads_aligned_genes': r"aligned to genes\s*=\s*([\d,]+)",
        'ambiguous_alignments': r"ambiguous alignments\s*=\s*([\d,]+)",
        'not_aligned': r"not aligned\s*=\s*([\d,]+)",
        '5_3_bias': r"5'-3' bias\s*=\s*(\d+\.\d+)",
        'reads_aligned_exonic': r"exonic\s*=\s*([\d,]+)",
        'reads_aligned_intronic': r"intronic\s*=\s*([\d,]+)",
        'reads_aligned_intergenic': r"intergenic\s*=\s*([\d,]+)",
        'reads_aligned_overlapping_exon': r"overlapping exon\s*=\s*([\d,]+)",
    }
    for f in self.find_log_files('qualimap/rnaseq/rnaseq_results'):
        d = dict()

        # Get the sample name
        s_name_regex = re.search(r"bam file\s*=\s*(.+)", f['f'], re.MULTILINE)
        if s_name_regex:
            d['bam_file'] = s_name_regex.group(1)
            s_name = self.clean_s_name(d['bam_file'], f['root'])
        else:
            # log.warn is a deprecated alias - use log.warning
            log.warning(
                "Couldn't find an input filename in genome_results file {}/{}".
                format(f['root'], f['fn']))
            # Skip only this file; a bare `return` here would silently
            # abort parsing of every remaining report.
            continue

        # Check for and 'fix' European style decimal places / thousand separators
        comma_regex = re.search(r"exonic\s*=\s*[\d\.]+ \(\d{1,3},\d+%\)",
                                f['f'], re.MULTILINE)
        if comma_regex:
            log.debug(
                "Trying to fix European comma style syntax in Qualimap report {}/{}"
                .format(f['root'], f['fn']))
            f['f'] = f['f'].replace('.', '')
            f['f'] = f['f'].replace(',', '.')

        # Go through all numeric regexes
        for k, r in regexes.items():
            r_search = re.search(r, f['f'], re.MULTILINE)
            if r_search:
                try:
                    d[k] = float(r_search.group(1).replace(',', ''))
                except UnicodeEncodeError:
                    # Qualimap reports infinity (\u221e) when 3' bias denominator is zero
                    pass
                except ValueError:
                    # Keep the raw string when it isn't a plain number
                    d[k] = r_search.group(1)

        # Add to general stats table
        for k in ['5_3_bias', 'reads_aligned']:
            try:
                self.general_stats_data[s_name][k] = d[k]
            except KeyError:
                pass

        # Save results
        if s_name in self.qualimap_rnaseq_genome_results:
            log.debug(
                "Duplicate genome results sample name found! Overwriting: {}".
                format(s_name))
        self.qualimap_rnaseq_genome_results[s_name] = d
        self.add_data_source(f, s_name=s_name, section='rna_genome_results')

    #### Coverage profile
    self.qualimap_rnaseq_cov_hist = dict()
    for f in self.find_log_files('qualimap/rnaseq/coverage', filehandles=True):
        s_name = self.get_s_name(f)
        d = dict()
        for line in f['f']:
            if line.startswith('#'):
                continue
            coverage, count = line.split(None, 1)
            coverage = int(round(float(coverage)))
            count = float(count)
            d[coverage] = count

        if len(d) == 0:
            log.debug(
                "Couldn't parse contents of coverage histogram file {}".format(
                    f['fn']))
            # Move on to the next file rather than aborting everything
            continue

        # Save results
        if s_name in self.qualimap_rnaseq_cov_hist:
            log.debug(
                "Duplicate coverage histogram sample name found! Overwriting: {}"
                .format(s_name))
        self.qualimap_rnaseq_cov_hist[s_name] = d
        self.add_data_source(f,
                             s_name=s_name,
                             section='rna_coverage_histogram')

    # Filter to strip out ignored sample names
    self.qualimap_rnaseq_genome_results = self.ignore_samples(
        self.qualimap_rnaseq_genome_results)
    self.qualimap_rnaseq_cov_hist = self.ignore_samples(
        self.qualimap_rnaseq_cov_hist)

    #### Plots

    # Genomic Origin Bar Graph
    # NB: Ignore 'Overlapping Exon' in report - these make the numbers add up to > 100%
    if len(self.qualimap_rnaseq_genome_results) > 0:
        gorigin_cats = OrderedDict()
        gorigin_cats['reads_aligned_exonic'] = {'name': 'Exonic'}
        gorigin_cats['reads_aligned_intronic'] = {'name': 'Intronic'}
        gorigin_cats['reads_aligned_intergenic'] = {'name': 'Intergenic'}
        gorigin_pconfig = {
            'id': 'qualimap_genomic_origin',
            'title': 'Qualimap RNAseq: Genomic Origin',
            'ylab': 'Number of reads',
            'cpswitch_c_active': False
        }
        genomic_origin_helptext = '''
        There are currently three main approaches to map reads to transcripts in an
        RNA-seq experiment: mapping reads to a reference genome to identify expressed
        transcripts that are annotated (and discover those that are unknown), mapping
        reads to a reference transcriptome, and <i>de novo</i> assembly of transcript
        sequences (<a href="https://doi.org/10.1186/s13059-016-0881-8"
        target="_blank">Conesa et al. 2016</a>).

        For RNA-seq QC analysis, QualiMap can be used to assess alignments produced by
        the first of these approaches. For input, it requires a GTF annotation file
        along with a reference genome, which can be used to reconstruct the exon
        structure of known transcripts. This allows mapped reads to be grouped by
        whether they originate in an exonic region (for QualiMap, this may include
        5&#8242; and 3&#8242; UTR regions as well as protein-coding exons), an intron,
        or an intergenic region (see the <a href="http://qualimap.bioinfo.cipf.es/doc_html/index.html"
        target="_blank">Qualimap 2 documentation</a>).

        The inferred genomic origins of RNA-seq reads are presented here as a bar graph
        showing either the number or percentage of mapped reads in each read dataset
        that have been assigned to each type of genomic region. This graph can be used
        to assess the proportion of useful reads in an RNA-seq experiment. That
        proportion can be reduced by the presence of intron sequences, especially if
        depletion of ribosomal RNA was used during sample preparation (<a href="https://doi.org/10.1038/nrg3642"
        target="_blank">Sims et al. 2014</a>). It can also be reduced by off-target
        transcripts, which are detected in greater numbers at the sequencing depths
        needed to detect poorly-expressed transcripts (<a href="https://doi.org/10.1101/gr.124321.111"
        target="_blank">Tarazona et al. 2011</a>).'''
        self.add_section(
            name='Genomic origin of reads',
            anchor='qualimap-reads-genomic-origin',
            description=
            'Classification of mapped reads as originating in exonic, intronic or intergenic regions. These can be displayed as either the number or percentage of mapped reads.',
            helptext=genomic_origin_helptext,
            plot=bargraph.plot(self.qualimap_rnaseq_genome_results,
                               gorigin_cats, gorigin_pconfig))

    if len(self.qualimap_rnaseq_cov_hist) > 0:
        coverage_profile_helptext = '''
        There are currently three main approaches to map reads to transcripts in an
        RNA-seq experiment: mapping reads to a reference genome to identify expressed
        transcripts that are annotated (and discover those that are unknown), mapping
        reads to a reference transcriptome, and <i>de novo</i> assembly of transcript
        sequences (<a href="https://doi.org/10.1186/s13059-016-0881-8"
        target="_blank">Conesa et al. 2016</a>).

        For RNA-seq QC analysis, QualiMap can be used to assess alignments produced by
        the first of these approaches. For input, it requires a GTF annotation file
        along with a reference genome, which can be used to reconstruct the exon
        structure of known transcripts. QualiMap uses this information to calculate the
        depth of coverage along the length of each annotated transcript. For a set of
        reads mapped to a transcript, the depth of coverage at a given base position is
        the number of high-quality reads that map to the transcript at that position
        (<a href="https://doi.org/10.1038/nrg3642" target="_blank">Sims et al. 2014</a>).

        QualiMap calculates coverage depth at every base position of each annotated
        transcript. To enable meaningful comparison between transcripts, base positions
        are rescaled to relative positions expressed as percentage distance along each
        transcript (*0%, 1%, &#8230;, 99%*). For the set of transcripts with at least
        one mapped read, QualiMap plots the cumulative mapped-read depth (y-axis) at
        each relative transcript position (x-axis). This plot shows the gene coverage
        profile across all mapped transcripts for each read dataset. It provides a
        visual way to assess positional biases, such as an accumulation of mapped reads
        at the 3&#8242; end of transcripts, which may indicate poor RNA quality in the
        original sample (<a href="https://doi.org/10.1186/s13059-016-0881-8"
        target="_blank">Conesa et al. 2016</a>).'''
        self.add_section(
            name='Gene Coverage Profile',
            anchor='qualimap-genome-fraction-coverage',
            description=
            'Mean distribution of coverage depth across the length of all mapped transcripts.',
            helptext=coverage_profile_helptext,
            plot=linegraph.plot(
                self.qualimap_rnaseq_cov_hist, {
                    'id': 'qualimap_gene_coverage_profile',
                    'title':
                    'Qualimap RNAseq: Coverage Profile Along Genes (total)',
                    'ylab': 'Coverage',
                    'xlab': 'Transcript Position (%)',
                    'ymin': 0,
                    'xmin': 0,
                    'xmax': 100,
                    'tt_label': '<b>{point.x} bp</b>: {point.y:.0f}%',
                }))

    #### General Stats
    self.general_stats_headers['5_3_bias'] = {
        'title': "5'-3' bias",
        'format': '{:,.2f}',
    }
    self.general_stats_headers['reads_aligned'] = {
        'title': '{} Aligned'.format(config.read_count_prefix),
        'description': 'Reads Aligned ({})'.format(config.read_count_desc),
        'min': 0,
        'scale': 'RdBu',
        'shared_key': 'read_count',
        'modify': lambda x: x * config.read_count_multiplier
    }

    # Return the number of reports we found
    return len(self.qualimap_rnaseq_genome_results.keys())
Example #59
0
 def plot_bargraph (self, data, cats=None, pconfig=None):
     """ Deprecated function. Forwards to new location. """
     # NOTE: the old default `pconfig={}` was a shared mutable default -
     # any mutation of the config by the plotting code leaked between
     # calls. Pass a fresh dict per call instead.
     from multiqc.plots import bargraph
     return bargraph.plot(data, cats, pconfig if pconfig is not None else {})
Example #60
0
    def bowtie2_alignment_plot(self):
        """ Make the HighCharts HTML to plot the alignment rates """

        # Show a note when any sample carries halved single-mate counts,
        # so readers know why those numbers look smaller than expected.
        halved_fields = ('paired_aligned_mate_one_halved',
                         'paired_aligned_mate_multi_halved',
                         'paired_aligned_mate_none_halved')
        half_warning = ''
        if any(field in sample_data
               for sample_data in self.bowtie2_data.values()
               for field in halved_fields):
            half_warning = '<em>Please note that single mate alignment counts are halved to tally with pair counts properly.</em>'
        description_text = 'This plot shows the number of reads aligning to the reference in different ways.'

        # Config for the plot
        config = {
            'ylab': '# Reads',
            'cpswitch_counts_label': 'Number of Reads'
        }

        # Two plots, don't mix SE with PE
        if self.num_se > 0:
            sekeys = OrderedDict([
                ('unpaired_aligned_one',
                 {'color': '#20568f', 'name': 'SE mapped uniquely'}),
                ('unpaired_aligned_multi',
                 {'color': '#f7a35c', 'name': 'SE multimapped'}),
                ('unpaired_aligned_none',
                 {'color': '#981919', 'name': 'SE not aligned'}),
            ])
            config['id'] = 'bowtie2_se_plot'
            config['title'] = 'Bowtie 2: SE Alignment Scores'
            self.add_section(
                description=description_text,
                helptext='''
                There are 3 possible types of alignment:
                * **SE Mapped uniquely**: Read has only one occurence in the reference genome.
                * **SE Multimapped**: Read has multiple occurence.
                * **SE No aligned**: Read has no occurence.
                ''',
                plot=bargraph.plot(self.bowtie2_data, sekeys, config))

        if self.num_pe > 0:
            pekeys = OrderedDict([
                ('paired_aligned_one',
                 {'color': '#20568f', 'name': 'PE mapped uniquely'}),
                ('paired_aligned_discord_one',
                 {'color': '#5c94ca', 'name': 'PE mapped discordantly uniquely'}),
                ('paired_aligned_mate_one_halved',
                 {'color': '#95ceff', 'name': 'PE one mate mapped uniquely'}),
                ('paired_aligned_multi',
                 {'color': '#f7a35c', 'name': 'PE multimapped'}),
                ('paired_aligned_discord_multi',
                 {'color': '#dce333', 'name': 'PE discordantly multimapped'}),
                ('paired_aligned_mate_multi_halved',
                 {'color': '#ffeb75', 'name': 'PE one mate multimapped'}),
                ('paired_aligned_mate_none_halved',
                 {'color': '#981919', 'name': 'PE neither mate aligned'}),
            ])
            config['id'] = 'bowtie2_pe_plot'
            config['title'] = 'Bowtie 2: PE Alignment Scores'
            self.add_section(
                description="<br>".join([description_text, half_warning]),
                helptext='''
                There are 6 possible types of alignment:
                * **PE mapped uniquely**: Pair has only one occurence in the reference genome.
                * **PE mapped discordantly uniquely**: Pair has only one occurence but not in proper pair.
                * **PE one mate mapped uniquely**: One read of a pair has one occurence.
                * **PE multimapped**: Pair has multiple occurence.
                * **PE one mate multimapped**: One read of a pair has multiple occurence.
                * **PE neither mate aligned**: Pair has no occurence.
                ''',
                plot=bargraph.plot(self.bowtie2_data, pekeys, config))