def __init__(self, c_id, mod):
    """Build a MultiQC report section for a single custom-content item.

    c_id: content ID, used for default section name and anchor.
    mod: dict with 'config' (display / plot options) and 'data' (plot data).
    """
    modname = mod['config'].get('section_name', c_id.replace('_', ' ').title())
    if not modname:  # covers both '' and None
        modname = 'Custom Content'

    # Initialise the parent object
    super(MultiqcModule, self).__init__(
        name=modname,
        anchor=mod['config'].get('section_anchor', c_id),
        href=mod['config'].get('section_href'),
        info=mod['config'].get('description'),
    )

    pconfig = mod['config'].get('pconfig', {})
    if pconfig.get('title') is None:
        pconfig['title'] = modname

    # Hoist the repeated config lookup used by every branch below
    plot_type = mod['config'].get('plot_type')

    # Table
    if plot_type == 'table':
        pconfig['sortRows'] = pconfig.get('sortRows', False)
        headers = mod['config'].get('headers')
        self.add_section(plot=table.plot(mod['data'], headers, pconfig))
        self.write_data_file(mod['data'], "multiqc_{}".format(modname.lower().replace(' ', '_')))
    # Bar plot
    elif plot_type == 'bargraph':
        self.add_section(plot=bargraph.plot(mod['data'], mod['config'].get('categories'), pconfig))
    # Line plot
    elif plot_type == 'linegraph':
        self.add_section(plot=linegraph.plot(mod['data'], pconfig))
    # Scatter plot
    elif plot_type == 'scatter':
        self.add_section(plot=scatter.plot(mod['data'], pconfig))
    # Heatmap
    elif plot_type == 'heatmap':
        self.add_section(plot=heatmap.plot(mod['data'], mod['config'].get('xcats'), mod['config'].get('ycats'), pconfig))
    # Beeswarm plot
    elif plot_type == 'beeswarm':
        self.add_section(plot=beeswarm.plot(mod['data'], pconfig))
    # Raw HTML / raw image file as HTML — both embed the data as-is,
    # so the two identical branches are merged
    elif plot_type in ('html', 'image'):
        self.add_section(content=mod['data'])
    # Not supplied
    elif plot_type is None:
        log.warning("Plot type not found for content ID '{}'".format(c_id))
    # Not recognised — the original message string was broken across a line
    # break mid-literal; rejoined into a single valid string here
    else:
        log.warning("Error - custom content plot type '{}' not recognised for content ID {}".format(plot_type, c_id))
def parse_plotCorrelation(self):
    """Find plotCorrelation output and add a pairwise correlation heatmap.

    Returns the number of samples parsed, or None if nothing usable was found.
    """
    self.deeptools_plotCorrelationData = dict()
    samples = []  # guard against NameError when no log files are found
    for f in self.find_log_files('deeptools/plotCorrelationData', filehandles=False):
        parsed_data, samples = self.parsePlotCorrelationData(f)
        for k, v in parsed_data.items():
            if k in self.deeptools_plotCorrelationData:
                log.warning("Replacing duplicate sample {}.".format(k))
            self.deeptools_plotCorrelationData[k] = v
        if len(parsed_data) > 0:
            self.add_data_source(f, section='plotCorrelation')

    # Respect user-configured ignored samples (matches the behaviour of the
    # other parsers in this file — previously missing here)
    self.deeptools_plotCorrelationData = self.ignore_samples(self.deeptools_plotCorrelationData)

    if len(self.deeptools_plotCorrelationData) > 0:
        config = {
            'id': 'deeptools_correlation_plot',
            'title': 'deeptools: Correlation Plot',
        }
        # Keep the sample order from the parsed matrix; skip ignored samples
        data = []
        for s_name in samples:
            try:
                data.append(self.deeptools_plotCorrelationData[s_name])
            except KeyError:
                pass
        if len(data) == 0:
            log.debug('No valid data for correlation plot')
            return None
        self.add_section(
            name="Correlation heatmap",
            anchor="deeptools_correlation",
            description="Pairwise correlations of samples based on distribution of sequence reads",
            plot=heatmap.plot(data, samples, samples, config)
        )
    return len(self.deeptools_plotCorrelationData)
def parse_relatedness2(self):
    """Parse vcftools relatedness2 matrices and plot one heatmap per input file.

    Returns the number of valid matrices found.
    """
    matrices = {}
    for f in self.find_log_files('vcftools/relatedness2', filehandles=True):
        m = _Relatedness2Matrix(f)
        # Only keep matrices that parsed completely
        if m.data and m.x_labels and m.y_labels:
            matrices[f['s_name']] = m
    matrices = self.ignore_samples(matrices)
    # Bail out early — avoids logging "Found 0 valid relatedness2 matrices"
    # when there is nothing to report (matches the other parsers here)
    if len(matrices) == 0:
        return 0
    log.info('Found %s valid relatedness2 matrices', len(matrices))
    helptext = '''
    `RELATEDNESS_PHI` gives a relatedness score between two samples.
    A higher score indicates a higher degree of relatedness, up to a maximum of 0.5.
    Samples are sorted alphabetically on each axis, and specific IDs can be found
    in the graph with the Highlight tab.
    '''
    for name, m in matrices.items():
        self.add_section(
            name='Vcftools relatedness2',
            anchor='vcftools_relatedness2',
            description="**Input:** `{}`.\n\n Heatmap of `RELATEDNESS_PHI` values from the output of vcftools relatedness2.".format(name),
            helptext=helptext,
            plot=heatmap.plot(
                m.data,
                xcats=m.x_labels,
                ycats=m.y_labels,
                pconfig={'square': True, 'decimalPlaces': 7},
            ),
        )
    return len(matrices)
def primer_heatmap(self):
    """Heatmap showing information on each primer found for every sample.

    Builds one row per sample and one column per primer from self.ivar_primers
    (sample -> OrderedDict(primer -> count)).
    """
    final_data = list()
    final_xcats = list()
    final_ycats = list()
    seen_primers = set()
    for sample, primer_counts in self.ivar_primers.items():
        final_ycats.append(sample)
        row = list()
        for prim, val in primer_counts.items():
            # Bug fix: previously every sample appended ALL its primer names to
            # final_xcats, duplicating the x-axis categories once per sample.
            # Record each primer name only once (first-seen order preserved).
            if prim not in seen_primers:
                seen_primers.add(prim)
                final_xcats.append(prim)
            # assumes every sample reports the same primers in the same order,
            # so row values line up with final_xcats — TODO confirm upstream
            row.append(val)
        final_data.append(row)
    # Truthiness check: skip the section entirely when no primers were parsed
    # (the old `is not None` guard could add an empty heatmap)
    if self.ivar_primers:
        pconfig = {
            'id': 'ivar-primer-count-heatmap',
            'decimalPlaces': 0,
            'square': False,
            'title': 'iVar: Number of primers found for each sample'
        }
        self.add_section(
            name='iVar Primer Counts',
            anchor='ivar-primers-heatmap',
            description='Counts observed for each primer per sample.',
            helptext='This lists the number of times a specific primer was found in the respective sample.',
            plot=heatmap.plot(final_data, final_xcats, final_ycats, pconfig))
def parse_relatedness2(self):
    """Parse vcftools relatedness2 matrices and render one heatmap per input file."""
    matrices = {}
    for log_file in self.find_log_files('vcftools/relatedness2', filehandles=True):
        parsed = _Relatedness2Matrix(log_file)
        # Keep only fully-parsed matrices (data plus both axis label sets)
        if parsed.data and parsed.x_labels and parsed.y_labels:
            matrices[log_file['s_name']] = parsed
    matrices = self.ignore_samples(matrices)
    if not matrices:
        return 0
    log.info('Found {} valid relatedness2 matrices'.format(len(matrices)))
    helptext = '''
    `RELATEDNESS_PHI` gives a relatedness score between two samples.
    A higher score indicates a higher degree of relatedness, up to a maximum of 0.5.
    Samples are sorted alphabetically on each axis, and specific IDs can be found
    in the graph with the Highlight tab.
    '''
    for name, parsed in matrices.items():
        heatmap_plot = heatmap.plot(
            parsed.data,
            xcats=parsed.x_labels,
            ycats=parsed.y_labels,
            pconfig={'square': True, 'decimalPlaces': 7, 'title': 'VCFTools: Relatedness2'},
        )
        self.add_section(
            name='Relatedness2',
            anchor='vcftools-relatedness2',
            description="**Input:** `{}`.\n\n Heatmap of `RELATEDNESS_PHI` values from the output of vcftools relatedness2.".format(name),
            helptext=helptext,
            plot=heatmap_plot,
        )
    return len(matrices)
def roary_heatmap_plot(self, directory):
    """Return a black/white presence-absence heatmap for one Roary output directory."""
    hm_config = {
        'id': "roary_" + directory,
        'title': "Roary: " + directory,
        'square': False,
        # Two-colour scale: absent (white) -> present (black)
        'colstops': [[0, '#FFFFFF'], [1, '#000000']],
        'legend': False,
    }
    matrix = self.roary_gene_data[directory]
    sample_cats = self.roary_gene_samples[directory]
    gene_cats = self.roary_gene_genes[directory]
    return heatmap.plot(matrix, sample_cats, gene_cats, hm_config)
def status_heatmap(self):
    """ Heatmap showing all statuses for every sample """
    status_numbers = {
        'pass': 1,
        'warn': 0.5,
        'fail': 0.25
    }
    data = []
    s_names = []
    status_cats = OrderedDict()
    # Collect section categories in first-seen order, prettified for display
    for s_name in sorted(self.fastqc_data.keys()):
        s_names.append(s_name)
        for status_cat, status in self.fastqc_data[s_name]['statuses'].items():
            if status_cat not in status_cats:
                status_cats[status_cat] = status_cat.replace('_', ' ').title().replace('Gc', 'GC')
    # One row per sample, one column per category; missing statuses score 0
    for s_name in s_names:
        row = []
        for status_cat in status_cats:
            try:
                row.append(status_numbers[self.fastqc_data[s_name]['statuses'][status_cat]])
            except KeyError:
                row.append(0)
        data.append(row)
    pconfig = {
        # Bug fix: the 'id' key name was missing, so Python's implicit string
        # concatenation produced a bogus 'fastqc-status-heatmaptitle' key and
        # the plot ended up with neither an ID nor a title.
        'id': 'fastqc-status-heatmap',
        'title': 'FastQC: Statuses',
        'xTitle': 'Category',
        'yTitle': 'Sample',
        'min': 0,
        'max': 1,
        'square': False,
        'colstops': [
            [0, '#ffffff'],
            [0.25, '#d9534f'],
            [0.5, '#fee391'],
            [1, '#5cb85c'],
        ],
        'decimalPlaces': 1,
        'legend': False,
        'datalabels': False
    }
    self.add_section(
        name='Statuses',
        anchor='fastqc-statuses',
        description='FastQC section statuses for each sample.',
        helptext='''
        FastQC assigns a status for each section of the report.
        Here, we summarise all of these into a single heatmap for a quick overview.
        Note that not all FastQC sections have plots in MultiQC reports,
        but all statuses are shown in this heatmap.
        ''',
        plot=heatmap.plot(data, list(status_cats.values()), s_names, pconfig)
    )
def cor_heatmap_plot(self, heatmap_name, heatmap_val):
    """Add a clustered Pearson-correlation heatmap section to the report."""
    plot_config = {'title': 'Pearson correlation', 'xlab': True}
    self.add_section(
        description='Pearson correlation between log<sub>2</sub> normalised CPM values are calculated and clustered.',
        plot=heatmap.plot(heatmap_val, heatmap_name, pconfig=plot_config),
    )
def __init__(self, c_id, mod):
    """Build a MultiQC report section for a single custom-content item.

    c_id: content ID, used for default section name and anchor.
    mod: dict with 'config' (display / plot options) and 'data' (plot data).
    """
    modname = mod['config'].get('section_name', c_id.replace('_', ' ').title())
    # Initialise the parent object
    super(MultiqcModule, self).__init__(
        name=modname,
        anchor=mod['config'].get('section_anchor', c_id),
        href=mod['config'].get('section_href'),
        info=mod['config'].get('description'),
    )
    pconfig = mod['config'].get('pconfig', {})
    if pconfig.get('title') is None:
        pconfig['title'] = modname
    # Hoist the repeated config lookup used by every branch below
    plot_type = mod['config'].get('plot_type')
    # Table
    if plot_type == 'table':
        pconfig['sortRows'] = pconfig.get('sortRows', False)
        self.intro += table.plot(mod['data'], None, pconfig)
    # Bar plot
    elif plot_type == 'bargraph':
        self.intro += bargraph.plot(mod['data'], mod['config'].get('categories'), pconfig)
    # Line plot
    elif plot_type == 'linegraph':
        self.intro += linegraph.plot(mod['data'], pconfig)
    # Scatter plot
    elif plot_type == 'scatter':
        self.intro += scatter.plot(mod['data'], pconfig)
    # Heatmap
    elif plot_type == 'heatmap':
        self.intro += heatmap.plot(mod['data'], mod['config'].get('xcats'), mod['config'].get('ycats'), pconfig)
    # Beeswarm plot
    elif plot_type == 'beeswarm':
        self.intro += beeswarm.plot(mod['data'], pconfig)
    # Not supplied — use `is None`, not `== None` (PEP 8)
    elif plot_type is None:
        log.warning("Plot type not found for content ID '{}'".format(c_id))
    # Not recognised
    else:
        log.warning(
            "Error - custom content plot type '{}' not recognised for content ID {}"
            .format(plot_type, c_id))
def parse_relatedness2(self):
    """Parse vcftools relatedness2 output and add one heatmap section per matrix."""
    matrices = {}
    for f in self.find_log_files("vcftools/relatedness2", filehandles=True):
        matrix = _Relatedness2Matrix(f)
        # Keep only fully-parsed matrices (data plus both axis label sets)
        if matrix.data and matrix.x_labels and matrix.y_labels:
            matrices[f["s_name"]] = matrix
            self.add_data_source(f, section="Relatedness")
    matrices = self.ignore_samples(matrices)
    if not matrices:
        return 0
    log.info("Found {} valid relatedness2 matrices".format(len(matrices)))
    # The matrices cannot be written to a file in their current format
    # self.write_data_file(matrices, "vcftools_relatedness")
    helptext = """
    `RELATEDNESS_PHI` gives a relatedness score between two samples.
    A higher score indicates a higher degree of relatedness, up to a maximum of 0.5.
    Samples are sorted alphabetically on each axis, and specific IDs can be found
    in the graph with the Highlight tab.
    """
    # Number the sections so that several input files get unique anchors/IDs
    for idx, (name, matrix) in enumerate(matrices.items(), start=1):
        self.add_section(
            name="Relatedness2",
            anchor="vcftools-relatedness2-{}".format(idx),
            description="**Input:** `{}`.\n\n Heatmap of `RELATEDNESS_PHI` values from the output of vcftools relatedness2.".format(name),
            helptext=helptext,
            plot=heatmap.plot(
                matrix.data,
                xcats=matrix.x_labels,
                ycats=matrix.y_labels,
                pconfig={
                    "id": "vcftools-relatedness2-heatmap-{}".format(idx),
                    "title": "VCFTools: Relatedness2",
                    "square": True,
                    "decimalPlaces": 7,
                },
            ),
        )
    return len(matrices)
def parse_reports(self):
    """Parse deepTools plotCorrelation matrices and add a sample-similarity heatmap."""
    # Set up vars
    self.heatmap_data = dict()
    # Default search pattern
    sp = config.sp['deepTools']['Corr']
    # Go through files and parse data
    found_heatmap = False
    for f in self.find_log_files(sp):
        # First row of the file is the x categories; remaining rows are
        # "<sample> <val> <val> ..." lines.
        xcats = None
        ycats = []
        data = []
        for line in f['f'].splitlines():
            fields = line.split()
            if xcats is None:
                xcats = fields
            else:
                ycats.append(fields[0])
                data.append([float(v) for v in fields[1:]])
        # Should only have one heat map per report
        if found_heatmap:
            log.warning("Duplicate sample name found! Overwriting: {}".format(f['s_name']))
        found_heatmap = True
        self.add_data_source(f, section='Correlation')
    if found_heatmap:
        pconfig = {'title': 'Samples correlation', 'reverseColors': False}
        self.sections.append({
            'name': 'Samples Similarity',
            'anchor': 'deepToolsheatmap',
            'content': '<p>This heatmap was generated by ' +
                '<a href="http://deeptools.readthedocs.io/en/latest/content/tools/plotCorrelation.html" target="_blank">plotCorrelation</a> ' +
                ' based on the output of ' +
                '<a href="http://deeptools.readthedocs.io/en/latest/content/tools/multiBamSummary.html" target="_blank">multiBamSummary</a> or' +
                '<a href="http://deeptools.readthedocs.io/en/latest/content/tools/multiBigwigSummary.html" target="_blank">multiBigwigSummary</a>' +
                '</p>' + hm.plot(data, xcats, ycats, pconfig)
        })
    # Return number of samples found
    return 1 if found_heatmap else 0
def somalier_relatedness_heatmap_plot(self):
    """Add a heatmap of pairwise somalier relatedness values."""
    # inspiration: MultiQC/modules/vcftools/relatedness2.py
    sample_set = set()
    pair_rel = defaultdict(dict)
    for s_name, d in self.somalier_data.items():
        if "relatedness" not in d:
            continue
        # Pair keys are encoded as "sampleA*sampleB"
        a, b = s_name.split("*")
        sample_set.add(a)
        sample_set.add(b)
        pair_rel[a][b] = pair_rel[b][a] = float(d["relatedness"])
        pair_rel[a][a] = pair_rel[b][b] = float(1)
    # impose alphabetical order and avoid json serialisation errors in utils.report
    labels = sorted(sample_set)
    # -2 marks pairs with no recorded relatedness
    data = [[pair_rel[x].get(y, -2) for y in labels] for x in labels]
    if len(data) > 0:
        pconfig = {
            "id": "somalier_relatedness_heatmap_plot",
            "title": "Somalier: Sample Relatedness",
            "xlab": "Sample A",
            "ylab": "Sample B",
        }
        self.add_section(
            name="Relatedness Heatmap",
            anchor="somalier-relatedness-heatmap",
            description="Heatmap displaying relatedness of sample pairs.",
            plot=heatmap.plot(
                data=data,
                xcats=labels,
                ycats=labels,
                pconfig=pconfig,
            ),
        )
def abricate_heatmap_plot(self, db):
    """Return a coverage heatmap for one ABRicate database."""
    hm_config = {
        'id': "abricate_" + db,
        'title': "ABRicate: " + db,
        'square': False,
        # White for absent, shades of green for partial hits, black for full
        'colstops': [
            [0, '#FFFFFF'],
            [0.6, '#ffffe5'],
            [0.7, '#d9f0a3'],
            [0.95, '#004529'],
            [1, '#000000'],
        ],
    }
    matrix = self.abricate_data[db]
    xcats = self.abricate_xcats[db]
    ycats = self.abricate_ycats[db]
    return heatmap.plot(matrix, xcats, ycats, hm_config)
def plot_correlation_heatmap(self):
    """ Return HTML for correlation heatmap """
    data = None
    corr_type = None
    # Which correlation to show: Spearman by default, Pearson if configured
    preferred = getattr(config, 'rna_seqc', {}).get('default_correlation', 'spearman')
    if self.rna_seqc_spearman is not None and preferred != 'pearson':
        data = self.rna_seqc_spearman
        corr_type = 'Spearman'
    elif self.rna_seqc_pearson is not None:
        data = self.rna_seqc_pearson
        corr_type = 'Pearson'
    if data is None:
        return
    pconfig = {
        'id': 'rna_seqc_correlation_heatmap',
        'title': 'RNA-SeQC: {} Sample Correlation'.format(corr_type),
    }
    # data is (sample_names, matrix)
    self.add_section(
        name='{} Correlation'.format(corr_type),
        anchor='rseqc-rna_seqc_correlation',
        plot=heatmap.plot(data[1], data[0], data[0], pconfig),
    )
def plot_correlation_heatmap(self):
    """Return HTML for correlation heatmap"""
    # Spearman takes precedence unless the user configured Pearson
    correlation_type = getattr(config, "rna_seqc", {}).get("default_correlation", "spearman")
    if self.rna_seqc_spearman is not None and correlation_type != "pearson":
        data, corr_type = self.rna_seqc_spearman, "Spearman"
    elif self.rna_seqc_pearson is not None:
        data, corr_type = self.rna_seqc_pearson, "Pearson"
    else:
        data, corr_type = None, None
    if data is not None:
        # data is (sample_names, matrix)
        self.add_section(
            name="{} Correlation".format(corr_type),
            anchor="rseqc-rna_seqc_correlation",
            plot=heatmap.plot(
                data[1],
                data[0],
                data[0],
                {
                    "id": "rna_seqc_correlation_heatmap",
                    "title": "RNA-SeQC: {} Sample Correlation".format(corr_type),
                },
            ),
        )
def plot_correlation_heatmap(self):
    """ Return HTML for correlation heatmap """
    # Spearman takes precedence unless the user configured Pearson
    chosen = getattr(config, 'rna_seqc', {}).get('default_correlation', 'spearman')
    corr_type = None
    data = None
    if self.rna_seqc_spearman is not None and chosen != 'pearson':
        corr_type = 'Spearman'
        data = self.rna_seqc_spearman
    elif self.rna_seqc_pearson is not None:
        corr_type = 'Pearson'
        data = self.rna_seqc_pearson
    if data is not None:
        # data is (sample_names, matrix); legacy API appends raw section dicts
        self.sections.append({
            'name': '{} Correlation'.format(corr_type),
            'anchor': 'rseqc-rna_seqc_correlation',
            'content': heatmap.plot(
                data[1],
                data[0],
                data[0],
                {
                    'id': 'rna_seqc_correlation_heatmap',
                    'title': 'RNA-SeQC: {} Sample Correlation'.format(corr_type),
                },
            ),
        })
def parse_reports(self):
    """Parse the ngi_rnaseq sample-distance matrix and add a similarity heatmap."""
    # Set up vars
    self.heatmap_data = dict()
    # Go through files and parse data using regexes
    found_heatmap = False
    for f in self.find_log_files('ngi_rnaseq/heatmap'):
        # First row is the x categories; remaining rows are
        # "<sample> <val> <val> ..." lines.
        xcats = None
        ycats = []
        data = []
        for line in f['f'].splitlines():
            fields = line.split()
            if xcats is None:
                xcats = fields
            else:
                ycats.append(fields[0])
                data.append([float(v) for v in fields[1:]])
        # Should only have one heat map per report
        if found_heatmap:
            log.debug("Duplicate sample name found! Overwriting: {}".format(f['s_name']))
        found_heatmap = True
        self.add_data_source(f, section='heatmap')
    if found_heatmap:
        pconfig = {'title': 'Sample Distances', 'reverseColors': True}
        self.add_section(
            name='Sample Similarity',
            anchor='ngi_rnaseq-sample_similarity',
            description='''To generate this plot, gene counts are normalised using
            <a href="https://bioconductor.org/packages/release/bioc/html/edgeR.html" target="_blank">edgeR</a>.
            Euclidean distances between log<sub>2</sub> normalised CPM values are then calculated and clustered.''',
            plot=heatmap.plot(data, xcats, ycats, pconfig))
    # Return number of samples found
    return 1 if found_heatmap else 0
def bcbio_qsignature_chart(self, names):
    """Make the bcbio qsignature assignment rates heatmap.

    names: log-file search pattern passed to find_log_files.
    Returns the heatmap HTML, or None if no data was parsed.
    """
    hmdata = list()
    data = defaultdict(dict)
    for f in self.find_log_files(names):
        # Each line: "<sampleA> <sampleB> <score>" — store symmetrically,
        # with zero on the diagonal
        for l in f['f'].splitlines():
            cols = l.strip().split()
            data[cols[0]][cols[1]] = float(cols[2])
            data[cols[1]][cols[0]] = float(cols[2])
            data[cols[0]][cols[0]] = 0
            data[cols[1]][cols[1]] = 0
    # Bug fix: previously an empty heatmap was plotted when no files matched;
    # return None instead (matches the sibling implementation of this chart)
    if not data:
        return None
    # Use a distinct local name — the old code shadowed the 'names' parameter,
    # and materialise the keys view as a list for stable ordering / JSON output
    sample_names = list(data.keys())
    for name in sample_names:
        row = list()
        for name2 in sample_names:
            row.append(data[name][name2])
        hmdata.append(row)
    return heatmap.plot(hmdata, sample_names)
def parse_plotCorrelation(self):
    """Find plotCorrelation output"""
    self.deeptools_plotCorrelationData = dict()
    for f in self.find_log_files('deeptools/plotCorrelationData', filehandles=False):
        parsed_data, samples = self.parsePlotCorrelationData(f)
        for sample, values in parsed_data.items():
            if sample in self.deeptools_plotCorrelationData:
                log.warning("Replacing duplicate sample {}.".format(sample))
            self.deeptools_plotCorrelationData[sample] = values
        if len(parsed_data) > 0:
            self.add_data_source(f, section='plotCorrelation')
    # Respect user-configured ignored samples
    self.deeptools_plotCorrelationData = self.ignore_samples(self.deeptools_plotCorrelationData)
    if len(self.deeptools_plotCorrelationData) > 0:
        config = {
            'id': 'deeptools_correlation_plot',
            'title': 'deeptools: Correlation Plot',
        }
        # Keep matrix sample order; drop samples filtered out above
        data = [
            self.deeptools_plotCorrelationData[s_name]
            for s_name in samples
            if s_name in self.deeptools_plotCorrelationData
        ]
        if len(data) == 0:
            log.debug('No valid data for correlation plot')
            return None
        self.add_section(
            name="Correlation heatmap",
            anchor="deeptools_correlation",
            description="Pairwise correlations of samples based on distribution of sequence reads",
            plot=heatmap.plot(data, samples, samples, config))
    return len(self.deeptools_plotCorrelationData)
def bcbio_qsignature_chart(self, fnames):
    """ Make the bcbio assignment rates plot """
    scores = defaultdict(dict)
    for f in self.find_log_files(fnames):
        s_name = self.clean_s_name(f['fn'], root=None)
        # Each line: "<sampleA> <sampleB> <score>" — store symmetrically,
        # with zero on the diagonal
        with open(os.path.join(f['root'], f['fn'])) as in_handle:
            for line in in_handle:
                cols = line.strip().split()
                scores[cols[0]][cols[1]] = float(cols[2])
                scores[cols[1]][cols[0]] = float(cols[2])
                scores[cols[0]][cols[0]] = 0
                scores[cols[1]][cols[1]] = 0
    if scores:
        sample_names = list(scores.keys())
        matrix = [[scores[a][b] for b in sample_names] for a in sample_names]
        return heatmap.plot(matrix, sample_names)
def parse_reports(self):
    """ Find diff number table and parse their data """
    # Set up vars
    self.diff_num = dict()
    # There must be exactly one diff-number table in the search results
    hits = self.find_log_files('diffNum/diff_matrix', filehandles=False, filecontents=False)
    for idx, hit in enumerate(hits):
        if idx > 0:
            raise ValueError('more than one diff number table found.')
        self.diff_num['table'] = hit
    if not self.diff_num:
        raise UserWarning
    self.write_data_file(self.diff_num, 'multiqc_diffNum_matrix')
    table_info = self.diff_num['table']
    diff_num_file = os.path.join(table_info['root'], table_info['fn'])
    diff_num_df = pd.read_csv(diff_num_file, index_col=0)
    heatmap_name = list(diff_num_df.columns)
    heatmap_val = [list(diff_num_df.loc[row]) for row in diff_num_df.index]
    pconfig = {
        'title': 'Differential Expressed Genes',
        'xlab': True,
    }
    self.add_section(
        description='This plot shows the Differential expressed gene number for each compare.',
        plot=heatmap.plot(heatmap_val, heatmap_name, pconfig=pconfig))
    # Return number of compares found
    return len(heatmap_val)
def status_heatmap(self):
    """Heatmap showing all statuses for every sample"""
    status_numbers = {"pass": 1, "warn": 0.5, "fail": 0.25}
    s_names = sorted(self.fastqc_data.keys())
    # Collect section categories in first-seen order, prettified for display
    status_cats = OrderedDict()
    for s_name in s_names:
        for status_cat in self.fastqc_data[s_name]["statuses"]:
            if status_cat not in status_cats:
                status_cats[status_cat] = status_cat.replace("_", " ").title().replace("Gc", "GC")
    # One row per sample, one column per category; missing/unknown statuses score 0
    data = []
    for s_name in s_names:
        statuses = self.fastqc_data[s_name]["statuses"]
        data.append([status_numbers.get(statuses.get(cat), 0) for cat in status_cats])
    pconfig = {
        "id": "fastqc-status-check-heatmap",
        "title": "FastQC: Status Checks",
        "xTitle": "Section Name",
        "yTitle": "Sample",
        "min": 0,
        "max": 1,
        "square": False,
        "colstops": [
            [0, "#ffffff"],
            [0.25, "#d9534f"],
            [0.5, "#fee391"],
            [1, "#5cb85c"],
        ],
        "decimalPlaces": 1,
        "legend": False,
        "datalabels": False,
        "xcats_samples": False,
    }
    self.add_section(
        name="Status Checks",
        anchor="fastqc_status_checks",
        description="""
            Status for each FastQC section showing whether results seem entirely normal (green),
            slightly abnormal (orange) or very unusual (red).
        """,
        helptext="""
            FastQC assigns a status for each section of the report.
            These give a quick evaluation of whether the results of the analysis seem
            entirely normal (green), slightly abnormal (orange) or very unusual (red).

            It is important to stress that although the analysis results appear to give a pass/fail result,
            these evaluations must be taken in the context of what you expect from your library.
            A 'normal' sample as far as FastQC is concerned is random and diverse.
            Some experiments may be expected to produce libraries which are biased in particular ways.
            You should treat the summary evaluations therefore as pointers to where you should concentrate
            your attention and understand why your library may not look random and diverse.

            Specific guidance on how to interpret the output of each module can be found in the relevant
            report section, or in the [FastQC help](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/3%20Analysis%20Modules/).

            In this heatmap, we summarise all of these into a single heatmap for a quick overview.
            Note that not all FastQC sections have plots in MultiQC reports,
            but all status checks are shown in this heatmap.
        """,
        plot=heatmap.plot(data, list(status_cats.values()), s_names, pconfig),
    )
def heatmap(self):
    """Plot Jensen-Shannon Divergence (JSD) similarity heatmaps between samples.

    Computes the JSD between all samples over the GC-bias and 3'/5' sequence-bias
    distributions parsed earlier into self.matrix_gc / self.matrix_seq3 /
    self.matrix_seq5. A value of 0 corresponds to similar samples and 1 to
    dissimilar samples.

    Output: up to four heatmap sections (combined, GC, seq-3', seq-5') plus a
    table flagging samples with missing features.
    """
    names = []          # samples that have BOTH GC and seq bias corrections
    gc_names = []       # samples with GC bias correction
    seq_names = []      # samples with seq bias correction
    missing_names = {}  # per-sample 'Missing GC/Seq Feature' flags for the table
    gc_exists = {}      # sample -> gc_bias_correct flag from meta_info.json
    seq_exists = {}     # sample -> seq_bias_correct flag from meta_info.json
    for f in self.find_log_files('salmon/fld'):
        if os.path.basename(f['root']) == 'libParams':
            s_name = os.path.abspath(f['root'])
            # Strip the trailing '/libParams' (10 characters) to get the sample root dir
            path = s_name[:-10]
            sample_name = self.get_sample_name(s_name)
            if 'no_bias' in s_name:
                continue
            # Read the bias-correction flags from Salmon's aux_info/meta_info.json
            path_meta_info = os.path.join(path, 'aux_info', 'meta_info.json')
            with open(path_meta_info, 'r') as info:
                meta_info = json.load(info)
            gc_exists[sample_name] = meta_info['gc_bias_correct']
            seq_exists[sample_name] = meta_info['seq_bias_correct']
            if gc_exists[sample_name]:
                gc_names.append(sample_name)
                if sample_name not in missing_names:
                    missing_names[sample_name] = {}
                missing_names[sample_name]['Missing GC Feature'] = 'No'
            else:
                if sample_name not in missing_names:
                    missing_names[sample_name] = {}
                missing_names[sample_name]['Missing GC Feature'] = 'Yes'
            if seq_exists[sample_name]:
                seq_names.append(sample_name)
                if sample_name not in missing_names:
                    missing_names[sample_name] = {}
                missing_names[sample_name]['Missing Seq Feature'] = 'No'
            else:
                if sample_name not in missing_names:
                    missing_names[sample_name] = {}
                missing_names[sample_name]['Missing Seq Feature'] = 'Yes'
            if gc_exists[sample_name] and seq_exists[sample_name]:
                names.append(sample_name)
    # Square zero matrices, one entry per sample pair
    sims_gc = [[0 for j in range(len(gc_names))] for i in range(len(gc_names))]
    sims_3 = [[0 for j in range(len(seq_names))] for i in range(len(seq_names))]
    sims_5 = [[0 for j in range(len(seq_names))] for i in range(len(seq_names))]
    sims = [[0 for j in range(len(names))] for i in range(len(names))]
    # Combined similarity: average JSD over GC + 3' seq + 5' seq features.
    # Every sample in `names` has both flags set, so feature_count >= 1 and the
    # division below cannot be by zero.
    for i in range(len(names)):
        for j in range(len(names)):
            feature_count = 0
            if gc_exists[names[i]] and gc_exists[names[j]]:
                sims[i][j] += self.jensen_shannon_divergence(self.matrix_gc[names[i]], self.matrix_gc[names[j]])
                feature_count += 1.0
            for k in range(len(self.nucleotides)):
                if seq_exists[names[i]] and seq_exists[names[j]]:
                    sims[i][j] += self.jensen_shannon_divergence(self.matrix_seq3[k][names[i]], self.matrix_seq3[k][names[j]])
                    sims[i][j] += self.jensen_shannon_divergence(self.matrix_seq5[k][names[i]], self.matrix_seq5[k][names[j]])
                    feature_count += 2.0
            sims[i][j] /= feature_count
    # GC-only similarity matrix
    for i in range(len(gc_names)):
        for j in range(len(gc_names)):
            if gc_exists[gc_names[i]] and gc_exists[gc_names[j]]:
                sims_gc[i][j] += self.jensen_shannon_divergence(self.matrix_gc[gc_names[i]], self.matrix_gc[gc_names[j]])
    # Sequence-bias similarity matrices, averaged over nucleotides
    for i in range(len(seq_names)):
        for j in range(len(seq_names)):
            for k in range(len(self.nucleotides)):
                if seq_exists[seq_names[i]] and seq_exists[seq_names[j]]:
                    sims_3[i][j] += self.jensen_shannon_divergence(self.matrix_seq3[k][seq_names[i]], self.matrix_seq3[k][seq_names[j]])
                    sims_5[i][j] += self.jensen_shannon_divergence(self.matrix_seq5[k][seq_names[i]], self.matrix_seq5[k][seq_names[j]])
            sims_3[i][j] /= (1.0 * len(self.nucleotides))
            sims_5[i][j] /= (1.0 * len(self.nucleotides))
    pconfig_sim = {
        'title': 'Sample similarity (JSD)',
        'xTitle': 'Samples',
        'yTitle': 'Samples',
    }
    pconfig_sim_gc = {
        'title': 'Feature GC Sample similarity (JSD)',
        'xTitle': 'Samples',
        'yTitle': 'Samples',
    }
    pconfig_sim_3 = {
        'title': 'Feature Seq 3 Sample similarity (JSD)',
        'xTitle': 'Samples',
        'yTitle': 'Samples',
    }
    pconfig_sim_5 = {
        'title': 'Feature Seq 5 Sample similarity (JSD)',
        'xTitle': 'Samples',
        'yTitle': 'Samples',
    }
    # NOTE(review): these gates test gc_exists/seq_exists (all samples seen),
    # not gc_names/seq_names, so a section can be added with an empty matrix
    # when no sample actually had the feature — confirm this is intended.
    if len(gc_exists) > 0:
        self.add_section(plot = heatmap.plot(sims_gc, gc_names, pconfig=pconfig_sim_gc))
    if len(seq_exists) > 0:
        self.add_section(plot = heatmap.plot(sims_3, seq_names, pconfig=pconfig_sim_3))
    if len(seq_exists) > 0:
        self.add_section(plot = heatmap.plot(sims_5, seq_names, pconfig=pconfig_sim_5))
    if len(names) > 0:
        self.add_section(plot = heatmap.plot(sims, names, pconfig=pconfig_sim))
    # Table listing which features were missing for each sample
    self.add_section(plot = table.plot(missing_names))
def __init__(self):
    """Salmon MultiQC module constructor.

    Finds and parses Salmon output: meta-info JSON, GC-bias and sequence-bias
    models, quant.sf effective/actual length ratios and fragment length
    distributions; then registers general-stats columns and all report
    sections (line graphs, bar plot and correlation heatmaps).

    Raises:
        UserWarning: when no Salmon logs at all were found (MultiQC uses this
            to skip the module).
    """
    # Initialise the parent object
    super(MultiqcModule, self).__init__(
        name='Salmon',
        anchor='salmon',
        href='http://combine-lab.github.io/salmon/',
        info="is a tool for quantifying the expression of transcripts using RNA-seq data."
    )

    # Parse meta information. JSON win!
    self.salmon_meta = dict()

    # Dicts holding the ratios for first/middle/last rows with weights,
    # and the average ratio, for GC bias
    self.salmon_bias_FirstSampleWeights = dict()
    self.salmon_bias_MiddleSampleWeights = dict()
    self.salmon_bias_LastSampleWights = dict()
    self.salmon_bias_Average = dict()
    self.salmon_bias_TotalAverage = dict()

    # Dicts holding the sequence 3' and 5' marginalized ratios for all bases
    # (A, C, G, T) and the average bias for 3' and 5'
    self.salmon_seq3A = dict()
    self.salmon_seq3C = dict()
    self.salmon_seq3G = dict()
    self.salmon_seq3T = dict()
    self.salmon_seq5A = dict()
    self.salmon_seq5C = dict()
    self.salmon_seq5G = dict()
    self.salmon_seq5T = dict()
    self.salmon_seq3Average = dict()
    self.salmon_seq5Average = dict()

    # Dict holding the ratios of effective vs actual length of samples from the quant.sf file
    self.salmon_quant = dict()

    # Lists holding one array per sample, used to build the heatmaps below
    self.heatmapFirstrow = []
    self.heatMapMiddleRow = []
    self.heatMapLastRow = []
    self.averageBiasHeatMap = []
    self.salmon_seq3HeatMap = []
    self.salmon_seq5HeatMap = []

    # List of all the sample names
    # NOTE(review): each entry appended below is itself a list
    # (result of .partition('|')[0].split()) — confirm heatmap.plot accepts that
    self.sample_names = []
    count = 0

    for f in self.find_log_files('salmon/meta'):
        # Get the s_name from the parent directory
        s_name = os.path.basename(os.path.dirname(f['root']))
        s_name = self.clean_s_name(s_name, f['root'])
        self.salmon_meta[s_name] = json.loads(f['f'])
        s_name_trimmed = s_name.partition('|')[0].split()
        self.sample_names.append(s_name_trimmed)

        # Check if folder contains GC bias files
        gcBias = checkJSONForBias(os.path.dirname(f['root']), 'gcBias')
        if gcBias:
            # Dicts for every sample for all the bucket (25) ratios,
            # holding (x, y) data for linegraphs
            firstRatioWeight = OrderedDict()
            middleRatioWeight = OrderedDict()
            lastRatioWeight = OrderedDict()
            average = OrderedDict()
            sampleAverage = OrderedDict()
            gc = GCModel()  # Instantiate GCModel class
            # Call the GCModel method to get all observed and expected values
            gc.from_file(os.path.dirname(f['root']))
            # Weighted observed/expected ratio for each of the three GC model rows
            first_Row = (gc.obs_[0] / gc.exp_[0]) * (gc.obs_weights_[0] / gc.exp_weights_[0])
            middle_Row = (gc.obs_[1] / gc.exp_[1]) * (gc.obs_weights_[1] / gc.exp_weights_[1])
            last_Row = (gc.obs_[2] / gc.exp_[2]) * (gc.obs_weights_[2] / gc.exp_weights_[2])
            # Averaging all the ratios for the entire sample
            totalSampleAverage = (
                (sum(first_Row) + sum(middle_Row) + sum(last_Row)) /
                (len(first_Row) + len(middle_Row) + len(last_Row)))
            # NOTE(review): keyed by a global running counter, so each sample's
            # OrderedDict holds a single entry at a different key — confirm intended
            sampleAverage[count] = totalSampleAverage
            count = count + 1
            self.salmon_bias_TotalAverage[s_name_trimmed[0]] = sampleAverage
            # Per-row ratio arrays, kept as plain lists for the heatmaps
            self.heatmapFirstrow.append(first_Row.tolist())
            self.heatMapMiddleRow.append(middle_Row.tolist())
            self.heatMapLastRow.append(last_Row.tolist())
            heatmapAverage = []
            # Iterating over all the buckets to create the ordered dicts;
            # x value is the bucket position scaled to a 0-100 range
            for i in range(len(first_Row)):
                index = i * (100 / len(first_Row))
                firstRatioWeight[index] = first_Row[i]
                middleRatioWeight[index] = middle_Row[i]
                lastRatioWeight[index] = last_Row[i]
                average[index] = np.mean([first_Row[i], middle_Row[i], last_Row[i]])
                heatmapAverage.append(average[index])
            # Store all the ordered dicts on the outermost dictionaries, keyed by sample name
            self.salmon_bias_FirstSampleWeights[s_name] = firstRatioWeight
            self.salmon_bias_MiddleSampleWeights[s_name] = middleRatioWeight
            self.salmon_bias_LastSampleWights[s_name] = lastRatioWeight
            self.salmon_bias_Average[s_name] = average
            self.averageBiasHeatMap.append(heatmapAverage)

        # Check if folder contains sequence bias files
        seqBias = checkJSONForBias(os.path.dirname(f['root']), 'seqBias')
        if seqBias:
            # Dicts for every base for 3' and 5' sequence, average 3' and
            # average 5', and the quant dict
            seq3A = OrderedDict()
            seq5A = OrderedDict()
            seq3C = OrderedDict()
            seq5C = OrderedDict()
            seq3G = OrderedDict()
            seq5G = OrderedDict()
            seq3T = OrderedDict()
            seq5T = OrderedDict()
            seq3_Average = OrderedDict()
            seq5_Average = OrderedDict()
            quant_Dict = OrderedDict()
            # Calculate the ratio of all rows for observed by expected
            seq = SeqModel()  # Instantiate SeqModel class
            # Call the SeqModel method to get all observed and expected ratios
            seq.from_file(os.path.dirname(f['root']))
            # Rows 0..3 correspond to the bases A, C, G, T respectively
            seq3A_prob = seq.obs3_[0] / seq.exp3_[0]
            seq3C_prob = seq.obs3_[1] / seq.exp3_[1]
            seq3G_prob = seq.obs3_[2] / seq.exp3_[2]
            seq3T_prob = seq.obs3_[3] / seq.exp3_[3]
            seq5A_prob = seq.obs5_[0] / seq.exp5_[0]
            seq5C_prob = seq.obs5_[1] / seq.exp5_[1]
            seq5G_prob = seq.obs5_[2] / seq.exp5_[2]
            seq5T_prob = seq.obs5_[3] / seq.exp5_[3]
            seq3_HeatMap = []
            seq5_HeatMap = []
            # Iterate over the context length to create all ordered dictionaries
            # of (x, y) values for the linegraphs, and the lists for the heatmaps
            for i in range(len(seq3A_prob)):
                index = i * (100 / len(seq3A_prob))
                seq3A[index] = seq3A_prob[i]
                seq5A[index] = seq5A_prob[i]
                seq3C[index] = seq3C_prob[i]
                seq5C[index] = seq5C_prob[i]
                seq3G[index] = seq3G_prob[i]
                seq5G[index] = seq5G_prob[i]
                seq3T[index] = seq3T_prob[i]
                seq5T[index] = seq5T_prob[i]
                seq3_Average[index] = np.mean([
                    seq3A_prob[i], seq3C_prob[i], seq3G_prob[i], seq3T_prob[i]
                ])
                seq5_Average[index] = np.mean([
                    seq5A_prob[i], seq5C_prob[i], seq5G_prob[i], seq5T_prob[i]
                ])
                seq3_HeatMap.append(seq3_Average[index])
                seq5_HeatMap.append(seq5_Average[index])
            # Store all the ordered dicts on the outermost dictionaries, keyed by sample name
            self.salmon_seq3A[s_name] = seq3A
            self.salmon_seq5A[s_name] = seq5A
            self.salmon_seq3C[s_name] = seq3C
            self.salmon_seq5C[s_name] = seq5C
            self.salmon_seq3G[s_name] = seq3G
            self.salmon_seq5G[s_name] = seq5G
            self.salmon_seq3T[s_name] = seq3T
            self.salmon_seq5T[s_name] = seq5T
            self.salmon_seq3Average[s_name] = seq3_Average
            self.salmon_seq5Average[s_name] = seq5_Average
            self.salmon_seq3HeatMap.append(seq3_HeatMap)
            self.salmon_seq5HeatMap.append(seq5_HeatMap)
            # Call the Quant model which reads the quant.sf file and returns
            # the ratio of Effective/Actual length
            quant = QuantModel()
            quant.from_file(os.path.dirname(f['root']))
            quant_ratio = quant.ratio
            for i in range(len(quant_ratio)):
                quant_Dict[i] = quant_ratio[i]
            self.salmon_quant[s_name] = quant_Dict

    # Parse Fragment Length Distribution logs
    self.salmon_fld = dict()
    for f in self.find_log_files('salmon/fld'):
        # Get the s_name from the parent directory
        if os.path.basename(f['root']) == 'libParams':
            s_name = os.path.basename(os.path.dirname(f['root']))
            s_name = self.clean_s_name(s_name, f['root'])
            # Whitespace-separated counts; position index -> fraction value
            parsed = OrderedDict()
            for i, v in enumerate(f['f'].split()):
                parsed[i] = float(v)
            if len(parsed) > 0:
                if s_name in self.salmon_fld:
                    log.debug(
                        "Duplicate sample name found! Overwriting: {}".
                        format(s_name))
                self.add_data_source(f, s_name)
                self.salmon_fld[s_name] = parsed

    # Filter to strip out ignored sample names
    self.salmon_meta = self.ignore_samples(self.salmon_meta)
    self.salmon_fld = self.ignore_samples(self.salmon_fld)
    # Nothing found at all: tell MultiQC to skip this module
    if len(self.salmon_meta) == 0 and len(self.salmon_fld) == 0:
        raise UserWarning
    if len(self.salmon_meta) > 0:
        log.info("Found {} meta reports".format(len(self.salmon_meta)))
        self.write_data_file(self.salmon_meta, 'multiqc_salmon')
    if len(self.salmon_fld) > 0:
        log.info("Found {} fragment length distributions".format(
            len(self.salmon_fld)))
    if len(self.salmon_bias_Average) > 0:
        log.info("Found {} GC Bias".format(len(self.salmon_bias_Average)))
    if len(self.salmon_seq3Average) > 0:
        log.info("Found {} Sequence 3' bias".format(
            len(self.salmon_seq3Average)))
    if len(self.salmon_seq5Average) > 0:
        log.info("Found {} Sequence 5' bias".format(
            len(self.salmon_seq5Average)))

    # Add alignment rate to the general stats table
    headers = OrderedDict()
    headers['percent_mapped'] = {
        'title': '% Aligned',
        'description': '% Mapped reads',
        'max': 100,
        'min': 0,
        'suffix': '%',
        'scale': 'YlGn'
    }
    headers['num_mapped'] = {
        'title': 'M Aligned',
        'description': 'Mapped reads (millions)',
        'min': 0,
        'scale': 'PuRd',
        'modify': lambda x: float(x) / 1000000,
        'shared_key': 'read_count'
    }
    self.general_stats_addcols(self.salmon_meta, headers)

    # Fragment length distribution plot
    pconfig = {
        'smooth_points': 500,
        'id': 'salmon_plot',
        'title': 'Salmon: Fragment Length Distribution',
        'ylab': 'Fraction',
        'xlab': 'Fragment Length (bp)',
        'ymin': 0,
        'xmin': 0,
        'tt_label': '<b>{point.x:,.0f} bp</b>: {point.y:,.0f}',
    }
    self.add_section(plot=linegraph.plot(self.salmon_fld, pconfig))

    # GC Bias First Row plot
    pconfig_GCBias_Begin = {
        'smooth_points': 500,
        'title': 'Salmon : GC Bias Ratio in Beginning of Read',
        'ylab': 'Ratio',
        'xlab': 'GC Biases',
        'ymin': 0,
        'xmin': 0,
        'xmax': 100,
        'tt_label': '<b>{point.x:,.0f} bp</b>: {point.y:,.0f}',
    }
    self.add_section(name='GC Bias First Row',
                     plot=linegraph.plot(
                         self.salmon_bias_FirstSampleWeights,
                         pconfig_GCBias_Begin))

    # GC Bias Middle row plot
    pconfig_GCBias_Middle = {
        'smooth_points': 500,
        'title': 'Salmon : GC Bias Ratio in Middle of Read',
        'ylab': 'Ratio',
        'xlab': 'GC Biases',
        'ymin': 0,
        'xmin': 0,
        'xmax': 100,
        'tt_label': '<b>{point.x:,.0f} bp</b>: {point.y:,.0f}',
    }
    self.add_section(name='GC Bias Middle Row',
                     plot=linegraph.plot(
                         self.salmon_bias_MiddleSampleWeights,
                         pconfig_GCBias_Middle))

    # GC Bias Last row plot
    pconfig_GCBias_Last = {
        'smooth_points': 500,
        'id': 'salmon_plot6',
        'title': 'Salmon : GC Bias Ratio in Last of Read',
        'ylab': 'Ratio',
        'xlab': 'GC Biases',
        'ymin': 0,
        'xmin': 0,
        'xmax': 100,
        'tt_label': '<b>{point.x:,.0f} bp</b>: {point.y:,.0f}',
    }
    self.add_section(name='GC Bias Last Row',
                     plot=linegraph.plot(self.salmon_bias_LastSampleWights,
                                         pconfig_GCBias_Last))

    # GC Bias Average across all samples
    pconfig_GCBias_Average = {
        'smooth_points': 500,
        'title': 'Salmon : Average GC Bias of all samples',
        'ylab': 'Ratio',
        'xlab': 'Bias',
        'ymin': 0,
        'xmin': 0,
        'xmax': 100,
        'tt_label': '<b>{point.x:,.0f} bp</b>: {point.y:,.0f}',
    }
    self.add_section(name='GC Bias Average',
                     plot=linegraph.plot(self.salmon_bias_Average,
                                         pconfig_GCBias_Average))

    # GC Bias Average bar plot
    pconfig_GCBias_Bar = {
        'smooth_points': 500,
        'title': 'Salmon : Average GC Bias bar plot',
        'ylab': 'Ratios',
        'xlab': 'Samples',
        'ymin': 0,
        'xmin': 0,
        'tt_label': '<b>{point.x:,.0f} bp</b>: {point.y:,.0f}',
    }
    self.add_section(name='GC Bias Average Bar Plot',
                     plot=bargraph.plot(self.salmon_bias_TotalAverage,
                                        pconfig=pconfig_GCBias_Bar))

    # Sequence 3' Bias for A
    pconfig_Seq3_A = {
        'smooth_points': 500,
        'title': 'Salmon : Seq 3 A Base',
        'ylab': 'Marginalized Probability Ratio',
        'xlab': 'Sequence',
        'ymin': 0,
        'xmin': 0,
        'xmax': 100,
        'tt_label': '<b>{point.x:,.0f} bp</b>: {point.y:,.0f}',
    }
    self.add_section(name='Sequence 3 A-Base',
                     plot=linegraph.plot(self.salmon_seq3A, pconfig_Seq3_A))

    # Sequence 3' Bias for C
    pconfig_Seq3_C = {
        'smooth_points': 500,
        'title': 'Salmon : Seq 3 C Base',
        'ylab': 'Marginalized Probability Ratio',
        'xlab': 'Sequence',
        'ymin': 0,
        'xmin': 0,
        'xmax': 100,
        'tt_label': '<b>{point.x:,.0f} bp</b>: {point.y:,.0f}',
    }
    self.add_section(name='Sequence 3 C-Base',
                     plot=linegraph.plot(self.salmon_seq3C, pconfig_Seq3_C))

    # Sequence 3' Bias for G
    pconfig_Seq3_G = {
        'smooth_points': 500,
        'title': 'Salmon : Seq 3 G Base',
        'ylab': 'Marginalized Probability Ratio',
        'xlab': 'Sequence',
        'ymin': 0,
        'xmin': 0,
        'xmax': 100,
        'tt_label': '<b>{point.x:,.0f} bp</b>: {point.y:,.0f}',
    }
    self.add_section(name='Sequence 3 G-Base',
                     plot=linegraph.plot(self.salmon_seq3G, pconfig_Seq3_G))

    # Sequence 3' Bias for T
    pconfig_Seq3_T = {
        'smooth_points': 500,
        'title': 'Salmon : Seq 3 T base',
        'ylab': 'Marginalized Probability Ratio',
        'xlab': 'Sequence',
        'ymin': 0,
        'xmin': 0,
        'xmax': 100,
        'tt_label': '<b>{point.x:,.0f} bp</b>: {point.y:,.0f}',
    }
    self.add_section(name='Sequence 3 T-Base',
                     plot=linegraph.plot(self.salmon_seq3T, pconfig_Seq3_T))

    # Sequence 3' Average
    pconfig_Seq3_Avg = {
        'smooth_points': 500,
        'title': 'Salmon : Seq 3 Average',
        'ylab': 'Marginalized Probability Ratio',
        'xlab': 'Sequence',
        'ymin': 0,
        'xmin': 0,
        'xmax': 100,
        'tt_label': '<b>{point.x:,.0f} bp</b>: {point.y:,.0f}',
    }
    self.add_section(name='Sequence 3 Average',
                     plot=linegraph.plot(self.salmon_seq3Average,
                                         pconfig_Seq3_Avg))

    # Sequence 5' Bias for A
    pconfig_Seq5_A = {
        'smooth_points': 500,
        'title': 'Salmon : Seq 5 A Base',
        'ylab': 'Marginalized Probability Ratio',
        'xlab': 'Sequence',
        'ymin': 0,
        'xmin': 0,
        'xmax': 100,
        'tt_label': '<b>{point.x:,.0f} bp</b>: {point.y:,.0f}',
    }
    self.add_section(name='Sequence 5 A-Base',
                     plot=linegraph.plot(self.salmon_seq5A, pconfig_Seq5_A))

    # Sequence 5' Bias for C
    pconfig_Seq5_C = {
        'smooth_points': 500,
        'title': 'Salmon : Seq 5 C Base',
        'ylab': 'Marginalized Probability Ratio',
        'xlab': 'Sequence',
        'ymin': 0,
        'xmin': 0,
        'xmax': 100,
        'tt_label': '<b>{point.x:,.0f} bp</b>: {point.y:,.0f}',
    }
    self.add_section(name='Sequence 5 C-Base',
                     plot=linegraph.plot(self.salmon_seq5C, pconfig_Seq5_C))

    # Sequence 5' Bias for G
    pconfig_Seq5_G = {
        'smooth_points': 500,
        'title': 'Salmon : Seq 5 G Base',
        'ylab': 'Marginalized Probability Ratio',
        'xlab': 'Sequence',
        'ymin': 0,
        'xmin': 0,
        'xmax': 100,
        'tt_label': '<b>{point.x:,.0f} bp</b>: {point.y:,.0f}',
    }
    self.add_section(name='Sequence 5 G-Base',
                     plot=linegraph.plot(self.salmon_seq5G, pconfig_Seq5_G))

    # Sequence 5' Bias for T
    pconfig_Seq5_T = {
        'smooth_points': 500,
        'title': 'Salmon : Seq 5 T base',
        'ylab': 'Marginalized Probability Ratio',
        'xlab': 'Sequence',
        'ymin': 0,
        'xmin': 0,
        'xmax': 100,
        'tt_label': '<b>{point.x:,.0f} bp</b>: {point.y:,.0f}',
    }
    self.add_section(name='Sequence 5 T-Base',
                     plot=linegraph.plot(self.salmon_seq5T, pconfig_Seq5_T))

    # Sequence 5' Average
    pconfig_Seq5_Avg = {
        'smooth_points': 500,
        'title': 'Salmon : Seq 5 Average',
        'ylab': 'Marginalized Probability Ratio',
        'xlab': 'Sequence',
        'ymin': 0,
        'xmin': 0,
        'xmax': 100,
        'tt_label': '<b>{point.x:,.0f} bp</b>: {point.y:,.0f}',
    }
    self.add_section(name='Sequence 5 Average',
                     plot=linegraph.plot(self.salmon_seq5Average,
                                         pconfig_Seq5_Avg))

    # Quant Plot
    pconfig_Quant = {
        'smooth_points': 500,
        'id': 'salmon_plot7',
        'title': 'Salmon : Quant plot',
        'ylab': 'Effective/Actual Length ',
        'xlab': 'Samples',
        'tt_label': '<b>{point.x:,.0f} bp</b>: {point.y:,.0f}',
    }
    self.add_section(name='Quant Plot',
                     plot=linegraph.plot(self.salmon_quant, pconfig_Quant))

    # First Row of all samples Heatmap (correlation between samples)
    FirstRowCoff = np.corrcoef(self.heatmapFirstrow)
    self.add_section(
        name='GC Bias First Row Heatmap',
        description=
        'Heatmap to display variance between first row ratios of all the samples',
        plot=heatmap.plot(FirstRowCoff, self.sample_names,
                          self.sample_names))

    # Middle Row of all samples Heatmap
    MiddleRowCoff = np.corrcoef(self.heatMapMiddleRow)
    self.add_section(
        name='GC Bias Middle Row Heatmap',
        description=
        'Heatmap to display variance between middle row ratios of all the samples',
        plot=heatmap.plot(MiddleRowCoff, self.sample_names,
                          self.sample_names))

    # Last Row of all samples Heatmap
    LastRowCoff = np.corrcoef(self.heatMapLastRow)
    self.add_section(
        name='GC Bias Last Row Heatmap',
        description=
        'Heatmap to display variance between last row ratios of all the samples',
        plot=heatmap.plot(LastRowCoff, self.sample_names,
                          self.sample_names))

    # GC Bias HeatMap
    AverageCoff = np.corrcoef(self.averageBiasHeatMap)
    self.add_section(
        name='GC Bias Heatmap',
        description='Heatmap to display average bias across all samples',
        plot=heatmap.plot(AverageCoff, self.sample_names,
                          self.sample_names))

    # Seq 3' Heatmap
    Seq3HeatMap = np.corrcoef(self.salmon_seq3HeatMap)
    self.add_section(
        name='Sequence 3 Heatmap',
        description=
        'Heatmap to display Sequence 3 prime across all samples',
        plot=heatmap.plot(Seq3HeatMap, self.sample_names,
                          self.sample_names))

    # Seq 5' Heatmap
    Seq5HeatMap = np.corrcoef(self.salmon_seq5HeatMap)
    self.add_section(
        name='Sequence 5 Heatmap',
        description=
        'Heatmap to display Sequence 5 prime across all samples',
        plot=heatmap.plot(Seq5HeatMap, self.sample_names,
                          self.sample_names))
def hops_heatmap(self):
    """Heatmap showing all statuses for every sample"""
    # Numeric codes that the colour stops below correspond to
    heatmap_numbers = {"none": 1, "edit_only": 2, "damage_only": 3, "edit_and_damage": 4}

    samples = [s for s in self.hops_data]

    # Every sample carries the same taxa, so read them off the first sample
    taxa = [t.replace("_", " ") for t in self.hops_data[samples[0]]]

    # One row of status values per sample, as the heatmap plot expects
    levels = [self.hops_data[s].values() for s in samples]

    pconfig = {
        "id": "hops-heatmap",
        "title": "HOPS: Potential Candidates",
        "xTitle": "Node",
        "yTitle": "Sample",
        "min": 0,
        "max": 1,
        "square": False,
        "colstops": [
            [1, "#ededed"],
            [2, "#FFFFC5"],
            [3, "#F2B26C"],
            [4, "#AD2A2B"],
        ],
        "decimalPlaces": 0,
        "legend": False,
        "datalabels": False,
        "xcats_samples": False,
    }

    # Warn about overlapping y-axis labels when there are many samples
    extra_warning = ""
    if len(self.hops_data) > 20:
        extra_warning = """
        <div class="alert alert-warning">
            Large numbers of samples can result in Y-axis labels overlapping. Drag the handle at the bottom of the plot down to expand and see all samples names.
        </div>
        """

    self.add_section(
        name="Potential Candidates",
        anchor="hops_heatmap",
        description="""
        Heatmap of candidate taxa for downstream aDNA analysis, with intensity representing additive categories of possible 'positive' hits.
        """
        + extra_warning,
        helptext="""
        HOPS assigns a category based on how many ancient DNA characteristics a given node (i.e. taxon) in a sample has.
        The colours indicate the following:

        * <span style="background-color: #ededed; padding:0.2rem 1rem;">**Grey**</span> - No characteristics detected
        * <span style="background-color: #FFFFC5; padding:0.2rem 1rem;">**Yellow**</span> - Small edit distance from reference
        * <span style="background-color: #F2B26C; padding:0.2rem 1rem;">**Orange**</span> - Typical aDNA damage pattern
        * <span style="background-color: #AD2a2B; padding:0.2rem 1rem;">**Red**</span> - Small edit distance _and_ aDNA damage pattern

        A red category typically indicates a good candidate for further investigation in downstream analysis.
        """,
        plot=heatmap.plot(levels, xcats=taxa, ycats=samples, pconfig=pconfig),
    )
def add_cc_section(self, c_id, mod):
    """Add one custom-content report section.

    Builds the appropriate plot (or raw HTML content) based on
    mod['config']['plot_type'] and registers it with self.add_section().

    :param c_id: content ID, used as the section anchor and fallback name
    :param mod: dict with 'config' and 'data' keys for this content block
    """
    section_name = mod['config'].get('section_name', c_id.replace('_', ' ').title())
    if section_name == '' or section_name is None:
        section_name = 'Custom Content'

    section_description = mod['config'].get('description', '')

    pconfig = mod['config'].get('pconfig', {})
    if pconfig.get('title') is None:
        pconfig['title'] = section_name

    plot = None
    content = None
    # Look the plot type up once rather than in every branch
    plot_type = mod['config'].get('plot_type')

    # Table
    if plot_type == 'table':
        pconfig['sortRows'] = pconfig.get('sortRows', False)
        headers = mod['config'].get('headers')
        plot = table.plot(mod['data'], headers, pconfig)
        self.write_data_file(
            mod['data'],
            "multiqc_{}".format(section_name.lower().replace(' ', '_')))
    # Bar plot
    elif plot_type == 'bargraph':
        plot = bargraph.plot(mod['data'], mod['config'].get('categories'), pconfig)
    # Line plot
    elif plot_type == 'linegraph':
        plot = linegraph.plot(mod['data'], pconfig)
    # Scatter plot
    elif plot_type == 'scatter':
        plot = scatter.plot(mod['data'], pconfig)
    # Heatmap
    elif plot_type == 'heatmap':
        plot = heatmap.plot(mod['data'], mod['config'].get('xcats'),
                            mod['config'].get('ycats'), pconfig)
    # Beeswarm plot
    elif plot_type == 'beeswarm':
        plot = beeswarm.plot(mod['data'], pconfig)
    # Raw HTML
    elif plot_type == 'html':
        content = mod['data']
    # Raw image file as html
    elif plot_type == 'image':
        content = mod['data']
    # Not supplied — PEP 8: compare to None with `is`, not `==`
    elif plot_type is None:
        log.warning("Plot type not found for content ID '{}'".format(c_id))
    # Not recognised
    else:
        log.warning(
            "Error - custom content plot type '{}' not recognised for content ID {}"
            .format(plot_type, c_id))

    # Don't use exactly the same title / description text as the main module
    if section_name == self.name:
        section_name = None
    if section_description == self.info:
        section_description = ''
    self.add_section(name=section_name, anchor=c_id,
                     description=section_description, plot=plot, content=content)
def add_cc_section(self, c_id, mod):
    """Add one custom-content report section.

    Builds the appropriate plot (or raw HTML content) based on
    mod["config"]["plot_type"] and registers it with self.add_section().

    :param c_id: content ID, used as the section anchor and fallback name
    :param mod: dict with "config" and "data" keys for this content block
    """
    section_name = mod["config"].get("section_name", c_id.replace("_", " ").title())
    if section_name == "" or section_name is None:
        section_name = "Custom Content"

    section_description = mod["config"].get("description", "")

    pconfig = mod["config"].get("pconfig", {})
    if pconfig.get("title") is None:
        pconfig["title"] = section_name

    plot = None
    content = None
    # Look the plot type up once rather than in every branch
    plot_type = mod["config"].get("plot_type")

    # Table
    if plot_type == "table":
        pconfig["sortRows"] = pconfig.get("sortRows", False)
        headers = mod["config"].get("headers")
        plot = table.plot(mod["data"], headers, pconfig)
        self.write_data_file(
            mod["data"],
            "multiqc_{}".format(section_name.lower().replace(" ", "_")))
    # Bar plot
    elif plot_type == "bargraph":
        plot = bargraph.plot(mod["data"], mod["config"].get("categories"), pconfig)
    # Line plot
    elif plot_type == "linegraph":
        plot = linegraph.plot(mod["data"], pconfig)
    # Scatter plot
    elif plot_type == "scatter":
        plot = scatter.plot(mod["data"], pconfig)
    # Heatmap
    elif plot_type == "heatmap":
        plot = heatmap.plot(mod["data"], mod["config"].get("xcats"),
                            mod["config"].get("ycats"), pconfig)
    # Beeswarm plot
    elif plot_type == "beeswarm":
        plot = beeswarm.plot(mod["data"], pconfig)
    # Raw HTML
    elif plot_type == "html":
        content = mod["data"]
    # Raw image file as html
    elif plot_type == "image":
        content = mod["data"]
    # Not supplied — PEP 8: compare to None with `is`, not `==`
    elif plot_type is None:
        log.warning("Plot type not found for content ID '{}'".format(c_id))
    # Not recognised
    else:
        log.warning(
            "Error - custom content plot type '{}' not recognised for content ID {}"
            .format(plot_type, c_id))

    # Don't use exactly the same title / description text as the main module
    if section_name == self.name:
        section_name = None
    if section_description == self.info:
        section_description = ""
    self.add_section(name=section_name, anchor=c_id,
                     description=section_description, plot=plot, content=content)
def hops_heatmap(self):
    """ Heatmap showing all statuses for every sample """
    # Numeric codes matched by the colour stops in pconfig below
    heatmap_numbers = {
        'none': 1,
        'edit_only': 2,
        'damage_only': 3,
        'edit_and_damage': 4
    }

    samples = [sample for sample in self.hops_data]

    # Every sample carries the same taxa, so read them off the first sample
    taxa = [node.replace('_', ' ') for node in self.hops_data[samples[0]]]

    # One row of status values per sample, as the heatmap plot expects
    levels = [self.hops_data[sample].values() for sample in samples]

    pconfig = {
        'id': 'hops-heatmap',
        'title': 'HOPS: Potential Candidates',
        'xTitle': 'Node',
        'yTitle': 'Sample',
        'min': 0,
        'max': 1,
        'square': False,
        'colstops': [
            [1, '#ededed'],
            [2, '#FFFFC5'],
            [3, '#F2B26C'],
            [4, '#AD2A2B'],
        ],
        'decimalPlaces': 0,
        'legend': False,
        'datalabels': False,
        'xcats_samples': False,
    }

    # Warn about overlapping y-axis labels when there are many samples
    extra_warning = ''
    if len(self.hops_data) > 20:
        extra_warning = '''
        <div class="alert alert-warning">
            Large numbers of samples can result in Y-axis labels overlapping. Drag the handle at the bottom of the plot down to expand and see all samples names.
        </div>
        '''

    self.add_section(name='Potential Candidates',
                     anchor='hops_heatmap',
                     description='''
                     Heatmap of candidate taxa for downstream aDNA analysis, with intensity representing additive categories of possible 'positive' hits.
                     ''' + extra_warning,
                     helptext='''
                     HOPS assigns a category based on how many ancient DNA characteristics a given node (i.e. taxon) in a sample has.
                     The colours indicate the following:

                     * <span style="background-color: #ededed; padding:0.2rem 1rem;">**Grey**</span> - No characteristics detected
                     * <span style="background-color: #FFFFC5; padding:0.2rem 1rem;">**Yellow**</span> - Small edit distance from reference
                     * <span style="background-color: #F2B26C; padding:0.2rem 1rem;">**Orange**</span> - Typical aDNA damage pattern
                     * <span style="background-color: #AD2a2B; padding:0.2rem 1rem;">**Red**</span> - Small edit distance _and_ aDNA damage pattern

                     A red category typically indicates a good candidate for further investigation in downstream analysis.
                     ''',
                     plot=heatmap.plot(levels, xcats=taxa, ycats=samples, pconfig=pconfig))
def ssds_heatmap(self):
    """PLOT 3: Heatmap showing the SPoT breakdown by DNA type for every sample."""
    # Fixed ordering of DNA classification types shown along the x axis
    dna_types = [
        "ssDNA", "ssDNA_type2", "dsDNA_hiconf", "dsDNA_loconf", "unclassified"
    ]
    # Short labels used to prefix each column name
    short_dna_type = OrderedDict([
        ("ssDNA", "ss"),
        ("ssDNA_type2", "t2"),
        ("dsDNA_hiconf", "dH"),
        ("dsDNA_loconf", "dL"),
        ("unclassified", "un"),
    ])

    sample_names = sorted(self.SPoT_values["ssDNA"].keys())

    # Collect interval names in first-seen order (per sorted sample), no duplicates
    interval_names = []
    for sample in sample_names:
        for interval in sorted(self.SPoT_values["ssDNA"][sample]):
            if interval not in interval_names:
                interval_names.append(interval)

    # Column labels: one per (DNA type, sample) pair
    s_names = [
        "(" + short_dna_type[dtype] + ")" + sample
        for dtype in dna_types
        for sample in sample_names
    ]

    # One row per interval; a missing value becomes 0
    data = []
    for interval in interval_names:
        row = []
        for dtype in dna_types:
            for sample in sample_names:
                try:
                    row.append(float(self.SPoT_values[dtype][sample][interval]))
                except KeyError:
                    row.append(0)
        data.append(row)

    pconfig = {
        "id": "ssds-spot-heatmap",
        "title": "SSDS: Signal Percentage of Tags (%)",
        "xTitle": "",
        "yTitle": "Interval",
        "square": False,
        "colstops": [
            [0, "#ffffff"],
            [0.001, "#fefce9"],
            [0.50, "#ffc265"],
            [1.00, "#ff6262"],
        ],
        "decimalPlaces": 0,
        "legend": False,
        "datalabels": True,
        "xcats_samples": False,
        "ycats_samples": False,
        "borderWidth": 1,
    }

    self.add_section(
        name="SSDS SPoTs",
        anchor="ssds_spot_heatmap",
        description="""
        Signal Percentage of Tags (SPoT) for all samples (%).
        Colors indicate the value (0 / no data =white; Otherwise, increasing SPoT from yellow to orange to red).
        Intervals annotated as (R) represent the SPoT when the intervals are randomly shuffled in the genome (bedtools shuffle -chrom).
        This provides a naive, but useful estimate of random expectation for a non-enriched library.
        """,
        helptext="""
        The Signal Percentage of Tags (SPoT) represents the percentage of sequencing reads found in a set of genomic intervals.
        Higher numbers indicate that the library was enriched for reads in that location.
        The SSDS report may also contain intervals annotated as (R); these represent the SPoT when the intervals are randomly
        shuffled in the genome (bedtools shuffle -chrom). This represents a reasonable expectation of random overlap, however
        this very simple estimate should be formally validated more robustly.
        """,
        plot=heatmap.plot(data, s_names, interval_names, pconfig),
    )
def top_five_duplication_heatmap(self):
    """Add a heatmap showing the minimizer duplication top-5 species.

    Computes, for the top ``self.top_n`` most abundant species (rank "S")
    across all samples, the minimizer duplication rate per sample and plots
    it as a heatmap. Samples whose reports lack minimizer data (older
    Kraken2 versions) are dropped with a single warning.
    """
    duplication = list()
    pconfig = {
        "id": "kraken-topfive-duplication_plot",
        "title": f"Kraken 2: Top {self.top_n} species duplication",
    }

    rank_code = "S"
    rank_data = dict()
    # Loop through the summed tax percentages to get the top N across all samples.
    # Default to an empty list: previously `sorted_pct` was left unbound when the
    # rank code was missing, crashing the loop below with NameError.
    sorted_pct = []
    try:
        sorted_pct = sorted(self.kraken_total_pct[rank_code].items(),
                            key=lambda x: x[1], reverse=True)
    except KeyError:
        pass  # Taxa rank not found in any sample

    i = 0
    counts_shown = {}
    showed_warning = False
    for classif, pct_sum in sorted_pct:
        i += 1
        if i > self.top_n:
            break
        # Pull out counts for this rank + classif from each sample
        for s_name, d in self.kraken_raw_data.items():
            if s_name not in rank_data:
                rank_data[s_name] = dict()
            if s_name not in counts_shown:
                counts_shown[s_name] = 0
            for row in d:
                if row["rank_code"] == rank_code:
                    if row["classif"] == classif:
                        if classif not in rank_data[s_name]:
                            rank_data[s_name][classif] = 0
                        try:
                            rank_data[s_name][classif] = row["minimizer_duplication"]
                        except KeyError:
                            # Report has no minimizer columns (version mismatch):
                            # drop this sample. pop() with a default is safe even
                            # if it was already removed (plain `del` would raise).
                            rank_data.pop(s_name, None)
                            if not showed_warning:
                                log.warning("Kraken2 reports of different versions were found")
                                showed_warning = True
                            # Stop scanning this sample's rows — further matching
                            # rows would KeyError on the removed rank_data entry
                            break

    # Strip empty samples
    for sample, vals in dict(rank_data).items():
        if len(vals) == 0:
            del rank_data[sample]

    # Nothing usable to plot: bail out instead of crashing on rank_data[ylabels[0]]
    if not rank_data:
        log.debug("No Kraken2 minimizer duplication data to plot")
        return

    # Build data structures for heatmap
    ylabels = list(rank_data.keys())
    xlabels = list(rank_data[ylabels[0]].keys())
    for sample in rank_data:
        duplication.append(list(rank_data[sample].values()))

    self.add_section(
        name="Duplication rate of top species",
        anchor="kraken-duplication-topfive",
        description=
        f"The duplication rate of minimizer falling into the top {self.top_n} species",
        helptext=f"""
        To make this plot, the minimizer duplication rate is computed for the top {self.top_n} most abundant species in all samples.

        The minimizer duplication rate is defined as:
        `duplication rate = (total number of minimizers / number of distinct minimizers)`

        A low coverage and high duplication rate (`>> 1`) is often sign of read stacking, which probably indicates of false positive hit.
        """,
        plot=heatmap.plot(duplication, xlabels, ylabels, pconfig),
    )
def heatmap(self, json, index):
    """Build the HTStream primer-counts heatmap section.

    For every sample in ``json``, constructs a symmetric matrix of primer
    combination counts and renders one heatmap per sample, with buttons to
    switch between them (only the first heatmap's HTML is embedded; the
    other plot calls just register their data with MultiQC).

    :param json: per-sample dict containing 'Pr_Primer_Counts' + index lists
        of [primer_a, primer_b, count] rows (shadows the stdlib json module,
        kept for interface compatibility)
    :param index: suffix appended to the counts key and plot IDs
    :return: the assembled wrapper HTML string
    """
    # Base config dictionary for the heatmaps; "id" is overwritten per sample below
    heat_pconfig = {
        "id": "htstream_primers_bargraph_" + index,
        "title": "HTStream: Primers Heatmap",
        "square": False,
        "datalabels": False,
        "xcats_samples": False,
        "ycats_samples": False,
        "colstops": [[0, "#FFFFFF"], [1, "#1DC802"]],
    }

    # Pseudo-unique suffix for element IDs so multiple runs don't collide.
    # NOTE(review): random() % 1000 equals random() for floats; this relies on
    # the decimal digits of str(float) — confirm this is the intended scheme.
    unique_id = str(random() % 1000)[5:]
    first = True
    button_list = []
    # Guard: if json is empty, return an (empty) wrapper instead of raising
    # UnboundLocalError on heatmap_html below
    heatmap_html = ""

    for key in json.keys():
        # Creates a unique heatmap id that can be queried later by js
        heat_pconfig["id"] = "htstream_primers_" + key + "_" + unique_id + "_heatmap"

        data = []
        labs = []
        counts_list = json[key]["Pr_Primer_Counts" + index]

        # Collect primer labels (all columns except the trailing count)
        for x in range(len(counts_list)):
            temp = counts_list[x]
            labs += temp[:-1]

        # Remove label duplicates.
        # NOTE(review): set() makes the label order non-deterministic between
        # runs — consider sorted(set(labs)) if stable ordering matters.
        labs = list(set(labs))

        # Square matrix of zeros, one row/column per primer label
        data = [[0] * len(labs) for i in range(len(labs))]

        # Fill the matrix symmetrically for each primer combination
        for x in range(len(counts_list)):
            x_pos = labs.index(counts_list[x][0])
            y_pos = labs.index(counts_list[x][1])
            data[x_pos][y_pos] = counts_list[x][-1]
            data[y_pos][x_pos] = counts_list[x][-1]

        # The first sample processed is shown first and marked as active.
        # This step is necessary, otherwise the plot div is not initialized;
        # the additional heatmap.plot calls just add data to MultiQC's
        # internal jsons.
        if first:
            active = "active"  # button is default active
            first = False  # shuts off first gate
            heatmap_html = heatmap.plot(data, labs, labs, heat_pconfig)
        else:
            active = ""  # button is default off
            heatmap.plot(data, labs, labs, heat_pconfig)

        # html div attributes and text
        name = key
        pid = heat_pconfig["id"] + "_btn"
        button_list.append(
            '<button class="btn btn-default btn-sm {a}" onclick="htstream_div_switch(this)" id="{pid}">{n}</button>\n'
            .format(a=active, pid=pid, n=name))

    # Create html for multiple heatmaps
    heatmap_plot = htstream_utils.multi_heatmap_html(button_list, heatmap_html)

    wrapper_html = "<h4> Primers: Primer Counts </h4>"
    wrapper_html += """<p>Heatmap indicating abundance of primer combinations.</p>"""

    # Heatmaps
    wrapper_html += """<div class="mqc_hcplot_plotgroup">"""
    wrapper_html += '<div id="htstream_heat_primers_{u}" class="htstream_fadein">'.format(
        u=unique_id)
    wrapper_html += heatmap_plot + "</div></div>"

    return wrapper_html
def quality_by_cycle(self, json, read):
    """Build the mean-quality linegraph + per-sample quality heatmap section
    for one read.

    This is the most complicated figure in the module: MultiQC has limited
    options for showing multiple heatmaps or switching figure types, so the
    toggle between samples is implemented with javascript button handlers.

    json: parsed HTStream stats keyed by sample name; json[key][read] holds
          "col_names", "row_names", "shape" and a 2D "data" count matrix
          (Q scores x cycles).
    read: stats key (e.g. "st_qualities_1") naming the read to plot.
    Returns the assembled HTML (status banner, line plot, buttons, heatmaps).
    """

    # e.g. "st_qualities_1" -> "qualities 1" for plot titles
    title_read = " ".join(read.split("_")[1:3])

    # config dictionary for mean Q score line graph
    line_config = {
        'smooth_points_sumcounts': False,
        'categories': True,
        'title': "HTStream: Mean Quality by Cycle (" + title_read + ")",
        'xlab': "Cycle",
        'ylab': "Mean Q Score",
    }

    # config dictionary for heatmaps; "id" is filled in per sample below
    heat_pconfig = {
        'id': "",
        'title': "HTStream: Quality by Cycle (" + title_read + ")",
        'yTitle': 'Q Score',
        'xTitle': 'Cycle',
        'square': False,
        'datalabels': False,
        'max': 1.0,
        'colstops': [
            [0, '#FFFFFF'],
            [0.3, '#1DC802'],
            [0.6, '#F3F943'],
            [1, '#E70808'],
        ],
    }

    btn_id = "-".join(read.split("_")[:3]).lower()

    line_data = {}
    status_dict = {}
    first = True
    button_list = []
    heatmap_html = ""  # stays empty if json has no samples (avoids NameError)

    for key in json.keys():

        # per-sample dictionary for the line graph, format {cycle: mean Q}
        line_data[key] = {}

        # creates unique heatmap id that can be queried later by js
        heat_pconfig["id"] = "htstream_" + btn_id + "_" + key + "_heatmap"

        # hoisted: this nested lookup is reused many times below
        read_stats = json[key][read]

        # categorical axis labels; rows reversed so it is easier to cycle through
        x_lab = read_stats["col_names"]
        y_lab = read_stats["row_names"][::-1]

        # shape of the data matrix: (quality scores) x (cycles)
        quality_scores = read_stats["shape"][0]
        cycles = read_stats["shape"][-1]

        total = []  # per-cycle count sums, reused to normalise the heatmap
        num_above_q30 = 0

        # For each cycle: sum the counts across Q scores (for heatmap
        # frequencies) and compute the count-weighted mean Q score (linegraph).
        for pos in range(cycles):
            counts = [score_list[pos] for score_list in read_stats["data"]]
            counts_sum = sum(counts)
            total.append(counts_sum)

            # count at each Q score weighted by the score itself
            # (y_lab[::-1] restores the original row_names order)
            total_score = sum(int(p) * int(s) for p, s in zip(counts, y_lab[::-1]))

            # mean Q score at this cycle
            line_data[key][pos] = total_score / counts_sum

            if line_data[key][pos] > 30:
                num_above_q30 += 1

        # status based on the fraction of cycles with mean Q score above 30
        q30_gate = num_above_q30 / cycles
        if q30_gate < 0.6:
            status_dict[key] = "FAIL"
        elif q30_gate < 0.8:
            status_dict[key] = "QUESTIONABLE"
        else:
            status_dict[key] = "PASS"

        # Heatmap rows, highest Q score first: each cell is the count at
        # (score, cycle) divided by that cycle's total, i.e. a frequency.
        data = [
            [read_stats["data"][score][pos] / total[pos] for pos in range(cycles)]
            for score in range(quality_scores - 1, -1, -1)
        ]

        # Only the first sample's heatmap html is kept (and its button marked
        # active) — the plot div must be initialised exactly once. The later
        # plot() calls still register their data in MultiQC's internal jsons.
        if first:
            active = "active"
            first = False
            heatmap_html = heatmap.plot(data, x_lab, y_lab, heat_pconfig)
        else:
            active = ""
            heatmap.plot(data, x_lab, y_lab, heat_pconfig)

        # html div attributes and text for the sample-switch button
        pid = "htstream_" + btn_id + "_" + key + "_btn"
        button_list.append(
            '<button class="btn btn-default btn-sm {a}" onclick="htstream_div_switch(this)" id="{pid}">{n}</button>\n'
            .format(a=active, pid=pid, n=key))

    status_div = htstream_utils.sample_status(status_dict)
    line_plot = linegraph.plot(line_data, line_config)

    return htstream_utils.qual_by_cycle_html(read, status_div, line_plot, btn_id, button_list, heatmap_html)
def __init__(self):
    """Salmon MultiQC module: parse per-sample meta info and fragment length
    distributions, then (when Salmon was run with GC / sequence bias
    correction) load the bias models and register the corresponding plots."""

    # Initialise the parent object
    super(MultiqcModule, self).__init__(
        name='Salmon',
        anchor='salmon',
        href='http://combine-lab.github.io/salmon/',
        info="is a tool for quantifying the expression of transcripts using RNA-seq data.")

    # Parse meta information (JSON): one entry per sample, keyed by the
    # sample directory name.
    self.salmon_meta = dict()
    self.gc_bias = False
    for f in self.find_log_files('salmon/meta'):
        # Get the s_name from the parent directory
        s_name = os.path.basename(os.path.dirname(f['root']))
        s_name = self.clean_s_name(s_name, f['root'])
        self.salmon_meta[s_name] = json.loads(f['f'])

    # Parse Fragment Length Distribution logs
    self.salmon_fld = dict()
    self.gc_bias_path_list = []   # sample dirs whose run had GC bias correction
    self.seq_bias_path_list = []  # sample dirs whose run had sequence bias correction
    for f in self.find_log_files('salmon/fld'):
        # Get the s_name from the parent directory; only files under a
        # 'libParams' directory are genuine FLD outputs.
        if os.path.basename(f['root']) == 'libParams':
            s_name = os.path.basename(os.path.dirname(f['root']))
            s_name = self.clean_s_name(s_name, f['root'])
            # The FLD file is whitespace-separated counts, one per fragment length.
            parsed = OrderedDict()
            for i, v in enumerate(f['f'].split()):
                parsed[i] = float(v)
            if len(parsed) > 0:
                if s_name in self.salmon_fld:
                    log.debug("Duplicate sample name found! Overwriting: {}".format(s_name))
                self.add_data_source(f, s_name)
                self.salmon_fld[s_name] = parsed
            '''
            Check the meta_info.json file to check whether the salmon tool was
            run with gc bias and sequential bias. If ran with gc_bias then add
            its absolute path to the list of sample paths. Do same thing for
            seq_bias.
            '''
            meta_json_file_path = os.path.join(os.path.dirname(f['root']), 'aux_info', 'meta_info.json')
            gc_bias_base_dir = os.path.dirname(f['root'])
            with open(meta_json_file_path, 'r') as meta_data_file:
                meta_info_data = json.load(meta_data_file)
                self.gc_bias = meta_info_data['gc_bias_correct']
                self.seq_bias = meta_info_data['seq_bias_correct']
            if self.gc_bias:
                self.gc_bias_path_list.append(os.path.abspath(gc_bias_base_dir))
            if self.seq_bias:
                self.seq_bias_path_list.append(os.path.abspath(gc_bias_base_dir))

    # Filter to strip out ignored sample names
    self.salmon_meta = self.ignore_samples(self.salmon_meta)
    self.salmon_fld = self.ignore_samples(self.salmon_fld)

    # NOTE(review): raising UserWarning appears to follow the MultiQC module
    # convention for "no samples found, skip this module" — confirm.
    if len(self.salmon_meta) == 0 and len(self.salmon_fld) == 0:
        raise UserWarning

    if len(self.salmon_meta) > 0:
        log.info("Found {} meta reports".format(len(self.salmon_meta)))
        self.write_data_file(self.salmon_meta, 'multiqc_salmon')
    if len(self.salmon_fld) > 0:
        log.info("Found {} fragment length distributions".format(len(self.salmon_fld)))

    # Add alignment rate to the general stats table
    headers = OrderedDict()
    headers['percent_mapped'] = {
        'title': '% Aligned',
        'description': '% Mapped reads',
        'max': 100,
        'min': 0,
        'suffix': '%',
        'scale': 'YlGn'
    }
    headers['num_mapped'] = {
        'title': 'M Aligned',
        'description': 'Mapped reads (millions)',
        'min': 0,
        'scale': 'PuRd',
        'modify': lambda x: float(x) / 1000000,  # raw read count -> millions
        'shared_key': 'read_count'
    }
    self.general_stats_addcols(self.salmon_meta, headers)

    # Fragment length distribution plot config (the section itself is added
    # at the very end of this method).
    pconfig = {
        'smooth_points': 500,
        'id': 'salmon_plot',
        'title': 'Salmon: Fragment Length Distribution',
        'ylab': 'Fraction',
        'xlab': 'Fragment Length (bp)',
        'ymin': 0,
        'xmin': 0,
        'tt_label': '<b>{point.x:,.0f} bp</b>: {point.y:,.0f}',
    }

    '''
    Iterate over the list of paths where each path has salmon output ran with
    gc_bias. Using the GCModel class's utility functions compute observed
    array, expected array and the weights. Multiply the observed array and
    expected array with the corresponding weights and create a Ordered
    Dictionary, containing the ratio of observed by expected array. Plot that
    Ordered Dict using matplotlib.
    '''
    self.gc_first_model_ratio = dict()
    self.gc_second_model_ratio = dict()
    self.gc_third_model_ratio = dict()
    self.gc_avg_ratio = dict()
    # NOTE(review): seq_three_prime / seq_five_prime are initialised but never
    # written or read in this method — confirm whether they can be removed.
    self.seq_three_prime = dict()
    self.seq_five_prime = dict()
    self.gc_average_data = []
    self.gc_heatmap_labels = []
    self.gc_heatmap_data = []

    for path_var in self.gc_bias_path_list:
        gc_model = GCModel()
        gc_model.from_file(path_var)
        obs_array = gc_model.obs_.tolist()
        exp_array = gc_model.exp_.tolist()
        obs_weights = list(gc_model.obs_weights_)
        exp_weights = list(gc_model.exp_weights_)
        # sample name = second-to-last path component of the sample directory
        self.path_var = path_var.split('/')[-2]
        ratio_dict = dict()
        avg_ratio_dict = OrderedDict()
        # One iteration per model row (the GC model has three rows, used as
        # "first/second/third model" below).
        for i in range(len(obs_array)):
            obs = obs_array[i]
            exp = exp_array[i]
            obs_weight = obs_weights[i]
            exp_weight = exp_weights[i]
            ratio_value = OrderedDict()
            j = 1  # 1-based bin index used as the x-axis key
            for o, e in zip(obs, exp):
                ratio = (o * obs_weight) / (e * exp_weight)
                ratio_value[j] = ratio
                # first hit initialises the bin; later hits accumulate
                try:
                    avg_ratio_dict[j] += ratio
                except:  # NOTE(review): bare except — should be `except KeyError`
                    avg_ratio_dict[j] = ratio
                j += 1
            ratio_dict[i] = ratio_value
        # average the accumulated ratios over the model rows
        for k in list(avg_ratio_dict.keys()):
            avg_ratio_dict[k] /= len(obs_array)
        self.gc_first_model_ratio[self.path_var] = ratio_dict[0]
        self.gc_second_model_ratio[self.path_var] = ratio_dict[1]
        self.gc_third_model_ratio[self.path_var] = ratio_dict[2]
        self.gc_avg_ratio[self.path_var] = avg_ratio_dict
        self.gc_average_data.append(list(avg_ratio_dict.values()))
        self.gc_heatmap_labels.append(self.path_var)

    # Pairwise cosine distance between samples' average GC ratio vectors
    # (full, symmetric matrix for the heatmap).
    for avg_data1 in self.gc_average_data:
        cosine_distance_vector = []
        for avg_data2 in self.gc_average_data:
            cosine_distance_vector.append(spatial.distance.cosine(avg_data1, avg_data2))
        self.gc_heatmap_data.append(cosine_distance_vector)

    # Average each GC model's per-bin ratios across all samples.
    files = list(self.gc_first_model_ratio.keys())
    self.model_ratios = dict()
    firstModelAvg = OrderedDict()
    secondModelAvg = OrderedDict()
    thirdModelAvg = OrderedDict()
    for k in files:
        firstModel = self.gc_first_model_ratio[k]
        secondModel = self.gc_second_model_ratio[k]
        thirdModel = self.gc_third_model_ratio[k]
        for key in list(firstModel.keys()):
            # first sample initialises the bins; later samples accumulate
            try:
                firstModelAvg[key] += firstModel[key]
                secondModelAvg[key] += secondModel[key]
                thirdModelAvg[key] += thirdModel[key]
            except:  # NOTE(review): bare except — should be `except KeyError`
                firstModelAvg[key] = firstModel[key]
                secondModelAvg[key] = secondModel[key]
                thirdModelAvg[key] = thirdModel[key]
    for k in list(firstModelAvg.keys()):
        firstModelAvg[k] = float(firstModelAvg[k] / len(files))
        secondModelAvg[k] = float(secondModelAvg[k] / len(files))
        thirdModelAvg[k] = float(thirdModelAvg[k] / len(files))
    modelAvg = {
        "First Model": firstModelAvg,
        "Second Model": secondModelAvg,
        "Third Model": thirdModelAvg
    }

    '''
    For samples that were run with sequential bias, use the utility functions
    defined in the SeqModel.py class and read the values of the observed and
    expected bias values from the 3' and 5' end. Calculate the ratio of
    observed to expected from each read end for each nucleotide base.
    '''
    # Variable declarations for storing the ratios.
    self.seq_3prime_ratio = dict()
    self.seq_5prime_ratio = dict()
    self.nucleotides = ['A', 'C', 'G', 'T']
    self.seq_3prime_avg_data = []
    self.seq_5prime_avg_data = []

    # Iterate over all samples that were run with sequential bias and read
    # values into the dictionaries.
    for path_var in self.seq_bias_path_list:
        seq_model = SEQModel()
        seq_model.from_file(path_var)
        obs3_array = seq_model.obs3_prime.tolist()
        exp3_array = seq_model.exp3_prime.tolist()
        obs5_array = seq_model.obs5_prime.tolist()
        exp5_array = seq_model.exp5_prime.tolist()
        self.path_var = path_var.split('/')[-2]
        ratio_dict_3prime = dict()
        ratio_dict_5prime = dict()
        # accumulators sized to the per-nucleotide ratio vectors
        avg_3prime_array = [0] * len(obs3_array[0])
        avg_5prime_array = [0] * len(obs5_array[0])
        for i in range(len(self.nucleotides)):
            obs_3prime = obs3_array[i]
            exp_3prime = exp3_array[i]
            obs_5prime = obs5_array[i]
            exp_5prime = exp5_array[i]
            # Ordered dictionaries to store the 3' and 5' end ratios.
            ratio_3prime_dict = OrderedDict()
            ratio_5prime_dict = OrderedDict()
            j = 1
            for o, e in zip(obs_3prime, exp_3prime):
                ratio = o / e
                ratio_3prime_dict[j] = ratio
                avg_3prime_array[j - 1] += ratio
                j += 1
            ratio_dict_3prime[self.nucleotides[i]] = ratio_3prime_dict
            j = 1
            for o, e in zip(obs_5prime, exp_5prime):
                # Calculate observed/expected ratio and add the values to
                # respective dictionary and average array.
                ratio = o / e
                ratio_5prime_dict[j] = ratio
                # NOTE(review): the 3' loop accumulates with `+=`, but this one
                # overwrites with `=` — so the "average" below only reflects
                # the last nucleotide. Looks like a bug; confirm intent.
                avg_5prime_array[j - 1] = ratio
                j += 1
            ratio_dict_5prime[self.nucleotides[i]] = ratio_5prime_dict
        # Calculate the average bias values for each end and store in dictionary
        self.seq_3prime_avg_data.append([x / len(self.nucleotides) for x in avg_3prime_array])
        self.seq_5prime_avg_data.append([x / len(self.nucleotides) for x in avg_5prime_array])
        self.seq_3prime_ratio[self.path_var] = ratio_dict_3prime
        self.seq_5prime_ratio[self.path_var] = ratio_dict_5prime

    # Variables to hold the heatmap data.
    self.seq_3prime_heatmap_data = []
    self.seq_5prime_heatmap_data = []
    # Iterate over the average ratio values for each sample and calculate
    # cosine distance between pairs of samples (one matrix per read end).
    for avg_data1 in self.seq_3prime_avg_data:
        cosine_distance_vector = []
        for avg_data2 in self.seq_3prime_avg_data:
            cosine_distance_vector.append(spatial.distance.cosine(avg_data1, avg_data2))
        self.seq_3prime_heatmap_data.append(cosine_distance_vector)
    for avg_data1 in self.seq_5prime_avg_data:
        cosine_distance_vector = []
        for avg_data2 in self.seq_5prime_avg_data:
            cosine_distance_vector.append(spatial.distance.cosine(avg_data1, avg_data2))
        self.seq_5prime_heatmap_data.append(cosine_distance_vector)
    seq_heat_map_labels = [x.split('/')[-2] for x in self.seq_bias_path_list]

    """
    Dictionary variables to store the ratio values for each nucleotide across
    samples. We plot separate line plots for each nucleotide taken from each
    read end.
    """
    A3_dict = dict()
    C3_dict = dict()
    G3_dict = dict()
    T3_dict = dict()
    A5_dict = dict()
    C5_dict = dict()
    T5_dict = dict()
    G5_dict = dict()
    for k in list(self.seq_3prime_ratio.keys()):
        A3_dict[k] = self.seq_3prime_ratio[k]['A']
        C3_dict[k] = self.seq_3prime_ratio[k]['C']
        G3_dict[k] = self.seq_3prime_ratio[k]['G']
        T3_dict[k] = self.seq_3prime_ratio[k]['T']
    for k in list(self.seq_5prime_ratio.keys()):
        A5_dict[k] = self.seq_5prime_ratio[k]['A']
        C5_dict[k] = self.seq_5prime_ratio[k]['C']
        G5_dict[k] = self.seq_5prime_ratio[k]['G']
        T5_dict[k] = self.seq_5prime_ratio[k]['T']

    # Variables to store the average sequential bias ratios for each
    # nucleotide base across samples.
    A3_avg = dict()
    C3_avg = dict()
    G3_avg = dict()
    T3_avg = dict()
    A5_avg = dict()
    C5_avg = dict()
    T5_avg = dict()
    G5_avg = dict()
    files_count = len(self.seq_3prime_ratio.keys())
    for k in list(self.seq_3prime_ratio.keys()):
        A3 = A3_dict[k]
        C3 = C3_dict[k]
        G3 = G3_dict[k]
        T3 = T3_dict[k]
        A5 = A5_dict[k]
        C5 = C5_dict[k]
        G5 = G5_dict[k]
        T5 = T5_dict[k]
        for key in list(A3.keys()):
            # first sample initialises the bins; later samples accumulate
            try:
                A3_avg[key] += A3[key]
                C3_avg[key] += C3[key]
                G3_avg[key] += G3[key]
                T3_avg[key] += T3[key]
                A5_avg[key] += A5[key]
                C5_avg[key] += C5[key]
                G5_avg[key] += G5[key]
                T5_avg[key] += T5[key]
            except:  # NOTE(review): bare except — should be `except KeyError`
                A3_avg[key] = A3[key]
                C3_avg[key] = C3[key]
                G3_avg[key] = G3[key]
                T3_avg[key] = T3[key]
                A5_avg[key] = A5[key]
                C5_avg[key] = C5[key]
                G5_avg[key] = G5[key]
                T5_avg[key] = T5[key]
    for key in list(A3_avg.keys()):
        A3_avg[key] = A3_avg[key] / files_count
        C3_avg[key] = C3_avg[key] / files_count
        G3_avg[key] = G3_avg[key] / files_count
        T3_avg[key] = T3_avg[key] / files_count
        A5_avg[key] = A5_avg[key] / files_count
        C5_avg[key] = C5_avg[key] / files_count
        G5_avg[key] = G5_avg[key] / files_count
        T5_avg[key] = T5_avg[key] / files_count
    self.seq_bias_avg = {
        "A3": A3_avg,
        "C3": C3_avg,
        "G3": G3_avg,
        "T3": T3_avg,
        "A5": A5_avg,
        "C5": C5_avg,
        "G5": G5_avg,
        "T5": T5_avg
    }

    # Section that contains plot configurations and calls to plot functions.
    # NOTE(review): every config below reuses 'id': 'salmon_plot'; MultiQC
    # plot ids are normally unique per plot — confirm this is intended.
    if self.gc_bias_path_list:
        fconfig = {
            'smooth_points': 500,
            'id': 'salmon_plot',
            'title': 'Salmon: GC Bias Distribution in first model for different samples',
            'ylab': 'Ratio (Observed/Expected)',
            'xlab': 'Read count',
            'ymin': 0,
            'xmin': 0,
            'tt_label': '<b>{point.x:,.0f} bp</b>: {point.y:,.0f}',
        }
        self.add_section(plot=linegraph.plot(self.gc_first_model_ratio, fconfig))
        sconfig = {
            'smooth_points': 500,
            'id': 'salmon_plot',
            'title': 'Salmon: GC Bias Distribution in second model for different samples',
            'ylab': 'Ratio (Observed/Expected)',
            'xlab': 'Read count',
            'ymin': 0,
            'xmin': 0,
            'tt_label': '<b>{point.x:,.0f} bp</b>: {point.y:,.0f}',
        }
        self.add_section(plot=linegraph.plot(self.gc_second_model_ratio, sconfig))
        tconfig = {
            'smooth_points': 500,
            'id': 'salmon_plot',
            'title': 'Salmon: GC Bias Distribution in third model for different samples',
            'ylab': 'Ratio (Observed/Expected)',
            'xlab': 'Read count',
            'ymin': 0,
            'xmin': 0,
            'tt_label': '<b>{point.x:,.0f} bp</b>: {point.y:,.0f}',
        }
        self.add_section(plot=linegraph.plot(self.gc_third_model_ratio, tconfig))
        avgconfig = {
            'smooth_points': 500,
            'id': 'salmon_plot',
            'title': 'Salmon: Avg GC Bias Distribution for across all samples',
            'ylab': 'Average Ratio (Observed/Expected)',
            'xlab': 'Read count',
            'ymin': 0,
            'xmin': 0,
            'tt_label': '<b>{point.x:,.0f} bp</b>: {point.y:,.0f}',
        }
        self.add_section(plot=linegraph.plot(modelAvg, avgconfig))
        gcheatmapconfig = {
            'smooth_points': 500,
            'id': 'salmon_plot',
            'title': 'Average GC Bias similarity',
            'ylab': 'Average Ratio (Observed/Expected)',
            'xlab': 'Read count',
            'ymin': 0,
            'xmin': 0,
            'tt_label': '<b>{point.x:,.0f} bp</b>: {point.y:,.0f}',
        }
        self.add_section(plot=heatmap.plot(self.gc_heatmap_data, self.gc_heatmap_labels, self.gc_heatmap_labels, gcheatmapconfig))

    if self.seq_bias_path_list:
        # One linegraph per nucleotide per read end (3' first, then 5').
        taprimeconfig = {
            'smooth_points': 500,
            'id': 'salmon_plot',
            'title': 'Salmon: Sequence Bias Distribution for different experiments measured from 3\' prime end for nucleotide A',
            'ylab': 'Ratio (Observed/Expected)',
            'xlab': 'Read count',
            'ymin': 0,
            'xmin': 0,
            'tt_label': '<b>{point.x:,.0f} bp</b>: {point.y:,.0f}',
        }
        self.add_section(plot=linegraph.plot(A3_dict, taprimeconfig))
        tcprimeconfig = {
            'smooth_points': 500,
            'id': 'salmon_plot',
            'title': 'Salmon: Sequence Bias Distribution for different experiments measured from 3\' prime end for nucleotide C',
            'ylab': 'Ratio (Observed/Expected)',
            'xlab': 'Read count',
            'ymin': 0,
            'xmin': 0,
            'tt_label': '<b>{point.x:,.0f} bp</b>: {point.y:,.0f}',
        }
        self.add_section(plot=linegraph.plot(C3_dict, tcprimeconfig))
        tgprimeconfig = {
            'smooth_points': 500,
            'id': 'salmon_plot',
            'title': 'Salmon: Sequence Bias Distribution for different experiments measured from 3\' prime end for nucleotide G',
            'ylab': 'Ratio (Observed/Expected)',
            'xlab': 'Read count',
            'ymin': 0,
            'xmin': 0,
            'tt_label': '<b>{point.x:,.0f} bp</b>: {point.y:,.0f}',
        }
        self.add_section(plot=linegraph.plot(G3_dict, tgprimeconfig))
        ttprimeconfig = {
            'smooth_points': 500,
            'id': 'salmon_plot',
            'title': 'Salmon: Sequence Bias Distribution for different experiments measured from 3\' prime end for nucleotide T',
            'ylab': 'Ratio (Observed/Expected)',
            'xlab': 'Read count',
            'ymin': 0,
            'xmin': 0,
            'tt_label': '<b>{point.x:,.0f} bp</b>: {point.y:,.0f}',
        }
        self.add_section(plot=linegraph.plot(T3_dict, ttprimeconfig))
        faprimeconfig = {
            'smooth_points': 500,
            'id': 'salmon_plot',
            'title': 'Salmon: Sequence Bias Distribution for different experiments measured from 5\' end for nucleotide A',
            'ylab': 'Ratio (Observed/Expected)',
            'xlab': 'Read count',
            'ymin': 0,
            'xmin': 0,
            'tt_label': '<b>{point.x:,.0f} bp</b>: {point.y:,.0f}',
        }
        self.add_section(plot=linegraph.plot(A5_dict, faprimeconfig))
        fcprimeconfig = {
            'smooth_points': 500,
            'id': 'salmon_plot',
            'title': 'Salmon: Sequence Bias Distribution for different experiments measured from 5\' end for nucleotide C',
            'ylab': 'Ratio (Observed/Expected)',
            'xlab': 'Read count',
            'ymin': 0,
            'xmin': 0,
            'tt_label': '<b>{point.x:,.0f} bp</b>: {point.y:,.0f}',
        }
        self.add_section(plot=linegraph.plot(C5_dict, fcprimeconfig))
        fgprimeconfig = {
            'smooth_points': 500,
            'id': 'salmon_plot',
            'title': 'Salmon: Sequence Bias Distribution for different experiments measured from 5\' end for nucleotide G',
            'ylab': 'Ratio (Observed/Expected)',
            'xlab': 'Read count',
            'ymin': 0,
            'xmin': 0,
            'tt_label': '<b>{point.x:,.0f} bp</b>: {point.y:,.0f}',
        }
        self.add_section(plot=linegraph.plot(G5_dict, fgprimeconfig))
        ftprimeconfig = {
            'smooth_points': 500,
            'id': 'salmon_plot',
            'title': 'Salmon: Sequence Bias Distribution for different experiments measured from 5\' end for nucleotide T',
            'ylab': 'Ratio (Observed/Expected)',
            'xlab': 'Read count',
            'ymin': 0,
            'xmin': 0,
            'tt_label': '<b>{point.x:,.0f} bp</b>: {point.y:,.0f}',
        }
        self.add_section(plot=linegraph.plot(T5_dict, ftprimeconfig))
        seqavgconfig = {
            'smooth_points': 500,
            'id': 'salmon_plot',
            'title': 'Salmon: Avg Sequential Bias for each base across all samples for both 3\' and 5\' ends',
            'ylab': 'Average Ratio (Observed/Expected)',
            'xlab': 'Read count',
            'ymin': 0,
            'xmin': 0,
            'tt_label': '<b>{point.x:,.0f} bp</b>: {point.y:,.0f}',
        }
        self.add_section(plot=linegraph.plot(self.seq_bias_avg, seqavgconfig))
        seq3primeheatmappconfig = {
            'smooth_points': 500,
            'id': 'salmon_plot',
            'title': 'Average Sequential Bias (3 Prime) similarity',
            'ylab': 'Average Ratio (Observed/Expected)',
            'xlab': 'Read count',
            'ymin': 0,
            'xmin': 0,
            'tt_label': '<b>{point.x:,.0f} bp</b>: {point.y:,.0f}',
        }
        self.add_section(plot=heatmap.plot(self.seq_3prime_heatmap_data, seq_heat_map_labels, seq_heat_map_labels, seq3primeheatmappconfig))
        seq5sprimeheatmappconfig = {
            'smooth_points': 500,
            'id': 'salmon_plot',
            'title': 'Average Sequential Bias (5 Prime) similarity',
            'ylab': 'Average Ratio (Observed/Expected)',
            'xlab': 'Read count',
            'ymin': 0,
            'xmin': 0,
            'tt_label': '<b>{point.x:,.0f} bp</b>: {point.y:,.0f}',
        }
        self.add_section(plot=heatmap.plot(self.seq_5prime_heatmap_data, seq_heat_map_labels, seq_heat_map_labels, seq5sprimeheatmappconfig))

    # Fragment length distribution section (always added last).
    self.add_section(plot=linegraph.plot(self.salmon_fld, pconfig))