def regions_of_interest(self):
    """ Append the "Regions Of Interest (ROI)" section to ``self.sections``.

    The low and high coverage regions found between ``self.start`` and
    ``self.stop`` are each rendered as a table. Both tables are built on
    ``self.datatable``, whose javascript function is placed first in the
    section content.
    """
    window = [self.start, self.stop]
    low_regions = self.rois.get_low_rois(window)
    high_regions = self.rois.get_high_rois(window)

    javascript = self.datatable.create_javascript_function()
    low_table = DataTable(low_regions, "lroi", self.datatable)
    high_table = DataTable(high_regions, "hroi", self.datatable)
    low_html = low_table.create_datatable(float_format='%.3g')
    high_html = high_table.create_datatable(float_format='%.3g')

    # {0} receives "low" or "high"; the template appends "er" to obtain
    # "lower" / "higher".
    paragraph_template = (
        "<p>Regions with a z-score {0}er than {1:.2f} and at "
        "least one base with a z-score {0}er than {2:.2f} are detected as "
        "{0} coverage region. Thus, there are {3} {0} coverage regions "
        "between the position {4} and the position {5}</p>")

    def describe(kind, outer_threshold, inner_threshold, regions):
        # One sentence summarising how many regions of that kind were found.
        return paragraph_template.format(kind, outer_threshold,
                                         inner_threshold, len(regions),
                                         self.start, self.stop)

    low_paragraph = describe("low", self.chromosome.thresholds.low2,
                             self.chromosome.thresholds.low, low_regions)
    high_paragraph = describe("high", self.chromosome.thresholds.high2,
                              self.chromosome.thresholds.high, high_regions)

    content_template = (
        "{4}\n"
        "<p>Running median is the median computed along the genome "
        "using a sliding window. The following tables give regions of "
        "interest detected by sequana. Here are some definition of the "
        "table's columns:</p>\n"
        "<ul><li><b>mean_cov</b>: the average of coverage</li>\n"
        "<li><b>mean_rm</b>: the average of running median</li>\n"
        "<li><b>mean_zscore</b>: the average of zscore</li>\n"
        "<li><b>max_zscore</b>: the higher zscore contains in the "
        "region</li></ul>\n"
        "<h3>Low coverage region</h3>\n{0}\n{1}\n"
        "<h3>High coverage region</h3>\n{2}\n{3}\n")
    section = {
        "name": "Regions Of Interest (ROI)",
        "anchor": "roi",
        "content": content_template.format(low_paragraph, low_html,
                                           high_paragraph, high_html,
                                           javascript),
    }
    self.sections.append(section)
def df2html(df, name=None, dom='Brt', show_index=False, pageLength=15):
    """Simple wrapper to create HTML from dataframe

    If a columns ends in _links and a name_links exists, then the columns name
    will be shown with the clickable name_links.

    :param df: dataframe to render.
    :param name: identifier given to the DataTable. When None, an identifier
        made of letters only is generated.
    :param dom: value of the 'dom' datatable option.
    :param show_index: forwarded as the ``index`` argument of DataTable.
    :param pageLength: value of the 'pageLength' datatable option.
    :return: the javascript function immediately followed by the HTML table.
    """
    if name is None:
        # looks like datatable does not like ID made of numbers, even in string
        # so we convert to ABCDEFGH values
        digits = str(uuid.uuid1().time_low)
        name = "".join(chr(65 + int(digit)) for digit in digits)

    datatable = DataTable(df, name, index=show_index)
    datatable.datatable.datatable_options = {
        'pageLength': pageLength,
        'scrollCollapse': 'false',
        'dom': dom,
        'buttons': ['copy', 'csv']
    }

    # identify links (columns ending in _links)
    suffix = '_links'
    for column in df.columns:
        if not column.endswith(suffix):
            continue
        # Strip the trailing suffix only: str.replace would also remove a
        # '_links' located in the middle of the column name and then pair the
        # link column with the wrong target.
        prefix = column[:-len(suffix)]
        if prefix in df.columns:
            datatable.datatable.set_links_to_column(column, prefix)

    js = datatable.create_javascript_function()
    html = datatable.create_datatable(float_format='%.6g')
    return js + html
def variant_calling(self):
    """ Append the "Variants Detected" section rendered from ``self.df``. """
    table = DataTable(self.df, 'vc')

    # set options
    table_options = {
        'scrollX': 'true',
        'pageLength': 30,
        'scrollCollapse': 'true',
        'dom': 'Bfrtip',
        'buttons': ['copy', 'csv']
    }
    table.datatable.datatable_options = table_options

    javascript = table.create_javascript_function()
    table_html = table.create_datatable(float_format='%.3f')

    content_template = (
        "<p>This table present variant detected by freebayes after "
        "filtering.</p>\n{0}\n{1}\n<p>Note: the freebayes score can be"
        " understood as 1 - P(locus is homozygous given the data)</p>")
    section = {
        'name': "Variants Detected",
        'anchor': 'basic_stats',
        'content': content_template.format(javascript, table_html),
    }
    self.sections.append(section)
def get_table_dependencies(self):
    """ Return dependencies of Sequana.

    The result is the javascript and HTML of a table with one row per
    dependency (package, version and a PyPI link), ordered by
    case-insensitive package name. An empty string is returned when no
    dependency is reported.
    """
    dependencies = easydev.get_dependencies('sequana')
    # if installed with conda, this will be empty
    if not len(dependencies):
        return ""

    pypi = 'https://pypi.python.org/pypi/{0}'
    df = pd.DataFrame({
        'package': [dep.project_name for dep in dependencies],
        'version': [dep.version for dep in dependencies],
        'link': [pypi.format(dep.project_name) for dep in dependencies],
    })

    # Sort through a temporary lower-cased column so that the ordering
    # ignores case, then discard that column.
    df = df.assign(sort=df['package'].str.lower())
    df = df.sort_values(by='sort', axis=0)
    df = df.drop('sort', axis=1)

    table = DataTable(df, 'dep')
    table.datatable.datatable_options = {
        'paging': 'false',
        'bFilter': 'false',
        'bInfo': 'false',
        'bSort': 'false'
    }
    table.datatable.set_links_to_column('link', 'package')

    javascript = table.create_javascript_function()
    table_html = table.create_datatable()
    return javascript + '\n' + table_html
def get_table_dependencies(self):
    """ Return dependencies of Sequana.

    Builds a table (javascript + HTML) listing each dependency with its
    version and a PyPI link, sorted by package name regardless of case.
    Returns an empty string when the dependency list is empty.
    """
    found = easydev.get_dependencies('sequana')
    # if installed with conda, this will be empty
    if not len(found):
        return ""

    url_template = 'https://pypi.python.org/pypi/{0}'
    packages = [dep.project_name for dep in found]
    versions = [dep.version for dep in found]
    urls = [url_template.format(dep.project_name) for dep in found]
    df = pd.DataFrame({'package': packages, 'version': versions, 'link': urls})

    # The helper column only exists to get a case-insensitive ordering.
    df = df.assign(sort=df['package'].str.lower())
    df = df.sort_values(by='sort', axis=0).drop('sort', axis=1)

    table = DataTable(df, 'dep')
    table.datatable.datatable_options = {'paging': 'false',
                                         'bFilter': 'false',
                                         'bInfo': 'false',
                                         'bSort': 'false'}
    table.datatable.set_links_to_column('link', 'package')

    javascript = table.create_javascript_function()
    table_html = table.create_datatable()
    return javascript + '\n' + table_html
def _get_html_stats_section(self):
    """ Return the explanatory text followed by the mapping statistics table.

    The table is rendered from ``self._get_stats()`` and its javascript is
    appended after the HTML table.
    """
    stats = self._get_stats()
    table = DataTable(stats, "phix_stats", index=True)
    table.datatable.datatable_options = {
        'scrollX': '300px',
        'pageLength': 15,
        'scrollCollapse': 'true',
        'dom': 'tpB',
        "paging": "false",
        'buttons': ['copy', 'csv']}
    javascript = table.create_javascript_function()

    # Important that the columns of type integer are indeed in integer type
    # otherwise the %.3g herebelow would round integers. For instance 123456
    # would appear as 123000. The dtypes must be taken care in _get_stats()
    # method
    table_html = table.create_datatable(float_format='%.3g')

    description = (
        "<p>We mapped the raw reads on a reference (see config file). The reads "
        "mapped are removed and the unmapped reads are kept for further cleaning "
        "(adapter removal). Here below are some statistics about the mapped and "
        "unmapped reads. </p><p> The A, C, G, T, N columns report the percentage "
        "of each bases in the overall sequences. The GC content column is in "
        "percentage. Finally, note that for paired data, the number of reads in "
        "the mapped files (R1 and R2) may differ due to . However, the unmapped "
        "reads must agree. </p>")
    return description + "{} {}".format(table_html, javascript)
def add_table(self):
    """ Append a "Table" section showing ``self.summary`` with linked values.

    A copy of the summary is given a single 'data' column name and a 'url'
    column; the 'url' column is registered as the link of the 'data' column.
    """
    frame = self.summary.copy()
    frame.columns = ['data']
    frame['url'] = ['http://sequana.readthedocs.org'] * len(frame)

    summary_table = DataTable(frame, "table", index=True)
    summary_table.datatable.datatable_options = {
        'scrollX': '300px',
        'pageLength': 15,
        'scrollCollapse': 'true',
        'dom': 'tB',
        "paging": "false",
        'buttons': ['copy', 'csv']
    }
    summary_table.datatable.set_links_to_column('url', 'data')

    javascript = summary_table.create_javascript_function()
    table_html = summary_table.create_datatable(float_format='%.3g')

    section = {
        "name": "Table",
        "anchor": "table",
        "content": "{} {}".format(table_html, javascript),
    }
    self.sections.append(section)
def summary(self):
    """ Add information of filter.

    Two summaries are requested from ``self.rnadiff``: one with its current
    settings and one after setting ``log2_fc`` to 1 (the attribute is left
    at 1 afterwards). Both are stacked in a single table whose comparison
    names link to the "#<name>_table_all" / "#<name>_table_sign" anchors.
    """
    unfiltered = self.rnadiff.summary()
    self.rnadiff.log2_fc = 1
    filtered = self.rnadiff.summary()

    # set options
    table_options = {
        'scrollX': 'true',
        'pageLength': 20,
        'scrollCollapse': 'true',
        'dom': '',
        'buttons': []
    }

    stacked = pd.concat([unfiltered, filtered])
    count = len(unfiltered)

    descriptions = ['Number of DGE (any FC)'] * count
    descriptions += ['Number of DGE (|FC| > 1)'] * count
    anchors = [f"#{name}_table_all" for name in unfiltered.index]
    anchors += [f"#{name}_table_sign" for name in unfiltered.index]

    # Columns are declared directly in their display order.
    df = pd.DataFrame({
        'comparison': stacked.index.values,
        'Description': descriptions,
        'Down': stacked['down'].values,
        'Up': stacked['up'].values,
        'Total': stacked['all'].values,
        'comparison_link': anchors,
    })

    table = DataTable(df, 'dge')
    table.datatable.set_links_to_column('comparison_link', 'comparison',
                                        new_page=False)
    table.datatable.datatable_options = table_options
    javascript = table.create_javascript_function()
    table_html = table.create_datatable(float_format='%d')

    content = (
        "<p>Here below is a summary of thfinal Differententially Gene "
        "Expression (DGE) analysis. You can find two entries per comparison. "
        "The first one has no filter except for an adjusted p-value of 0.05. "
        "The second shows the expressed genes with a filter of the log2 fold "
        "change of 1 (factor 2 in a normal scale). Clicking on any of the "
        "link will lead you to section of the comparison. "
        f"{javascript} {table_html} </p>")
    self.sections.append({
        'name': "Summary",
        'anchor': 'filters_option',
        'content': content,
    })
def _get_html_stats(self):
    """ Return a short HTML summary of the bwa_mem mapping statistics.

    The values are read from ``bwa_mem_stats.json`` located in
    ``self.directory`` (the directory string is concatenated as is, so it is
    expected to end with a path separator).

    :return: HTML string with the contamination percentage, the unpaired
        count and the duplicated count, each followed by ``<br>``.
    """
    from sequana.tools import StatsBAM2Mapped
    from easydev import precision

    data = StatsBAM2Mapped(self.directory + "bwa_mem_stats.json").data

    # A mapped/unmapped DataTable used to be built here, but the line that
    # inserted it into the output was commented out, so the table was
    # computed and then thrown away. The dead computation has been removed;
    # the returned HTML is unchanged.
    html = "Reads with Phix: %s %%<br>" % precision(data['contamination'], 3)
    html += "Unpaired: %s <br>" % data['unpaired']
    html += "duplicated: %s <br>" % data['duplicated']
    return html
def _get_html_stats_section(self):
    """ Return the description of the mapping step followed by its table.

    The statistics come from ``self._get_stats()``; the javascript of the
    table is appended after the HTML table.
    """
    table = DataTable(self._get_stats(), "phix_stats", index=True)
    table_options = {
        'scrollX': '300px',
        'pageLength': 30,
        'scrollCollapse': 'true',
        'dom': 'tpB',
        "paging": "false",
        'buttons': ['copy', 'csv']
    }
    table.datatable.datatable_options = table_options
    javascript = table.create_javascript_function()

    # Important that the columns of type integer are indeed in integer type
    # otherwise the %.3g herebelow would round integers. For instance 123456
    # would appear as 123000. The dtypes must be taken care in _get_stats()
    # method
    table_html = table.create_datatable(float_format='%.3g')

    parts = [
        "<p>We mapped the raw reads on a reference (see config file). The reads "
        "mapped are removed and the unmapped reads are kept for further cleaning "
        "(adapter removal). Here below are some statistics about the mapped and "
        "unmapped reads. </p><p> The A, C, G, T, N columns report the percentage "
        "of each bases in the overall sequences. The GC content column is in "
        "percentage. Finally, note that for paired data, the number of reads in "
        "the mapped files (R1 and R2) may differ due to . However, the unmapped "
        "reads must agree. </p>",
        "{} {}".format(table_html, javascript),
    ]
    return "".join(parts)
def add_main_section(self):
    """ Append the "FastQC report(s)" section listing links to the reports.

    Files matching ``self.pattern`` are listed in a two-column table: the
    report name (file name up to '.html') and the link (the path with its
    first component removed). Rows are sorted by report name.

    :raises IndexError: if a matched path contains no '/' or no ``os.sep``.
    """
    links = glob.glob("{}".format(self.pattern))
    names = [filename.rsplit('/', 1)[1].split('.html')[0]
             for filename in links]

    df = pd.DataFrame({
        "names": names,
        "links": [link.split(os.sep, 1)[1] for link in links]
    })
    # sort_values returns a new frame; the result used to be discarded, so
    # the rows were never actually sorted.
    df = df.sort_values(by='names')

    datatable = DataTable(df, "fastqc", index=False)
    datatable.datatable.set_links_to_column("links", "names")
    datatable.datatable.datatable_options = {
        'scrollX': '300px',
        'pageLength': 15,
        'scrollCollapse': 'true',
        'dom': 'rtpB',
        "paging": "false",
        'buttons': ['copy', 'csv']}
    js = datatable.create_javascript_function()
    html_tab = datatable.create_datatable()
    html = "{} {}".format(html_tab, js)

    self.sections.append({
        "name": "FastQC report(s)",
        "anchor": "fastqc",
        "content": "<p> Here below are link(s) to original FastQC report. "
                   "Please click on one of the links to jump to the main "
                   "report. {} </p>".format(html)
    })
def add_adapters_section(self):
    """ Append the "Adapters" section: one table row per adapter.

    Rows are read from ``self.data['adapters']`` (each entry provides a
    'name' and an 'info' mapping with 'Length', 'Trimmed', 'Type' and
    'Sequence') and sorted by decreasing 'Trimmed' count.
    ``self.jinja['adapters']`` is reset to an empty string.
    """
    # Create a Table with adapters
    df = pd.DataFrame({'Length': [], 'Trimmed': [], 'Type': [],
                       'Sequence': []})

    for adapter in self.data['adapters']:
        info = adapter['info']
        # .loc replaces the .ix indexer, which was removed in pandas 1.0.
        df.loc[adapter['name']] = [info['Length'], info['Trimmed'],
                                   info['Type'], info['Sequence']]
    df.columns = ['Length', 'Trimmed', 'Type', 'Sequence']

    def _trimmed_count(value):
        # String values carry a "times." or "times" suffix; anything that is
        # not a string is left untouched.
        if not isinstance(value, str):
            return value
        return int(value.replace("times.", "").replace("times", ""))

    try:
        df['Trimmed'] = df.Trimmed.map(_trimmed_count)
    except ValueError:
        # Unexpected text: keep the column as read rather than abort the
        # whole report.
        pass

    # df.to_json(self.sample_name + "/cutadapt/cutadapt_stats2.json")
    df.sort_values(by="Trimmed", ascending=False, inplace=True)

    datatable = DataTable(df, "adapters", index=True)
    datatable.datatable.datatable_options = {
        'scrollX': 'true',
        'pageLength': 15,
        'scrollCollapse': 'true',
        'dom': 'frtipB',
        'buttons': ['copy', 'csv']}
    js = datatable.create_javascript_function()
    html_tab = datatable.create_datatable(float_format='%.3g')

    self.jinja['adapters'] = ""
    self.sections.append({
        "name": "Adapters",
        "anchor": "adapters",
        "content": "<p>{} {}</p>".format(html_tab, js)
    })
def chromosome_table(self, html_list):
    """ Create table with links to chromosome reports

    Each chromosome of ``self.bed.chr_list`` is paired, in order, with one
    entry of ``html_list`` used as its link.
    """
    rows = []
    for chrom, page in zip(self.bed.chr_list, html_list):
        rows.append([
            chrom.chrom_name,
            chrom.get_size(),
            chrom.get_mean_cov(),
            chrom.get_var_coef(),
            page,
        ])
    header = ["chromosome", "size", "mean_coverage", "coef_variation", "link"]
    df = pd.DataFrame(rows, columns=header)

    table = DataTable(df, 'chrom')
    table.datatable.datatable_options = {
        'pageLength': 15,
        'dom': 'Bfrtip',
        'buttons': ['copy', 'csv']
    }
    table.datatable.set_links_to_column('link', 'chromosome')

    javascript = table.create_javascript_function()
    table_html = table.create_datatable(float_format='%.3g')

    content_template = (
        "<p>Link to coverage analysis report for each chromosome. "
        "Size, mean coverage and coefficient of variation are reported"
        " in the table below.</p>\n{0}\n{1}")
    self.sections.append({
        "name": "Chromosomes",
        "anchor": "chromosomes",
        "content": content_template.format(javascript, table_html),
    })
def _get_summary_section(self):
    """ Return the HTML of the taxonomy summary.

    The output is made of an introduction, an embedded image wrapped in a
    link to ``kraken.html`` and the table of ``self._get_stats()``. When the
    statistics hold a single row whose 'taxon' is -1, a "no data" image and
    message are used instead of the kraken pie chart.
    """
    df = self._get_stats()

    nothing_classified = len(df) == 1 and df.iloc[0]['taxon'] == -1
    if nothing_classified:
        pngimage = sequana_data("no_data.jpg")
        extra = "<p> no reads could be identified with the given the database(s)."
    else:
        pngimage = self.directory + os.sep + "kraken.png"
        extra = (
            "<p>The following <b>clickable image</b> is a simplified version "
            "(only genus are shown) of an interactive and more detailled "
            "version based on Krona. Finally, note that the unclassified "
            "species in the pie plot may correspond to species not present "
            "in the data base or adapters (if not removed).</p>")

    template = (
        " <p>Overview of the Taxonomic content of the filtered reads. </p> "
        "<p>The taxonomic analysis is performed with Kraken (see database name in "
        "the configuration file. The analysis is performed with a Kmer approach. "
        "The details about the database itself are available in the "
        '<a href="http://sequana.readthedocs.io">Sequana documentation</a>. '
        "The taxonomic analysis should give a good idea of the content of the "
        "FastQ files but should be used as a sanity check. Indeed, species absent "
        "from the database won't be detected leading to false detection (close "
        "species may be detected instead). Besides, be aware that closely related "
        "species may not be classified precisely. \n"
        '</p> {0} <div style="text-align:center"><a href="./{1}/kraken.html"> '
        "{2} </a></div> <br> ")
    html = template.format(extra,
                           self.directory.split(os.sep, 1)[1],
                           self.png_to_embedded_png(pngimage))

    table = DataTable(df, "kraken", index=False)
    # add links
    if "ena" in df.columns:
        urlena = "http://www.ebi.ac.uk/ena/data/view/"
        ena_urls = [urlena + this for this in df['ena']]
        table.datatable.set_links_to_column("ena", ena_urls)

    table.datatable.datatable_options = {
        'scrollX': '300px',
        'pageLength': 30,
        'scrollCollapse': 'true',
        'dom': 'irtpB',
        "paging": "false",
        "order": [[2, "desc"]],
        'buttons': ['copy', 'csv']
    }
    javascript = table.create_javascript_function()
    table_html = table.create_datatable(float_format='%.3g')
    html += "{} {}".format(table_html, javascript)

    # Disabled code kept for reference:
    # # Rounding and convert in string to avoid exp notation
    # df['percentage'] = df['percentage'].apply(lambda x: str(round(x,4)))
    # #self.jinja['kraken_json'] = df.to_json()
    return html
def get_html_table(self, user_key_list):
    """ Return the javascript and HTML table of ``get_single_data(user_key_list)``. """
    table = DataTable(self.get_single_data(user_key_list), 'name')
    table_options = {
        'pageLength': 15,
        'scrollCollapse': 'false',
        'dom': 'Brt',
        'buttons': ['copy', 'csv']
    }
    table.datatable.datatable_options = table_options

    javascript = table.create_javascript_function()
    table_html = table.create_datatable(float_format='%.6g')
    return javascript + table_html
def _get_stats_section(self, tablename="stats"):
    """ Return the HTML of the basic statistics table and quality figure(s).

    ``self.df_stats`` is refreshed from ``self.get_stats()``. The first
    "*boxplot.png" file is always shown; a second figure is added only when
    exactly two files are found. Each figure links to the FastQC report
    derived from the image file name.
    """
    self.df_stats = self.get_stats()
    boxplots, _ = self._get_files("*boxplot.png")

    table = DataTable(self.df_stats, tablename, index=True)
    table.datatable.datatable_options = {
        'scrollX': '300px',
        'pageLength': 15,
        'scrollCollapse': 'true',
        'dom': 'rtpB',
        "paging": "false",
        'buttons': ['copy', 'csv']
    }
    javascript = table.create_javascript_function()
    table_html = table.create_datatable(float_format='%.3g')

    pieces = [
        ("<p>The following table gives some basic statistics about the data "
         "before any filtering. The A, C, G, T, N columns report the "
         "percentage of each bases in the overall sequences. The GC content "
         "is provided in percentage as well. </p> <div>{} {}</div> "
         "<div>").format(table_html, javascript),
        (" <p>The following figure(s) gives the average quality (red line) "
         "of raw reads (500,000 at max). The x-axis being the length of the "
         "reads. The yellow enveloppe gives the variation of the quality "
         "(1 standard deviation).</p> <p> Click on the image to jump to a "
         "full FastQC report.</p>"),
    ]

    paired = len(boxplots) == 2
    width = "49" if paired else "65"

    def report_link(png_path):
        # FastQC report sharing the base name of the boxplot image.
        report = os.path.split(png_path)[1].replace(
            "_boxplot.png", "_fastqc.html")
        return self.path_to_fastqc + os.sep + report

    href = report_link(boxplots[0])
    pieces.append(
        (' <figure style="float:left; width:{}%; padding:0px; margin:0px;"> '
         '<a href="{}">{}</a> '
         '<figcaption style="font-style:italic">Fig1: R1 reads</figcaption> '
         '</figure>').format(width, href,
                             self.png_to_embedded_png(boxplots[0])))

    if paired:
        href = report_link(boxplots[1])
        pieces.append(
            (' <figure style="float:right; width:{}%; padding:0px; margin:0px;"> '
             '<a href="{}">{}</a> '
             '<figcaption style="font-style:italic">Fig2: R2 reads</figcaption> '
             '</figure>').format(width, href,
                                 self.png_to_embedded_png(boxplots[1])))
    return "".join(pieces)
def _get_stats_section(self, tablename="stats"):
    """ Return the HTML of the statistics table followed by the quality plots.

    ``self.df_stats`` is set from ``self.get_stats()``. One figure is shown
    for the first "*boxplot.png" file and a second one only when exactly two
    files are found; each figure links to the matching FastQC report.
    """
    self.df_stats = self.get_stats()
    images, _ = self._get_files("*boxplot.png")

    table = DataTable(self.df_stats, tablename, index=True)
    table_options = {
        'scrollX': '300px',
        'pageLength': 15,
        'scrollCollapse': 'true',
        'dom': 'rtpB',
        "paging": "false",
        'buttons': ['copy', 'csv']}
    table.datatable.datatable_options = table_options
    javascript = table.create_javascript_function()
    table_html = table.create_datatable(float_format='%.3g')

    table_block = (
        "<p>The following table gives some basic statistics about the data "
        "before any filtering. The A, C, G, T, N columns report the "
        "percentage of each bases in the overall sequences. The GC content "
        "is provided in percentage as well. </p> <div>{} {}</div> <div>")
    figure_intro = (
        " <p>The following figure(s) gives the average quality (red line) "
        "of raw reads (500,000 at max). The x-axis being the length of the "
        "reads. The yellow enveloppe gives the variation of the quality "
        "(1 standard deviation).</p> <p> Click on the image to jump to a "
        "full FastQC report.</p>")
    first_figure = (
        ' <figure style="float:left; width:{}%; padding:0px; margin:0px;"> '
        '<a href="{}">{}</a> '
        '<figcaption style="font-style:italic">Fig1: R1 reads</figcaption> '
        '</figure>')
    second_figure = (
        ' <figure style="float:right; width:{}%; padding:0px; margin:0px;"> '
        '<a href="{}">{}</a> '
        '<figcaption style="font-style:italic">Fig2: R2 reads</figcaption> '
        '</figure>')

    html = table_block.format(table_html, javascript) + figure_intro

    two_files = len(images) == 2
    if two_files:
        width = "49"
    else:
        width = "65"

    # The report name is the image name with its suffix swapped.
    report = os.path.split(images[0])[1].replace(
        "_boxplot.png", "_fastqc.html")
    href = self.path_to_fastqc + os.sep + report
    html += first_figure.format(width, href,
                                self.png_to_embedded_png(images[0]))

    if two_files:
        report = os.path.split(images[1])[1].replace(
            "_boxplot.png", "_fastqc.html")
        href = self.path_to_fastqc + os.sep + report
        html += second_figure.format(width, href,
                                     self.png_to_embedded_png(images[1]))
    return html
def test_datatables():
    """ Smoke test: render a DataTable from the ROIs of the first chromosome. """
    coverage_file = sequana_data("JB409847.cov.csv")
    genbank_file = sequana_data("JB409847.gbk")
    genome = bedtools.GenomeCov(coverage_file, genbank_file)

    regions = genome[0].get_roi()
    regions.df['link'] = 'test'

    js_function = DataTableFunction(regions.df, 'roi')
    js_function.set_links_to_column('link', 'start')
    js_function.datatable_options = {
        'scrollX': 'true',
        'pageLength': 15,
        'scrollCollapse': 'true',
        'dom': 'Bfrtip',
        'buttons': ['copy', 'csv']
    }

    # The test only checks that rendering runs; the output is not inspected.
    table = DataTable(regions.df, 'rois', js_function)
    html_table = table.create_datatable(float_format='%.3g')
def _get_stat_section(self):
    """ Return a sentence describing the statistics followed by their table. """
    table = DataTable(self._get_stats(), "cutadapt", index=True)
    table_options = {
        'scrollX': '300px',
        'pageLength': 15,
        'scrollCollapse': 'true',
        'dom': 'rtpB',
        "paging": "false",
        'buttons': ['copy', 'csv']}
    table.datatable.datatable_options = table_options
    javascript = table.create_javascript_function()
    table_html = table.create_datatable(float_format='%.3g')

    #csv_link = self.create_link('link', self.filename)
    #vcf_link = self.create_link('here', 'test.vcf')
    description = ("Reads statistics after trimming and adapter removal. The "
                   "A, C, G, T, N columns report the percentage of each bases in "
                   "the overall sequences")
    return description + "<p>{} {}</p>".format(table_html, javascript)
def _get_stat_section(self):
    """ Return the introduction text and the table of ``self._get_stats()``. """
    stats_table = DataTable(self._get_stats(), "cutadapt", index=True)
    stats_table.datatable.datatable_options = {
        'scrollX': '300px',
        'pageLength': 30,
        'scrollCollapse': 'true',
        'dom': 'rtpB',
        "paging": "false",
        'buttons': ['copy', 'csv']}
    javascript = stats_table.create_javascript_function()
    table_html = stats_table.create_datatable(float_format='%.3g')

    #csv_link = self.create_link('link', self.filename)
    #vcf_link = self.create_link('here', 'test.vcf')
    fragments = [
        "Reads statistics after trimming and adapter removal. The ",
        "A, C, G, T, N columns report the percentage of each bases in ",
        "the overall sequences",
        "<p>{} {}</p>".format(table_html, javascript),
    ]
    return "".join(fragments)
def test_datatables():
    """ Smoke test: compute ROIs on the first chromosome and render them. """
    genome = bedtools.GenomeCov(sequana_data("JB409847.bed"),
                                sequana_data("JB409847.gbk"))
    genome.compute_gc_content(sequana_data("JB409847.fasta"))

    chromosome = genome.chr_list[0]
    chromosome.run(4001)
    regions = chromosome.get_rois()
    regions.df['link'] = 'test'

    js_function = DataTableFunction(regions.df, 'roi')
    js_function.set_links_to_column('link', 'start')
    table_options = {'scrollX': 'true',
                     'pageLength': 15,
                     'scrollCollapse': 'true',
                     'dom': 'Bfrtip',
                     'buttons': ['copy', 'csv']}
    js_function.datatable_options = table_options

    # Only checks that the rendering does not raise.
    table = DataTable(regions.df, 'rois', js_function)
    html_table = table.create_datatable(float_format='%.3g')
def test_datatables():
    """ Smoke test of DataTable rendering on the ROIs of one chromosome. """
    bed_file = sequana_data("JB409847.bed")
    genbank_file = sequana_data("JB409847.gbk")
    coverage = bedtools.GenomeCov(bed_file, genbank_file)

    fasta_file = sequana_data("JB409847.fasta")
    coverage.compute_gc_content(fasta_file)

    first_chromosome = coverage.chr_list[0]
    first_chromosome.run(4001)

    roi_set = first_chromosome.get_rois()
    roi_set.df['link'] = 'test'

    shared_js = DataTableFunction(roi_set.df, 'roi')
    shared_js.set_links_to_column('link', 'start')
    shared_js.datatable_options = {
        'scrollX': 'true',
        'pageLength': 15,
        'scrollCollapse': 'true',
        'dom': 'Bfrtip',
        'buttons': ['copy', 'csv']
    }

    # The result is not inspected; the call must simply succeed.
    roi_table = DataTable(roi_set.df, 'rois', shared_js)
    html_table = roi_table.create_datatable(float_format='%.3g')
def add_adapters_section(self):
    """ Append the "Adapters" section: one table row per adapter.

    Rows are read from ``self.data['adapters']`` (each entry provides a
    'name' and an 'info' mapping with 'Length', 'Trimmed', 'Type' and
    'Sequence') and sorted by decreasing 'Trimmed' count.
    ``self.jinja['adapters']`` is reset to an empty string.
    """
    # Create a Table with adapters
    df = pd.DataFrame({'Length': [], 'Trimmed': [], 'Type': [],
                       'Sequence': []})

    for adapter in self.data['adapters']:
        info = adapter['info']
        df.loc[adapter['name']] = [info['Length'], info['Trimmed'],
                                   info['Type'], info['Sequence']]
    df.columns = ['Length', 'Trimmed', 'Type', 'Sequence']

    def _trimmed_count(value):
        # String values carry a "times." or "times" suffix; anything that is
        # not a string is left untouched.
        if not isinstance(value, str):
            return value
        return int(value.replace("times.", "").replace("times", ""))

    # One pass handles both suffixes. The previous code ran two attempts, each
    # wrapped in a bare ``except: pass``, and the second attempt depended on
    # an AttributeError being swallowed.
    try:
        df['Trimmed'] = df.Trimmed.map(_trimmed_count)
    except ValueError:
        # Unexpected text: keep the column as read (best effort, as before).
        pass

    # df.to_json(self.sample_name + "/cutadapt/cutadapt_stats2.json")
    df.sort_values(by="Trimmed", ascending=False, inplace=True)

    datatable = DataTable(df, "adapters", index=True)
    datatable.datatable.datatable_options = {
        'scrollX': 'true',
        'pageLength': 15,
        'scrollCollapse': 'true',
        'dom': 'frtipB',
        'buttons': ['copy', 'csv']}
    js = datatable.create_javascript_function()
    html_tab = datatable.create_datatable(float_format='%.3g')

    self.jinja['adapters'] = ""
    self.sections.append({
        "name": "Adapters",
        "anchor": "adapters",
        "content": "<p>{} {}</p>".format(html_tab, js)
    })
def add_table(self):
    """ Append the "TRF results" section rendered from ``self.trf.df``. """
    results_table = DataTable(self.trf.df, "result", index=True)
    table_options = {
        'scrollX': '300px',
        'pageLength': 15,
        'scrollCollapse': 'true',
        'dom': 'tBifp',
        "paging": "true",
        'buttons': ['copy', 'csv']
    }
    results_table.datatable.datatable_options = table_options

    javascript = results_table.create_javascript_function()
    table_html = results_table.create_datatable(float_format='%.3g')

    section = {
        "name": "TRF results",
        "anchor": "results",
        "content": "" + "{} {}".format(table_html, javascript),
    }
    self.sections.append(section)
def add_main_section(self):
    """ Append the "FastQC report(s)" section listing links to the reports.

    Files matching ``self.pattern`` are listed in a two-column table: the
    report name (file name up to '.html') and the link (the path with its
    first component removed). Rows are sorted by report name.

    :raises IndexError: if a matched path contains no '/' or no ``os.sep``.
    """
    links = glob.glob("{}".format(self.pattern))
    names = [
        filename.rsplit('/', 1)[1].split('.html')[0] for filename in links
    ]

    df = pd.DataFrame({
        "names": names,
        "links": [link.split(os.sep, 1)[1] for link in links]
    })
    # sort_values returns a new frame; the result used to be discarded, so
    # the rows were never actually sorted.
    df = df.sort_values(by='names')

    datatable = DataTable(df, "fastqc", index=False)
    datatable.datatable.set_links_to_column("links", "names")
    datatable.datatable.datatable_options = {
        'scrollX': '300px',
        'pageLength': 30,
        'scrollCollapse': 'true',
        'dom': 'rtpB',
        "paging": "false",
        'buttons': ['copy', 'csv']
    }
    js = datatable.create_javascript_function()
    html_tab = datatable.create_datatable()
    html = "{} {}".format(html_tab, js)

    self.sections.append({
        "name": "FastQC report(s)",
        "anchor": "fastqc",
        "content": "<p> Here below are link(s) to original FastQC report. "
                   "Please click on one of the links to jump to the main "
                   "report. {} </p>".format(html)
    })
def add_flag_section(self):
    """ Append the "Flags information" section.

    The table is rendered from the 'flags' entry of ``self._computation()``.
    """
    flags = self._computation()['flags']

    flag_table = DataTable(flags, "flags", index=True)
    table_options = {
        'scrollX': '300px',
        'pageLength': 15,
        'scrollCollapse': 'true',
        'dom': 'tB',
        "paging": "false",
        'buttons': ['copy', 'csv']}
    flag_table.datatable.datatable_options = table_options

    javascript = flag_table.create_javascript_function()
    table_html = flag_table.create_datatable(float_format='%.3g')

    section = {
        "name": "Flags information",
        "anchor": "flags",
        "content": "" + "{} {}".format(table_html, javascript),
    }
    self.sections.append(section)
def add_table(self):
    """ Append a "Table" section built from a copy of ``self.summary``.

    The copy gets a single 'data' column name plus a 'url' column that is
    registered as the link target of 'data'.
    """
    summary = self.summary.copy()
    summary.columns = ['data']
    summary['url'] = ['http://sequana.readthedocs.org'] * len(summary)

    table_options = {
        'scrollX': '300px',
        'pageLength': 15,
        'scrollCollapse': 'true',
        'dom': 'tB',
        "paging": "false",
        'buttons': ['copy', 'csv']}
    rendered = DataTable(summary, "table", index=True)
    rendered.datatable.datatable_options = table_options
    rendered.datatable.set_links_to_column('url', 'data')

    javascript = rendered.create_javascript_function()
    table_html = rendered.create_datatable(float_format='%.3g')
    content = "{} {}".format(table_html, javascript)

    self.sections.append({
        "name": "Table",
        "anchor": "table",
        "content": content
    })
def get_html_table(this_df, identifier):
    """ Return the javascript and HTML table of an enrichment dataframe.

    Works on a copy of ``this_df``: a 'links' column pointing to the QuickGO
    page of each 'id' is added, a few columns are dropped when present, 'id'
    is moved to the first position and rows are sorted by decreasing
    'fold_enrichment'.

    :param this_df: dataframe with at least the 'id' and 'fold_enrichment'
        columns.
    :param identifier: identifier given to the DataTable.
    :return: the javascript function immediately followed by the HTML table.
    """
    df = this_df.copy()
    df['links'] = ["https://www.ebi.ac.uk/QuickGO/term/{}".format(x)
                   for x in df["id"]]

    # errors="ignore" skips absent columns. It replaces a bare
    # ``try: del ... except: pass`` that silenced every kind of exception.
    unwanted = ['term', 'fdr2', 'abs_log2_fold_enrichment', 'pct_diff_expr']
    df = df.drop(columns=unwanted, errors="ignore")

    first_col = df.pop("id")
    df.insert(0, "id", first_col)
    df = df.sort_values(by="fold_enrichment", ascending=False)

    # NOTE(review): the pd.DataFrame(...) wrapper is kept as in the original;
    # presumably it guards against DataFrame subclasses - confirm.
    datatable = DataTable(pd.DataFrame(df), identifier)
    datatable.datatable.set_links_to_column("links", "id")
    datatable.datatable.datatable_options = {
        'scrollX': 'true',
        'pageLength': 10,
        'scrollCollapse': 'true',
        'dom': 'Bfrtip',
        'buttons': ['copy', 'csv']
    }
    js = datatable.create_javascript_function()
    html_table = datatable.create_datatable(float_format='%E')
    return js + html_table
def add_stats(self):
    """ Append the "Basic stats on read length" section.

    ``self.summary['read_stats']`` is laid out as a single-row table whose
    row label is 'read length stats'.
    """
    series = pd.Series(self.summary['read_stats'])
    frame = series.to_frame().T
    frame.index = ['read length stats']

    stats_table = DataTable(frame, "table", index=True)
    stats_table.datatable.datatable_options = {
        'scrollX': '300px',
        'pageLength': 15,
        'scrollCollapse': 'true',
        'dom': 't',
        "paging": "false",
        'buttons': ['copy', 'csv']
    }
    javascript = stats_table.create_javascript_function()

    # IMPORTANT: here conversion to integer with %d
    # to round and make integer. !! The GC is therefore
    # converted to integer as well.
    table_html = stats_table.create_datatable(float_format='%d')

    section = {
        "name": "Basic stats on read length",
        "anchor": "table",
        "content": "{} {}".format(table_html, javascript),
    }
    self.sections.append(section)
def add_flag_section(self):
    """ Append the "Flags information" section from ``self._computation()``. """
    computed = self._computation()
    table = DataTable(computed['flags'], "flags", index=True)
    table.datatable.datatable_options = {
        'scrollX': '300px',
        'pageLength': 15,
        'scrollCollapse': 'true',
        'dom': 'tB',
        "paging": "false",
        'buttons': ['copy', 'csv']
    }

    javascript = table.create_javascript_function()
    table_html = table.create_datatable(float_format='%.3g')
    content = "".join(["", "{} {}".format(table_html, javascript)])

    self.sections.append({
        "name": "Flags information",
        "anchor": "flags",
        "content": content
    })
def regions_of_interest(self, rois, links):
    """ Region of interest section.

    A temporary 'link' column is added to ``rois.df`` so that each region
    points to the entry of ``links`` covering its start position (one entry
    per window of 200000 positions). The column is removed once the low and
    high coverage tables have been rendered.

    :param rois: object exposing ``df``, ``get_low_roi()`` and
        ``get_high_roi()``.
    :param links: sequence of links, indexed by window number.
    :raises IndexError: if a start position lies beyond the last window
        available in ``links``.
    """
    window = 200000

    def window_index(position):
        # Count how many window boundaries lie strictly below the position:
        # positions up to 200000 map to 0, 200001..400000 map to 1, etc.
        index = 0
        upper = window
        while position > upper:
            upper += window
            index += 1
        return index

    # add links to the roi
    rois.df['link'] = [links[window_index(start)]
                       for start in rois.df['start']]

    # create datatable
    low_roi = rois.get_low_roi()
    high_roi = rois.get_high_roi()
    js = self.datatable.create_javascript_function()
    lroi = DataTable(low_roi, "lroi", self.datatable)
    hroi = DataTable(high_roi, "hroi", self.datatable)
    html_low_roi = lroi.create_datatable(float_format='%.3g')
    html_high_roi = hroi.create_datatable(float_format='%.3g')

    # axis must be given by keyword: pandas 2.0 no longer accepts it as a
    # positional argument of DataFrame.drop.
    rois.df.drop('link', axis=1, inplace=True)

    # {0} receives "low" or "high"; the template appends "er" to obtain
    # "lower" / "higher".
    roi_paragraph = (
        "<p>Regions with a z-score {0}er than {1:.2f} and at "
        "least one base with a z-score {0}er than {2:.2f} are detected as "
        "{0} coverage region. Thus, there are {3} {0} coverage regions."
        "</p>")
    low_paragraph = roi_paragraph.format("low",
                                         self.chromosome.thresholds.low2,
                                         self.chromosome.thresholds.low,
                                         len(low_roi))
    high_paragraph = roi_paragraph.format("high",
                                          self.chromosome.thresholds.high2,
                                          self.chromosome.thresholds.high,
                                          len(high_roi))

    self.sections.append({
        'name': "Regions Of Interest (ROI)",
        'anchor': 'roi',
        'content':
            "{4}\n"
            "<p>Running median is the median computed along the genome "
            "using a sliding window. The following tables give regions of "
            "interest detected by sequana. Here are the definitions of the "
            "columns:</p>\n"
            "<ul><li>mean_cov: the average of coverage</li>\n"
            "<li>mean_rm: the average of running median</li>\n"
            "<li>mean_zscore: the average of zscore</li>\n"
            "<li>max_zscore: the higher zscore contains in the region</li>"
            "</ul>\n"
            "<h3>Low coverage region</h3>\n{0}\n{1}\n"
            "<h3>High coverage region</h3>\n{2}\n{3}\n".format(
                low_paragraph, html_low_roi, high_paragraph, html_high_roi,
                js)
    })
def add_kegg(self):
    """Append the "2 - KEGG" section to ``self.sections``.

    For the ``down`` and ``up`` categories this embeds a barplot and a
    scatterplot. When the dataframe returned by ``ke.barplot`` is not
    empty, it also adds a datatable of pathways (linked to genome.jp) and
    a fotorama of the pathway images saved in ``config.output_dir``. For
    an empty category, every element is replaced by an empty string.
    """
    logger.info("Enrichment module: kegg term")
    style = "width:45%"
    from sequana.enrichment import KeggPathwayEnrichment

    ke = KeggPathwayEnrichment(
        self.gene_lists,
        self.organism,
        mapper=self.enrichment_params["mapper"],
        log2_fc=self.enrichment_params['log2_fc'],
        background=self.enrichment_params['kegg_background'],
        preload_directory=self.enrichment_params['preload_directory'])

    logger.info("Saving all pathways in kegg_pathways/mmu")
    ke.export_pathways_to_json()

    def embed_plot(plotter, category):
        # Render ``plotter(category)`` and return it as an embedded PNG.
        # The parameter must be called ``filename``: that name is passed
        # to create_embedded_png below.
        def save_plot(filename):
            plotter(category)
            pylab.savefig(filename)
        return self.create_embedded_png(save_plot, "filename", style=style)

    def with_links(df):
        # Add the genome.jp link of each pathway and keep the table columns.
        df['links'] = [
            "https://www.genome.jp/dbget-bin/www_bget?path:{}".format(x)
            for x in df["pathway_id"]
        ]
        return df[["pathway_id", "name", "size", "Overlap", "P-value",
                   "Adjusted P-value", "Genes", "links"]]

    def save_pathways(df):
        # Save one PNG per pathway and return the fotorama showing them.
        progress = Progress(len(df))
        files = []
        for i, pathway_id in enumerate(df['pathway_id']):
            ke.save_pathway(
                pathway_id, self.data,
                filename=f"{config.output_dir}/{pathway_id}.png")
            files.append(f"{pathway_id}.png")
            progress.animate(i + 1)
        return self.add_fotorama(files, width=800)

    def build_table(df, table_name):
        # Return (javascript, html) of the pathway datatable.
        datatable = DataTable(df, table_name)
        datatable.datatable.set_links_to_column("links", "pathway_id")
        datatable.datatable.datatable_options = {
            'scrollX': 'true',
            'pageLength': 20,
            'scrollCollapse': 'true',
            'dom': 'Bfrtip',
            'buttons': ['copy', 'csv']
        }
        js = datatable.create_javascript_function()
        html_table = datatable.create_datatable(float_format='%E')
        return js, html_table

    # Images are always rendered first, in this order.
    img_barplot_down = embed_plot(ke.barplot, 'down')
    img_scatter_down = embed_plot(ke.scatterplot, 'down')
    img_barplot_up = embed_plot(ke.barplot, 'up')
    img_scatter_up = embed_plot(ke.scatterplot, 'up')

    # Results down (pathway info)
    df_down = ke.barplot('down')
    if len(df_down):
        df_down = with_links(df_down)
        # NOTE(review): the logger level is lowered here and never
        # restored (the restoring call was already disabled); kept as is.
        logger.setLevel("WARNING")
        fotorama_down = save_pathways(df_down)
        js_table_down, html_table_down = build_table(df_down, 'kegg_down')
    else:
        img_barplot_down = img_scatter_down = fotorama_down = ""
        js_table_down = html_table_down = ""

    # Results up (pathway info)
    df_up = ke.barplot('up')
    if len(df_up):
        df_up = with_links(df_up)
        js_table_up, html_table_up = build_table(df_up, 'kegg_up')
        fotorama_up = save_pathways(df_up)
    else:
        img_barplot_up = img_scatter_up = fotorama_up = ""
        js_table_up = html_table_up = ""

    Ndown = len(df_down)
    Nup = len(df_up)

    html = f"""
<h3>2.1 - KEGG pathways down regulated</h3>
<p>{Ndown} KEGG pathways are found to be down regulated</p>
<br>
{img_barplot_down}
{img_scatter_down}
<hr>
{js_table_down} {html_table_down}
<hr>
{fotorama_down}

<h3>2.2 - KEGG pathways up regulated</h3>
<p>{Nup} KEGG pathways are found to be up regulated</p>
<br>
{img_barplot_up}
{img_scatter_up}
<hr>
{js_table_up} {html_table_up}
<hr>
{fotorama_up}
"""
    self.sections.append({"name": "2 - KEGG", "anchor": "kegg", "content": html})
def regions_of_interest(self, rois, links):
    """Append the "Regions Of Interest (ROI)" section to ``self.sections``.

    A temporary ``link`` column is added to ``rois.df``: each ROI gets the
    link whose file name encodes a ``<prefix>_<start>_<stop>.html`` range
    containing the ROI start, or ``None`` when *links* is empty. The low
    and high ROI tables are rendered as datatables and the temporary
    column is removed again.

    :param rois: object exposing a ``df`` dataframe with a ``start``
        column, plus ``get_low_rois()`` and ``get_high_rois()``.
    :param links: sequence of link paths (may be empty).
    :raises Exception: if a ROI start is covered by no link while the
        chromosome is in ``"memory"`` mode with a binning of 1.
    """
    suffix = ".html"

    def connect_link(x):
        for link in links:
            # The second path component is expected to be the file name.
            filename = link.rsplit(os.sep)[1]
            # Remove the extension as a suffix; str.rstrip would strip a
            # set of characters instead.
            if filename.endswith(suffix):
                filename = filename[:-len(suffix)]
            _, x1, x2 = filename.rsplit("_", 2)
            if int(x1) <= x <= int(x2):
                return link
        # for the case where the data is fully stored in memory, we must
        # find all events !
        if self.chromosome._mode == "memory" and self.chromosome.binning == 1:
            raise Exception("{} position not in the range of reports".format(x))
        return None

    if links:
        links_list = [connect_link(n) for n in rois.df['start']]
    else:
        links_list = [None] * len(rois.df['start'])
    rois.df['link'] = links_list

    try:
        # create datatable
        low_roi = rois.get_low_rois()
        high_roi = rois.get_high_rois()
        datatable = CoverageModule.init_roi_datatable(low_roi)
        datatable.set_links_to_column('link', 'chr')
        js = datatable.create_javascript_function()
        lroi = DataTable(low_roi, "lroi", datatable)
        hroi = DataTable(high_roi, "hroi", datatable)
        html_low_roi = lroi.create_datatable(float_format='%.3g')
        html_high_roi = hroi.create_datatable(float_format='%.3g')
    finally:
        # Always remove the temporary column from the caller's dataframe.
        # ``axis`` must be passed by keyword: pandas >= 2.0 rejects it as a
        # positional argument.
        rois.df.drop('link', axis=1, inplace=True)

    roi_paragraph = (
        "<p>Regions with a z-score {0}er than {1:.2f} and at "
        "least one base with a z-score {0}er than {2:.2f} are detected. "
        "There are {3} {0} regions of interest."
        "</p>"
    )
    thresholds = self.chromosome.thresholds
    low_paragraph = roi_paragraph.format(
        "low", thresholds.low2, thresholds.low, len(low_roi))
    high_paragraph = roi_paragraph.format(
        "high", thresholds.high2, thresholds.high, len(high_roi))

    # {4} (the javascript) comes first, then each paragraph and its table.
    content = (
        "{4}\n"
        "<p>The following tables give regions of "
        "interest detected by sequana. Here are the definitions of the "
        "columns:</p>\n"
        "<ul><li>mean_cov: the average of coverage</li>\n"
        "<li>mean_rm: the average of running median</li>\n"
        "<li>mean_zscore: the average of zscore</li>\n"
        "<li>max_zscore: the higher zscore contains in the region</li>"
        "</ul>\n"
        "<h3>Low coverage region</h3>\n{0}\n{1}\n"
        "<h3>High coverage region</h3>\n{2}\n{3}\n"
    ).format(low_paragraph, html_low_roi, high_paragraph, html_high_roi, js)

    self.sections.append({
        'name': "Regions Of Interest (ROI)",
        'anchor': 'roi',
        'content': content,
    })
def add_individual_report(self, comp, name, counter):
    """Append the three report sections of one comparison.

    The sections added to ``self.sections`` are:

    * ``6.<counter>.a``: histogram of the raw p-values;
    * ``6.<counter>.b``: interactive (plotly) volcano plot;
    * ``6.<counter>.c``: datatables of the significant and of all results.

    :param comp: comparison object providing ``plot_pvalue_hist``,
        ``plot_volcano`` and a ``df`` dataframe.
    :param name: comparison name, used in titles, anchors and table ids.
    :param counter: number used in the section titles.
    """
    style = "width:45%"

    description = """<p>
In the dispersion estimation and model fitting is done, statistical testing is
performed. The distribution of raw p-values computed by the statistical test
is expected to be a mixture of a uniform distribution on [0, 1] and a peak
around 0 corresponding to the differentially expressed features. This may not
always be the case. </p>"""

    def plot_pvalue_hist(filename):
        import pylab
        pylab.ioff()
        pylab.clf()
        comp.plot_pvalue_hist()
        pylab.savefig(filename)
        pylab.close()

    img1 = self.create_embedded_png(plot_pvalue_hist, "filename", style=style)

    # FIXME. The histogram of adjusted p-values (comp.plot_padj_hist) is not
    # relevant for now; it used to be rendered and then discarded, so it is
    # simply not rendered anymore.
    self.sections.append({
        "name": f"6.{counter}.a pvalue distribution ({name})",
        "anchor": "dge_summary",
        "content": description + img1
    })

    html_volcano = """<p>The volcano plot here below shows the differentially
expressed features with a adjusted p-value below 0.05 (dashed back line).
The volcano plot represents the log10 of the adjusted P value as a function of
the log2 ratio of differential expression. </p>"""

    # Only the interactive plotly version of the volcano plot is shown.
    fig = comp.plot_volcano(plotly=True, annotations=self.rnadiff.annotation)
    from plotly import offline
    volcano_div = offline.plot(fig, output_type="div", include_plotlyjs=False)

    self.sections.append({
        "name": f"6.{counter}.b volcano plots ({name})",
        "anchor": f"{name}_volcano",
        "content": html_volcano + "<hr>" + volcano_div
    })

    # finally, let us add the tables
    from pylab import log10

    df = comp.df.copy()

    # here we need to add the annotation if possible
    try:
        df = pd.concat(
            [df, self.rnadiff.annotation.annotation.loc[comp.df.index]],
            axis=1)
    except Exception as err:
        logger.critical(f"Could not add annotation. {err}")

    df = df.reset_index()

    fold_change = 2**df['log2FoldChange']
    log10padj = -log10(df['padj'])
    df.insert(
        df.columns.get_loc('log2FoldChange') + 1, 'FoldChange', fold_change)
    df.insert(df.columns.get_loc('padj') + 1, 'log10_padj', log10padj)

    # Remove the columns that are not reported; those that are absent are
    # silently skipped.
    df = df.drop(
        columns=['dispGeneEst', 'lfcSE', 'stat', 'dispersion'],
        errors='ignore')

    # set options
    options = {
        'scrollX': 'true',
        'pageLength': 10,
        'scrollCollapse': 'true',
        'dom': 'Bfrtip',
        'buttons': ['copy', 'csv']
    }

    datatable = DataTable(df, f'{name}_table_all')
    datatable.datatable.datatable_options = options
    js_all = datatable.create_javascript_function()
    html_tab_all = datatable.create_datatable(float_format='%.3e')

    df_sign = df.query(
        "padj<=0.05 and (log2FoldChange>1 or log2FoldChange<-1)")
    datatable = DataTable(df_sign, f'{name}_table_sign')
    datatable.datatable.datatable_options = options
    js_sign = datatable.create_javascript_function()
    html_tab_sign = datatable.create_datatable(float_format='%.3e')

    self.sections.append({
        'name': f"6.{counter}.c {name} Tables ({name})",
        'anchor': f"{name} stats",
        'content': f"""<p>The following tables give all DGE results. The
first table contains all significant genes (adjusted p-value below 0.05 and
absolute log2 fold change above 1). The following tables contains all results
without any filtering. Here is a short explanation for each column:
<ul>
<li> baseMean: base mean over all samples</li>
<li> norm.sampleName: rounded normalized counts per sample</li>
<li> FC: fold change in natural base</li>
<li> log2FoldChange: log2 Fold Change estimated by the model. Reflects change
between the condition versus the reference condition</li>
<li> stat: Wald statistic for the coefficient (contrast) tested</li>
<li> pvalue: raw p-value from statistical test</li>
<li> padj: adjusted pvalue. Used for cutoff at 0.05 </li>
<li> betaConv: convergence of the coefficients of the model </li>
<li> maxCooks: maximum Cook's distance of the feature </li>
<li> outlier: indicate if the feature is an outlier according to Cook's
distance </li>
</ul>
</p>
<h3>Significative only<a id="{name}_table_sign"></a></h3>
here below is a subset of the next table. It contains all genes below adjusted
p-value of 0.05 and absolute log2 fold change above 1.
{js_sign} {html_tab_sign}
<h3>All genes<a id="{name}_table_all"></a></h3>
{js_all} {html_tab_all}"""
    })
def add_section(self):
    """Build the multi-sample summary: ``self.intro`` and one section.

    Each ``populate_*`` step is attempted in turn and skipped when it
    fails. The per-step data stored in ``self.df`` are merged on
    ``name`` and ``url`` into one table, rendered as a datatable whose
    sample names link to the ``url`` column. ``self.intro`` receives the
    sample count, a collapsible caption and the table; the canvas
    javascript is appended to ``self.sections``.
    """
    logger.info("Found %s projects/samples/ directories" % len(self.summaries))
    for filename in self.filenames:
        logger.info(filename)

    self.jinja = {}
    self.jinja['canvas'] = '<script type="text/javascript" src="js/canvasjs.min.js"></script>'
    self.jinja['canvas'] += """<script type="text/javascript">
    window.onload = function () {"""

    # Information to put on top of the page (added later in a module.intro)
    introhtml = "<div><b>Number of samples:</b>{}</div>".format(len(self.summaries))

    self.jinja['sections'] = []

    # This will used to stored all information
    self.df = {}

    # The order does not matter here, everything is done in JINJA
    try:
        self.populate_nreads_raw()
    except Exception as err:
        logger.warning(err)

    optional_steps = (
        ("populate_phix", "phix"),
        ("populate_gc_samples", "gc samples"),
        ("populate_trimming", "trimming"),
        ("populate_mean_quality", "mean quality"),
        ("populate_adapters", "adapters"),
        ("populate_output_total_reads", "total reads"),
    )
    for method_name, label in optional_steps:
        try:
            # The attribute lookup stays inside the try so that a missing
            # step is skipped like a failing one.
            getattr(self, method_name)()
        except Exception:
            logger.debug("multi_summary: skip {}".format(label))

    # Now we have all data in df as dictionaries. Let us merge them together
    frames = [pd.DataFrame(data) for data in self.df.values()]
    if frames:
        df = frames[0]
        for other in frames[1:]:
            df = pd.merge(df, other, on=['name', 'url'])
    else:
        # Every step was skipped: fall back to an empty table rather than
        # failing on an unbound variable.
        # NOTE(review): assumes DataTable accepts an empty dataframe -- confirm.
        logger.warning("multi_summary: no summary data could be collected")
        df = pd.DataFrame(columns=['name', 'url'])

    # For the quality_control pipeline
    expected = ["name", "url", "N_raw", "GC_raw_(%)", "Mean_quality_raw",
                'Phix_content_(%)', "Adapters_content_(%)",
                "Trimmed_reads_(%)", "N_final"]
    columns = [this for this in expected if this in df.columns]
    # Chained rename: avoids an in-place rename on a column slice.
    df = df[columns].rename(columns={"name": "Sample name"})

    from sequana.utils.datatables_js import DataTable
    datatable = DataTable(df, "multi_summary")
    datatable.datatable.datatable_options = {
        'scrollX': '300px',
        'pageLength': 30,
        'scrollCollapse': 'true',
        'dom': 'rtpB',
        "paging": "false",
        'buttons': ['copy', 'csv']}
    datatable.datatable.set_links_to_column("url", "Sample name")
    js = datatable.create_javascript_function()
    html_tab = datatable.create_datatable(float_format='%.3g')
    html = "{} {}".format(html_tab, js)

    self.jinja['canvas'] += """
    function onClick(e){
        window.open(e.dataPoint.url)
    }
}</script>"""

    caption = """<p>The table below gives a brief summary of the analysis. The
first column contains clickable sample name that redirects to complete summary
page. The table contains the following columns:</p>
<b>Table caption</b>
<table>
<tr><td>N_raw</td><td>Number of reads in the data</td></tr>
<tr><td>GC_raw_(%)</td><td>GC content in percentage in the raw data across all reads</td></tr>
<tr><td>Mean_quality_raw</td><td>Mean quality across all reads all bases in the raw data</td></tr>
<tr><td>Phix_content_(%)</td><td>Percentage of reads found with Phix174</td></tr>
<tr><td>Adapters_content_(%)</td><td>Percentage of reads with adapters (after phix removal if applied) </td></tr>
<tr><td>Trimmed_reads_(%)</td><td>Percentage of reads trimmed (after phix and adapter removal)</td></tr>
<tr><td>N_final</td><td>Final number of reads (after phix and adapter removal and trimming)</td></tr>
</table>
"""
    infohtml = self.create_hide_section('information', '(Show information)', caption, True)
    infohtml = "\n".join(infohtml)

    self.intro = introhtml + """ <hr><b>Summary</b>: """ + infohtml + html

    self.sections.append({
        'name': None,
        'anchor': None,
        'content': self.jinja['canvas'] + "\n".join(self.jinja['sections'])
    })