def _write_genome_summary(self, output_file):
    """Summarize classification of each genome.

    Writes a tab-separated header line (three fixed genome columns followed
    by six columns for every rank in self.rank_labels) and then asks each
    profile in self.profiles, in sorted genome id order, to write its own
    summary to the same file handle.

    Parameters
    ----------
    output_file : str
        Output file.
    """

    # columns reported for every taxonomic rank, in output order
    rank_columns = ('taxa',
                    'percent of bps',
                    'percent of sequences',
                    'avg. evalue',
                    'avg. perc identity',
                    'avg. align length (AA)')

    # the context manager guarantees the file is closed even if a profile
    # fails while writing its summary
    with open(output_file, 'w') as fout:
        fout.write('Genome id\tLength (bp)\t# sequences')
        for rank in self.rank_labels:
            for column in rank_columns:
                fout.write('\t' + rank + ': ' + column)
        fout.write('\n')

        for genome_id in alphanumeric_sort(self.profiles.keys()):
            self.profiles[genome_id].write_genome_summary(fout)
def create_html_index(self, plot_dir, genome_plots):
    """Create HTML index for navigating outlier plots.

    Two files are written to plot_dir: 'index.html', a frameset with a menu
    frame (15% wide) and a plot frame (85% wide), and 'plot_menu.html', the
    menu listing every plot of every genome. Clicking a menu entry loads the
    plot into the plot frame and updates the "Active plot" label.

    Parameters
    ----------
    plot_dir : str
        Directory containing plots.
    genome_plots : d[genome_id] -> [(plot_type, plot_filename), ...]
        Hash indicating the plot types and filenames for each genome of interest.

    Raises
    ------
    IndexError
        If there is no first genome, or the first genome has no plots
        (assumes alphanumeric_sort returns an indexable sequence).
    """

    sorted_genome_ids = alphanumeric_sort(genome_plots.keys())

    # the first plot of the first genome is shown when the index is opened;
    # this lookup happens before any file is created
    first_genome_id = sorted_genome_ids[0]
    first_plot = genome_plots[first_genome_id][0]
    starting_plot_filename = first_plot[1]
    starting_plot_str = first_genome_id + '<br>' + first_plot[0]

    # frameset page; the context manager closes the file even on error
    with open(os.path.join(plot_dir, 'index.html'), 'w') as fout:
        fout.write('<html>\n')
        fout.write('<head>')
        fout.write('<title>RefineM outlier plots</title>\n')
        fout.write('</head>\n')
        fout.write('<frameset cols="15%,85%">\n')
        fout.write('<frame src="plot_menu.html" name="menu">\n')
        fout.write('<frame src="%s" name="plot">\n' % starting_plot_filename)
        fout.write('</frameset>\n')
        fout.write('</html>\n')

    # menu page listing all plots grouped by genome
    with open(os.path.join(plot_dir, 'plot_menu.html'), 'w') as fout:
        fout.write('<html>\n')

        # script used by the links to update the "Active plot" label
        fout.write('<script>\n')
        fout.write(' function change_title(name) {\n')
        fout.write(' document.getElementById("active_plot").innerHTML = name;\n')
        fout.write(' }\n')
        fout.write('</script>\n\n')

        fout.write('<style>\n')
        fout.write('ul {\n')
        fout.write('margin-top: 0px;\n')
        fout.write('margin-bottom: 12px;\n')
        fout.write('}\n')
        fout.write('</style>\n\n')

        fout.write('<body>\n')
        fout.write('<div><b>Active plot:</b>\n')
        fout.write('<div id="active_plot">%s</div>\n' % starting_plot_str)
        fout.write('</div>\n')
        fout.write('<br>\n')
        fout.write('<div><b>Plots:</b></div>\n')

        # NOTE(review): genome ids, plot types and filenames are inserted
        # into the HTML/JavaScript without escaping - confirm they can never
        # contain quotes or markup
        for genome_id in sorted_genome_ids:
            fout.write('<i> %s:</i>\n' % genome_id)
            fout.write(' <ul>\n')
            for (plot_type, plot_filename) in genome_plots[genome_id]:
                fout.write(' <li><a href="%s" target="plot" onclick="change_title(\'%s\');">%s</a><br></li>\n' % (plot_filename, genome_id + '<br>' + plot_type, plot_type))
            fout.write(' </ul>\n')

        fout.write('</body>\n')
        fout.write('</html>\n')
def create(self, profiles, output_file):
    """Create Krona plot.

    Profiles for multiple items (e.g., genome, metagenome) can be specified.
    The complete hierarchy for each unique element should be specified as a
    semicolon separated string, e.g.,

        k__Bacteria;c__Firmicutes;...;s__

    The number of hits to each unique element is specified in the profiles
    dictionary, e.g.,

        d[unique_id][element_str] = 10

    One temporary text file is written per item (a hit count followed by the
    tab-separated hierarchy on each line) and all files are passed to
    ktImportText. The temporary directory is removed afterwards, including
    when writing the files or running the command fails.

    Parameters
    ----------
    profiles: d[unique_id][element_str] -> count
        Number of hits to specific elements for each item.
    output_file : str
        Name of output file.
    """

    cmd = 'ktImportText -o %s' % output_file

    tmp_dir = tempfile.mkdtemp()
    try:
        # create temporary files for each item
        for unique_id in alphanumeric_sort(list(profiles.keys())):
            tmp_file = os.path.join(tmp_dir, unique_id)
            with open(tmp_file, 'w') as fout:
                for element_str, num_hits in profiles[unique_id].items():
                    elements = [x.strip() for x in element_str.split(';')]
                    fout.write(str(num_hits) + '\t' + '\t'.join(elements) + '\n')

            # each input is given as <file>,<label>
            # NOTE(review): the command is assembled as a single unquoted
            # string, so output paths or ids containing spaces or shell
            # metacharacters will presumably break it - confirm how
            # execute.run interprets the command
            cmd += ' %s,%s' % (tmp_file, unique_id)

        # create krona plot
        execute.run(cmd)
    finally:
        # clean up temporary files even if an earlier step raised
        shutil.rmtree(tmp_dir)
def write(self, output_file):
    """Write genome statistics to file.

    The output is tab-separated: a header line followed by one line per
    genome in self.genome_stats, in sorted genome id order. Each line holds
    the genome size, mean/median GC, mean/median scaffold length, the mean
    coverage values, the median coverage values, and the mean signature
    frequencies.

    Parameters
    ----------
    output_file : str
        Name of output file.
    """

    # the context manager closes the file even if formatting a value fails
    with open(output_file, 'w') as fout:
        # NOTE(review): the joins below still emit one 'Mean: ', one
        # 'Median: ' and one empty signature column when the corresponding
        # header list is empty - confirm these lists are never empty
        fout.write('Genome id\tGenome size (bp)')
        fout.write('\tMean GC\tMedian GC')
        fout.write('\tMean scaffold length (bp)\tMedian scaffold length (bp)')
        fout.write('\tMean: ' + '\tMean: '.join(self.coverage_headers))
        fout.write('\tMedian: ' + '\tMedian: '.join(self.coverage_headers))
        fout.write('\t' + '\t'.join(self.signature_headers))
        fout.write('\n')

        for genome_id in alphanumeric_sort(self.genome_stats.keys()):
            stats = self.genome_stats[genome_id]

            row = [genome_id,
                   '%d' % stats.genome_size,
                   '%.2f' % stats.mean_gc,
                   '%.2f' % stats.median_gc,
                   '%.2f' % stats.mean_scaffold_length,
                   '%.2f' % stats.median_scaffold_length]
            row.extend('%.2f' % cov for cov in stats.mean_coverage)
            row.extend('%.2f' % cov for cov in stats.median_coverage)
            row.extend('%f' % freq for freq in stats.mean_signature)

            fout.write('\t'.join(row) + '\n')
def _parse_data(self, infile):
    """Parse pairwise genome comparisons into square matrices.

    The first line of the file is skipped as a header. Every other line is
    split on tabs; columns 1 and 3 give the two genome ids (a trailing
    '_genes' is removed) and columns 6 and 8 give two numeric values for
    that pair.

    On return, self.genomes is an array of the sorted genome ids,
    self.perc_ids[i][j] holds 100.0 minus the column 6 value and
    self.perc_aln[i, j] holds the column 8 value for genomes i and j. A pair
    reported in only one direction fills both (i, j) and (j, i); the
    diagonal is left at zero.

    Parameters
    ----------
    infile : str
        Tab-separated file with pairwise comparison results.

    Raises
    ------
    IndexError
        If a line has too few columns; the offending fields are printed first.
    KeyError
        If a pair of genomes has no entry in either direction.
    """

    data = {}
    genomes = set()
    with open(infile) as fp:
        fp.readline()  # skip header line
        for line in fp:
            fields = line.rstrip().split('\t')
            try:
                fields[0] = re.sub(r'_genes$', "", fields[0])
                fields[2] = re.sub(r'_genes$', "", fields[2])
                values = [float(fields[5]), float(fields[7])]
            except IndexError:
                # report the malformed line before propagating the error
                print(fields)
                raise

            genomes.add(fields[0])
            genomes.add(fields[2])
            data.setdefault(fields[0], {})[fields[2]] = values

    self.perc_ids = np_zeros([len(genomes), len(genomes)])
    self.perc_aln = np_zeros([len(genomes), len(genomes)])

    # matrix rows/columns follow the sorted order of the genome ids
    sorted_genomes = list(alphanumeric_sort(genomes))
    genome_to_index = {}
    for n, g in enumerate(sorted_genomes):
        genome_to_index[g] = n
    self.genomes = np_array(sorted_genomes)

    for g1, g2 in permutations(genomes, 2):
        try:
            values = data[g1][g2]
        except KeyError:
            # pair was only reported in the opposite direction
            values = data[g2][g1]

        row = genome_to_index[g1]
        col = genome_to_index[g2]
        self.perc_ids[row][col] = 100.0 - values[0]
        self.perc_aln[row, col] = values[1]
def plot(self, common_bases, quality, output_plot):
    """Render the shared base pair table as an SVG file.

    The drawing has one row per bin and one column per binning method,
    preceded by a bin label column and the genome quality columns.

    Parameters
    ----------
    common_bases : d[unitem bid][binning method] -> percent common bases
        Percentage of common bases for each binning method.
    quality : d[unitem id] -> (completeness, contamination)
        Completeness and contamination of bins.
    output_plot : str
        Desired output file; '.svg' is appended when it is missing.
    """

    # rows: bins in sorted order
    bin_labels = alphanumeric_sort(common_bases)

    # columns: every binning method seen for any bin, in sorted order
    bm_labels = alphanumeric_sort({bm
                                   for bid in common_bases
                                   for bm in common_bases[bid]})

    # origin of the table within the drawing
    origin_x = 0
    origin_y = 0

    if not output_plot.endswith('.svg'):
        output_plot = output_plot + '.svg'

    # figure width covers the method columns plus the quality columns
    self.fig_size_x = origin_x + len(bm_labels) * self.col_width
    self.fig_size_x += 2 * self.qual_col_width + 0.5 * self.col_width
    self.fig_size_y = origin_y + len(bin_labels) * self.row_height

    dwg = svgwrite.Drawing(filename=output_plot,
                           size=(self.fig_size_x, self.fig_size_y),
                           profile='full',
                           style='font-family:Arial')
    dwg.set_desc(title='UniteM shared base pair plot.')

    self._render_legend(dwg)

    # column of bin labels
    bin_label_y = origin_y + 0.5 * self.row_height + 0.45 * self.font_size
    self._render_label_col(dwg,
                           bin_labels,
                           origin_x,
                           bin_label_y,
                           'bin_labels')

    # completeness and contamination columns
    quality_header_y = origin_y - 0.5 * self.font_size
    quality_x = origin_x + 0.5 * self.col_width
    quality_y = origin_y + 0.5 * self.row_height + 0.45 * self.font_size
    self._render_genome_quality_cols(dwg,
                                     bin_labels,
                                     quality,
                                     quality_header_y,
                                     quality_x,
                                     quality_y)

    # the table itself starts to the right of the genome quality columns
    table_x = origin_x
    table_x += 2 * self.qual_col_width + 0.5 * self.col_width

    # header row with the binning method labels
    method_header_y = origin_y - 0.5 * self.font_size
    self._render_label_row(dwg,
                           bm_labels,
                           table_x,
                           method_header_y,
                           'binning_method_labels')

    # table cells for every bin and binning method
    self._render_row(dwg,
                     bin_labels,
                     bm_labels,
                     common_bases,
                     table_x,
                     origin_y)

    dwg.save()
class Heatmap(AbstractPlot):
    """Plot built from a tab-separated file of pairwise genome comparisons."""

    def __init__(self, infile, outfile):
        """Parse the comparison file and set up the colour maps.

        Parameters
        ----------
        infile : str
            Tab-separated file with pairwise comparison results.
        outfile : str
            Output file; stored on the instance as self.outfile.
        """
        AbstractPlot.__init__(self, None)

        self.outfile = outfile
        self.genomes = None
        self._parse_data(infile)

        self.colormap = pylab.cm.bwr

        # 12-colour palette given as 8-bit RGB triples, scaled to [0, 1]
        palette_rgb = [(141, 211, 199),
                       (255, 255, 179),
                       (190, 186, 218),
                       (251, 128, 114),
                       (128, 177, 211),
                       (253, 180, 98),
                       (179, 222, 105),
                       (252, 205, 229),
                       (217, 217, 217),
                       (188, 128, 189),
                       (204, 235, 197),
                       (255, 237, 111)]
        self.discreteColourMap = ListedColormap(
            [(r / 255.0, g / 255.0, b / 255.0) for r, g, b in palette_rgb])

    def _parse_data(self, infile):
        """Parse pairwise genome comparisons into square matrices.

        The first line of the file is skipped as a header. Every other line
        is split on tabs; columns 1 and 3 give the two genome ids (a trailing
        '_genes' is removed) and columns 6 and 8 give two numeric values for
        that pair.

        On return, self.genomes is an array of the sorted genome ids,
        self.perc_ids[i][j] holds 100.0 minus the column 6 value and
        self.perc_aln[i, j] holds the column 8 value for genomes i and j. A
        pair reported in only one direction fills both (i, j) and (j, i); the
        diagonal is left at zero.

        Parameters
        ----------
        infile : str
            Tab-separated file with pairwise comparison results.

        Raises
        ------
        IndexError
            If a line has too few columns; the offending fields are printed
            first.
        KeyError
            If a pair of genomes has no entry in either direction.
        """
        data = {}
        genomes = set()
        with open(infile) as fp:
            fp.readline()  # skip header line
            for line in fp:
                fields = line.rstrip().split('\t')
                try:
                    fields[0] = re.sub(r'_genes$', "", fields[0])
                    fields[2] = re.sub(r'_genes$', "", fields[2])
                    values = [float(fields[5]), float(fields[7])]
                except IndexError:
                    # report the malformed line before propagating the error;
                    # print() with one argument behaves the same on
                    # Python 2 and Python 3
                    print(fields)
                    raise

                genomes.add(fields[0])
                genomes.add(fields[2])
                data.setdefault(fields[0], {})[fields[2]] = values

        self.perc_ids = np_zeros([len(genomes), len(genomes)])
        self.perc_aln = np_zeros([len(genomes), len(genomes)])

        # matrix rows/columns follow the sorted order of the genome ids
        sorted_genomes = list(alphanumeric_sort(genomes))
        genome_to_index = {}
        for n, g in enumerate(sorted_genomes):
            genome_to_index[g] = n
        self.genomes = np_array(sorted_genomes)

        for g1, g2 in permutations(genomes, 2):
            try:
                values = data[g1][g2]
            except KeyError:
                # pair was only reported in the opposite direction
                values = data[g2][g1]

            row = genome_to_index[g1]
            col = genome_to_index[g2]
            self.perc_ids[row][col] = 100.0 - values[0]
            self.perc_aln[row, col] = values[1]