Example #1
0
    def _write_genome_summary(self, output_file):
        """Summarize classification of each genome.

        Writes a tab-separated table. The header row holds three fixed
        columns followed by six columns for every label in
        ``self.rank_labels``. The body is produced by calling
        ``write_genome_summary`` on each profile in ``self.profiles``,
        visiting genome ids in the order given by ``alphanumeric_sort``.

        Parameters
        ----------
        output_file : str
            Output file.
        """

        # per-rank column name suffixes, in output order
        rank_columns = (': taxa',
                        ': percent of bps',
                        ': percent of sequences',
                        ': avg. evalue',
                        ': avg. perc identity',
                        ': avg. align length (AA)')

        # the context manager guarantees the handle is closed even if a
        # profile raises while writing its summary
        with open(output_file, 'w') as fout:
            fout.write('Genome id\tLength (bp)\t# sequences')
            for rank in self.rank_labels:
                for column in rank_columns:
                    fout.write('\t' + rank + column)
            fout.write('\n')

            for genome_id in alphanumeric_sort(self.profiles.keys()):
                self.profiles[genome_id].write_genome_summary(fout)
Example #2
0
    def _write_genome_summary(self, output_file):
        """Summarize classification of each genome.

        Writes a tab-separated table. The header row holds three fixed
        columns followed by six columns for every label in
        ``self.rank_labels``. The body is produced by calling
        ``write_genome_summary`` on each profile in ``self.profiles``,
        visiting genome ids in the order given by ``alphanumeric_sort``.

        Parameters
        ----------
        output_file : str
            Output file.
        """

        # per-rank column name suffixes, in output order
        rank_columns = (': taxa',
                        ': percent of bps',
                        ': percent of sequences',
                        ': avg. evalue',
                        ': avg. perc identity',
                        ': avg. align length (AA)')

        # the context manager guarantees the handle is closed even if a
        # profile raises while writing its summary
        with open(output_file, 'w') as fout:
            fout.write('Genome id\tLength (bp)\t# sequences')
            for rank in self.rank_labels:
                for column in rank_columns:
                    fout.write('\t' + rank + column)
            fout.write('\n')

            for genome_id in alphanumeric_sort(self.profiles.keys()):
                self.profiles[genome_id].write_genome_summary(fout)
Example #3
0
    def create_html_index(self, plot_dir, genome_plots):
        """Create HTML index for navigating outlier plots.

        Two files are written into `plot_dir`: ``index.html``, a frameset
        with a menu frame and a plot frame, and ``plot_menu.html``, which
        lists every plot of every genome as a link targeting the plot frame.
        The plot shown initially is the first plot of the first genome in
        the order returned by ``alphanumeric_sort``.

        Parameters
        ----------
        plot_dir : str
          Directory containing plots.
        genome_plots : d[genome_id] -> [(plot_type, plot_filename), ...]
          Hash indicating the plot types and filenames for each genome of interest.
        """

        sorted_genome_ids = alphanumeric_sort(genome_plots.keys())

        # NOTE(review): the first genome and its first plot are indexed
        # unconditionally, so an empty `genome_plots` (or a genome with no
        # plots in first position) presumably fails here - confirm callers
        # never pass one.
        first_genome_id = sorted_genome_ids[0]
        first_plot_type, starting_plot_filename = genome_plots[first_genome_id][0][:2]
        starting_plot_str = first_genome_id + '<br>' + first_plot_type

        # frameset page: menu on the left, active plot on the right
        with open(os.path.join(plot_dir, 'index.html'), 'w') as fout:
            fout.write('<html>\n')
            fout.write('<head>')
            fout.write('<title>RefineM outlier plots</title>\n')
            fout.write('</head>\n')
            fout.write('<frameset cols="15%,85%">\n')
            fout.write('<frame src="plot_menu.html" name="menu">\n')
            fout.write('<frame src="%s" name="plot">\n' % starting_plot_filename)
            fout.write('</frameset>\n')
            fout.write('</html>\n')

        # menu page: one list of plot links per genome
        with open(os.path.join(plot_dir, 'plot_menu.html'), 'w') as fout:
            fout.write('<html>\n')
            fout.write('<script>\n')
            fout.write('    function change_title(name) {\n')
            fout.write('        document.getElementById("active_plot").innerHTML = name;\n')
            fout.write('    }\n')
            fout.write('</script>\n\n')
            fout.write('<style>\n')
            fout.write('ul {\n')
            fout.write('margin-top: 0px;\n')
            fout.write('margin-bottom: 12px;\n')
            fout.write('}\n')
            fout.write('</style>\n\n')
            fout.write('<body>\n')
            fout.write('<div><b>Active plot:</b>\n')
            fout.write('<div id="active_plot">%s</div>\n' % starting_plot_str)
            fout.write('</div>\n')
            fout.write('<br>\n')
            fout.write('<div><b>Plots:</b></div>\n')
            for genome_id in sorted_genome_ids:
                fout.write('<i>  %s:</i>\n' % genome_id)
                fout.write('    <ul>\n')
                for (plot_type, plot_filename) in genome_plots[genome_id]:
                    # genome id and plot type are inserted verbatim (no
                    # HTML/JS escaping), matching the established output
                    fout.write('    <li><a href="%s" target="plot" onclick="change_title(\'%s\');">%s</a><br></li>\n'
                               % (plot_filename, genome_id + '<br>' + plot_type, plot_type))
                fout.write('    </ul>\n')
            fout.write('</body>\n')
            fout.write('</html>\n')
Example #4
0
    def create_html_index(self, plot_dir, genome_plots):
        """Create HTML index for navigating outlier plots.

        Two files are written into `plot_dir`: ``index.html``, a frameset
        with a menu frame and a plot frame, and ``plot_menu.html``, which
        lists every plot of every genome as a link targeting the plot frame.
        The plot shown initially is the first plot of the first genome in
        the order returned by ``alphanumeric_sort``.

        Parameters
        ----------
        plot_dir : str
          Directory containing plots.
        genome_plots : d[genome_id] -> [(plot_type, plot_filename), ...]
          Hash indicating the plot types and filenames for each genome of interest.
        """

        sorted_genome_ids = alphanumeric_sort(genome_plots.keys())

        # NOTE(review): the first genome and its first plot are indexed
        # unconditionally, so an empty `genome_plots` (or a genome with no
        # plots in first position) presumably fails here - confirm callers
        # never pass one.
        first_genome_id = sorted_genome_ids[0]
        first_plot_type, starting_plot_filename = genome_plots[first_genome_id][0][:2]
        starting_plot_str = first_genome_id + '<br>' + first_plot_type

        # frameset page: menu on the left, active plot on the right
        with open(os.path.join(plot_dir, 'index.html'), 'w') as fout:
            fout.write('<html>\n')
            fout.write('<head>')
            fout.write('<title>RefineM outlier plots</title>\n')
            fout.write('</head>\n')
            fout.write('<frameset cols="15%,85%">\n')
            fout.write('<frame src="plot_menu.html" name="menu">\n')
            fout.write('<frame src="%s" name="plot">\n' % starting_plot_filename)
            fout.write('</frameset>\n')
            fout.write('</html>\n')

        # menu page: one list of plot links per genome
        with open(os.path.join(plot_dir, 'plot_menu.html'), 'w') as fout:
            fout.write('<html>\n')
            fout.write('<script>\n')
            fout.write('    function change_title(name) {\n')
            fout.write('        document.getElementById("active_plot").innerHTML = name;\n')
            fout.write('    }\n')
            fout.write('</script>\n\n')
            fout.write('<style>\n')
            fout.write('ul {\n')
            fout.write('margin-top: 0px;\n')
            fout.write('margin-bottom: 12px;\n')
            fout.write('}\n')
            fout.write('</style>\n\n')
            fout.write('<body>\n')
            fout.write('<div><b>Active plot:</b>\n')
            fout.write('<div id="active_plot">%s</div>\n' % starting_plot_str)
            fout.write('</div>\n')
            fout.write('<br>\n')
            fout.write('<div><b>Plots:</b></div>\n')
            for genome_id in sorted_genome_ids:
                fout.write('<i>  %s:</i>\n' % genome_id)
                fout.write('    <ul>\n')
                for (plot_type, plot_filename) in genome_plots[genome_id]:
                    # genome id and plot type are inserted verbatim (no
                    # HTML/JS escaping), matching the established output
                    fout.write('    <li><a href="%s" target="plot" onclick="change_title(\'%s\');">%s</a><br></li>\n'
                               % (plot_filename, genome_id + '<br>' + plot_type, plot_type))
                fout.write('    </ul>\n')
            fout.write('</body>\n')
            fout.write('</html>\n')
Example #5
0
    def create(self, profiles, output_file):
        """Create Krona plot.

        Profiles for multiple items (e.g., genome, metagenome) can
        be specified. The complete hierarchy for each unique element
        should be specified as a semicolon separated string, e.g.,

            k__Bacteria;c__Firmicutes;...;s__

        The number of hits to each unique element is specified in
        the profiles dictionary, e.g.,

            d[unique_id][element_str] = 10

        One temporary tab-separated file is written per item (hit count
        followed by the stripped hierarchy elements), and all of them are
        passed to a single ``ktImportText`` invocation. The temporary
        directory is removed afterwards, including when writing the files
        or running the command fails.

        Parameters
        ----------
        profiles: d[unique_id][element_str] -> count
            Number of hits to specific elements for each item.
        output_file : str
            Name of output file.
        """

        cmd = 'ktImportText -o %s' % output_file
        tmp_dir = tempfile.mkdtemp()
        try:
            # create temporary files for each item
            for unique_id in alphanumeric_sort(list(profiles.keys())):
                # NOTE(review): the id is used verbatim as a file name and
                # interpolated unquoted into the command line; ids containing
                # path separators, spaces or commas would presumably break
                # this - confirm ids are constrained upstream.
                tmp_file = os.path.join(tmp_dir, unique_id)
                with open(tmp_file, 'w') as fout:
                    for element_str, num_hits in profiles[unique_id].items():
                        elements = [x.strip() for x in element_str.split(';')]
                        fout.write(str(num_hits) + '\t' + '\t'.join(elements) + '\n')

                cmd += ' %s,%s' % (tmp_file, unique_id)

            # create krona plot
            execute.run(cmd)
        finally:
            # clean up temporary files, even if an error occurred above
            shutil.rmtree(tmp_dir)
Example #6
0
    def write(self, output_file):
        """Write genome statistics to file.

        The output is tab-separated: one header row, then one row per
        genome in ``self.genome_stats`` (ordered by ``alphanumeric_sort``)
        holding genome size, GC and scaffold-length statistics, mean and
        median coverage values, and mean signature frequencies.

        Parameters
        ----------
        output_file : str
            Name of output file.
        """

        # the context manager closes the handle even if a stats record is
        # malformed and formatting raises part-way through
        with open(output_file, 'w') as fout:
            fout.write('Genome id\tGenome size (bp)')
            fout.write('\tMean GC\tMedian GC')
            fout.write('\tMean scaffold length (bp)\tMedian scaffold length (bp)')
            fout.write('\tMean: ' + '\tMean: '.join(self.coverage_headers))
            fout.write('\tMedian: ' + '\tMedian: '.join(self.coverage_headers))
            fout.write('\t' + '\t'.join(self.signature_headers))
            fout.write('\n')

            for genome_id in alphanumeric_sort(self.genome_stats.keys()):
                stats = self.genome_stats[genome_id]

                fout.write(genome_id)
                fout.write('\t%d' % stats.genome_size)
                fout.write('\t%.2f' % stats.mean_gc)
                fout.write('\t%.2f' % stats.median_gc)
                fout.write('\t%.2f' % stats.mean_scaffold_length)
                fout.write('\t%.2f' % stats.median_scaffold_length)

                for cov in stats.mean_coverage:
                    fout.write('\t%.2f' % cov)

                for cov in stats.median_coverage:
                    fout.write('\t%.2f' % cov)

                # signature frequencies keep the default '%f' precision
                for freq in stats.mean_signature:
                    fout.write('\t%f' % freq)

                fout.write('\n')
Example #7
0
    def _parse_data(self, infile):
        """Load pairwise genome values from a tab-separated file.

        The first line of `infile` is skipped. On every other line,
        columns 0 and 2 are taken as genome identifiers (a trailing
        ``_genes`` is removed) and columns 5 and 7 are parsed as floats.

        Sets ``self.genomes`` (identifiers in ``alphanumeric_sort`` order),
        ``self.perc_ids`` (100 minus the column 5 value) and
        ``self.perc_aln`` (the column 7 value). Both matrices are indexed
        by position in ``self.genomes``. A pair recorded in only one
        direction is used for both directions.

        Parameters
        ----------
        infile : str
            Path to the tab-separated input file.

        Raises
        ------
        IndexError
            If a line has too few columns.
        KeyError
            If a pair of genomes has no value in either direction.
        """
        data = {}
        genomes = set()
        with open(infile) as fp:
            fp.readline()  # skip the first (header) line
            for line in fp:
                fields = line.rstrip().split('\t')
                fields[0] = re.sub(r'_genes$', "", fields[0])
                fields[2] = re.sub(r'_genes$', "", fields[2])
                genomes.add(fields[0])
                genomes.add(fields[2])
                try:
                    values = [float(fields[5]), float(fields[7])]
                except IndexError:
                    # show the offending row before propagating the error
                    print(fields)
                    raise
                data.setdefault(fields[0], {})[fields[2]] = values

        self.perc_ids = np_zeros([len(genomes), len(genomes)])
        self.perc_aln = np_zeros([len(genomes), len(genomes)])
        genome_to_index = {}
        self.genomes = [None] * len(genomes)
        for n, g in enumerate(alphanumeric_sort(genomes)):
            genome_to_index[g] = n
            self.genomes[n] = g

        self.genomes = np_array(self.genomes)
        for g1, g2 in permutations(genomes, 2):
            # only a missing (g1, g2) entry triggers the reverse lookup;
            # any other error is allowed to propagate
            try:
                id_value, aln_value = data[g1][g2]
            except KeyError:
                id_value, aln_value = data[g2][g1]
            self.perc_ids[genome_to_index[g1]][genome_to_index[g2]] = 100.0 - id_value
            self.perc_aln[genome_to_index[g1], genome_to_index[g2]] = aln_value
Example #8
0
    def _parse_data(self, infile):
        """Load pairwise genome values from a tab-separated file.

        The first line of `infile` is skipped. On every other line,
        columns 0 and 2 are taken as genome identifiers (a trailing
        ``_genes`` is removed) and columns 5 and 7 are parsed as floats.

        Sets ``self.genomes`` (identifiers in ``alphanumeric_sort`` order),
        ``self.perc_ids`` (100 minus the column 5 value) and
        ``self.perc_aln`` (the column 7 value). Both matrices are indexed
        by position in ``self.genomes``. A pair recorded in only one
        direction is used for both directions.

        Parameters
        ----------
        infile : str
            Path to the tab-separated input file.

        Raises
        ------
        IndexError
            If a line has too few columns.
        KeyError
            If a pair of genomes has no value in either direction.
        """
        data = {}
        genomes = set()
        with open(infile) as fp:
            fp.readline()  # skip the first (header) line
            for line in fp:
                fields = line.rstrip().split('\t')
                fields[0] = re.sub(r'_genes$', "", fields[0])
                fields[2] = re.sub(r'_genes$', "", fields[2])
                genomes.add(fields[0])
                genomes.add(fields[2])
                try:
                    values = [float(fields[5]), float(fields[7])]
                except IndexError:
                    # show the offending row before propagating the error
                    print(fields)
                    raise
                data.setdefault(fields[0], {})[fields[2]] = values

        self.perc_ids = np_zeros([len(genomes), len(genomes)])
        self.perc_aln = np_zeros([len(genomes), len(genomes)])
        genome_to_index = {}
        self.genomes = [None] * len(genomes)
        for n, g in enumerate(alphanumeric_sort(genomes)):
            genome_to_index[g] = n
            self.genomes[n] = g

        self.genomes = np_array(self.genomes)
        for g1, g2 in permutations(genomes, 2):
            # only a missing (g1, g2) entry triggers the reverse lookup;
            # any other error is allowed to propagate
            try:
                id_value, aln_value = data[g1][g2]
            except KeyError:
                id_value, aln_value = data[g2][g1]
            self.perc_ids[genome_to_index[g1]][genome_to_index[g2]] = 100.0 - id_value
            self.perc_aln[genome_to_index[g1], genome_to_index[g2]] = aln_value
Example #9
0
    def plot(self, common_bases, quality, output_plot):
        """Render the shared base pair table to an SVG file.

        Rows are bins and columns are binning methods, both in
        ``alphanumeric_sort`` order. Two quality columns are drawn
        between the bin labels and the table body.

        Parameters
        ----------
        common_bases : d[unitem bid][binning method] -> percent common bases
          Percentage of common bases for each binning method.
        quality : d[unitem id] -> (completeness, contamination)
          Completeness and contamination of bins.
        output_plot : str
          Desired output file; '.svg' is appended when missing.
        """

        # row labels: one per bin
        row_labels = alphanumeric_sort(common_bases)

        # column labels: every binning method seen for any bin
        methods_seen = {method
                        for bin_id in common_bases
                        for method in common_bases[bin_id]}
        column_labels = alphanumeric_sort(methods_seen)

        # origin of the table within the drawing
        origin_x = 0
        origin_y = 0

        output_plot = output_plot if output_plot.endswith('.svg') else output_plot + '.svg'

        # overall figure size: method columns plus quality columns wide,
        # one row per bin high
        self.fig_size_x = origin_x + len(column_labels) * self.col_width
        self.fig_size_x += 2 * self.qual_col_width + 0.5 * self.col_width
        self.fig_size_y = origin_y + len(row_labels) * self.row_height

        drawing = svgwrite.Drawing(filename=output_plot,
                                   size=(self.fig_size_x, self.fig_size_y),
                                   profile='full',
                                   style='font-family:Arial')
        drawing.set_desc(title='UniteM shared base pair plot.')

        # legend
        self._render_legend(drawing)

        # bin label column
        text_x = origin_x
        text_y = origin_y + 0.5 * self.row_height + 0.45 * self.font_size
        self._render_label_col(drawing, row_labels, text_x, text_y,
                               'bin_labels')

        # completeness and contamination columns
        quality_header_y = origin_y - 0.5 * self.font_size
        text_x = origin_x + 0.5 * self.col_width
        text_y = origin_y + 0.5 * self.row_height + 0.45 * self.font_size
        self._render_genome_quality_cols(drawing, row_labels, quality,
                                         quality_header_y, text_x,
                                         text_y)

        # shift the table body to the right of the quality columns
        origin_x += 2 * self.qual_col_width + 0.5 * self.col_width

        # header row of binning method names
        text_x = origin_x
        method_header_y = origin_y - 0.5 * self.font_size
        self._render_label_row(drawing, column_labels, text_x,
                               method_header_y, 'binning_method_labels')

        # table body
        self._render_row(drawing, row_labels, column_labels, common_bases,
                         origin_x, origin_y)

        drawing.save()
Example #10
0
class Heatmap(AbstractPlot):
    """Plot object backed by pairwise genome values read from a file.

    On construction the input file is parsed into ``self.genomes``,
    ``self.perc_ids`` and ``self.perc_aln``, and a continuous and a
    12-colour discrete colour map are set up.
    """

    def __init__(self, infile, outfile):
        """Parse `infile` and set up colour maps.

        Parameters
        ----------
        infile : str
            Tab-separated file of pairwise genome values.
        outfile : str
            Stored as ``self.outfile``; not used during construction.
        """
        AbstractPlot.__init__(self, None)

        self.outfile = outfile
        self.genomes = None
        self._parse_data(infile)

        self.colormap = pylab.cm.bwr

        # 8-bit RGB triples, scaled to the 0-1 range for the colour map
        palette_rgb = [
            (141, 211, 199),
            (255, 255, 179),
            (190, 186, 218),
            (251, 128, 114),
            (128, 177, 211),
            (253, 180, 98),
            (179, 222, 105),
            (252, 205, 229),
            (217, 217, 217),
            (188, 128, 189),
            (204, 235, 197),
            (255, 237, 111),
        ]
        self.discreteColourMap = ListedColormap([
            (r / 255.0, g / 255.0, b / 255.0) for r, g, b in palette_rgb
        ])

    def _parse_data(self, infile):
        """Load pairwise genome values from a tab-separated file.

        The first line of `infile` is skipped. On every other line,
        columns 0 and 2 are taken as genome identifiers (a trailing
        ``_genes`` is removed) and columns 5 and 7 are parsed as floats.

        Sets ``self.genomes`` (identifiers in ``alphanumeric_sort`` order),
        ``self.perc_ids`` (100 minus the column 5 value) and
        ``self.perc_aln`` (the column 7 value). Both matrices are indexed
        by position in ``self.genomes``. A pair recorded in only one
        direction is used for both directions.

        Parameters
        ----------
        infile : str
            Path to the tab-separated input file.

        Raises
        ------
        IndexError
            If a line has too few columns.
        KeyError
            If a pair of genomes has no value in either direction.
        """
        data = {}
        genomes = set()
        with open(infile) as fp:
            fp.readline()  # skip the first (header) line
            for line in fp:
                fields = line.rstrip().split('\t')
                fields[0] = re.sub(r'_genes$', "", fields[0])
                fields[2] = re.sub(r'_genes$', "", fields[2])
                genomes.add(fields[0])
                genomes.add(fields[2])
                try:
                    values = [float(fields[5]), float(fields[7])]
                except IndexError:
                    # show the offending row before propagating the error;
                    # print(...) with one argument behaves the same on
                    # Python 2 and Python 3
                    print(fields)
                    raise
                data.setdefault(fields[0], {})[fields[2]] = values

        self.perc_ids = np_zeros([len(genomes), len(genomes)])
        self.perc_aln = np_zeros([len(genomes), len(genomes)])
        genome_to_index = {}
        self.genomes = [None] * len(genomes)
        for n, g in enumerate(alphanumeric_sort(genomes)):
            genome_to_index[g] = n
            self.genomes[n] = g

        self.genomes = np_array(self.genomes)
        for g1, g2 in permutations(genomes, 2):
            # only a missing (g1, g2) entry triggers the reverse lookup;
            # any other error is allowed to propagate
            try:
                id_value, aln_value = data[g1][g2]
            except KeyError:
                id_value, aln_value = data[g2][g1]
            self.perc_ids[genome_to_index[g1]][genome_to_index[g2]] = 100.0 - id_value
            self.perc_aln[genome_to_index[g1], genome_to_index[g2]] = aln_value