def get_overlap_threshold(inputs, pp_flag, G=[]):
    """Derive an overlap threshold from pairwise scores of known complexes.

    The complexes are read from ``inputs['dir_nm'] + inputs['comf_nm']``.
    When `pp_flag` is truthy the file is handed to ``preprocess_complexes``
    (together with `G`) and the node set of every returned graph is used;
    otherwise every line of the file is split on whitespace into a member
    set.

    Every unordered pair of complexes is scored with ``jaccard_coeff`` and
    zero scores are discarded.

    Returns
    -------
    float or int
        ``q1 + 0.5 * (q2 - q1)`` where ``q1`` and ``q2`` are the 25th and
        50th percentiles of the non-zero scores, or ``0`` when there is no
        non-zero score.
    """
    complexes_path = inputs['dir_nm'] + inputs['comf_nm']

    if pp_flag:
        graphs = preprocess_complexes(complexes_path, ' ', G)
        members = [set(graph.nodes) for graph in graphs]
    else:
        with open(complexes_path) as handle:
            members = [set(row.rstrip().split()) for row in handle]

    # score each unordered pair once, keeping only the non-zero overlaps
    scores = []
    for idx, first in enumerate(members):
        for second in members[idx + 1:]:
            score = jaccard_coeff(first, second)
            if score != 0:
                scores.append(score)

    if not scores:
        return 0

    lower = np_percentile(scores, 25)
    middle = np_percentile(scores, 50)

    return float(lower + 0.5 * (middle - lower))
def get_overlap_threshold_qi(inputs, pp_flag, G=[]):
    """Derive an overlap threshold from pairwise ``NA_threshold`` scores.

    The complexes are read from ``inputs['dir_nm'] + inputs['comf_nm']``.
    When `pp_flag` is truthy the file is handed to ``preprocess_complexes``
    (together with `G`) and the node set of every returned graph is used;
    otherwise every line of the file is split on whitespace into a member
    set.

    Every unordered pair of complexes is scored with ``NA_threshold`` and
    zero scores are discarded.

    Returns
    -------
    float or int
        ``p2 + (q1 - p2) / 2.0`` where ``p2`` and ``q1`` are the 2nd and
        25th percentiles of the non-zero scores, or ``0`` when there is no
        non-zero score.
    """
    complexes_path = inputs['dir_nm'] + inputs['comf_nm']

    if pp_flag:
        graphs = preprocess_complexes(complexes_path, ' ', G)
        members = [set(graph.nodes) for graph in graphs]
    else:
        with open(complexes_path) as handle:
            members = [set(row.rstrip().split()) for row in handle]

    # score each unordered pair once, keeping only the non-zero overlaps
    scores = []
    for idx, first in enumerate(members):
        for second in members[idx + 1:]:
            score = NA_threshold(first, second)
            if score != 0:
                scores.append(score)

    if not scores:
        return 0

    first_quartile = np_percentile(scores, 25)
    low_end = np_percentile(scores, 2)

    return float(low_end + (first_quartile - low_end) / 2.0)


#inputs = dict()
##inputs['dir_nm'] = 'humap'
##inputs['comf_nm'] = '/res_train_complexes_new_73_more.txt'
#
#inputs['dir_nm'] = 'yeast'
#inputs['comf_nm'] = '/TAP-MS.txt'
##inputs['comf_nm'] = '/mips.txt'
#
##inputs['dir_nm'] = 'toy_network'
##inputs['comf_nm'] = '/train_complexes.txt'
#
#pp_flag = 1
#inputs['graph_files_dir']='/graph_files'
#myGraphName = inputs['dir_nm'] + inputs['graph_files_dir']+ "/res_myGraph"
#with open(myGraphName, 'rb') as f:
#    myGraph = pickle_load(f)
#
#sol1 = get_overlap_threshold(inputs,pp_flag,myGraph)
#sol2=get_overlap_threshold_qi(inputs,pp_flag,myGraph)
#print(sol1)
#print(sol2)
Example #3
0
    def __writer(self, num_species, output_dir, writer_queue):
        """Write results for each species.

        Consumes ``(species, ani, af, genome_ids, results)`` tuples from
        `writer_queue` until a tuple whose species is ``None`` arrives, and
        writes two tab-separated tables into `output_dir`:

        * ``ani_species.tsv`` - one summary row per species (mean, median,
          5th and 95th percentile of the ANI and AF values, followed by the
          comma-joined genome IDs); AF statistics are multiplied by 100.
        * ``ani.tsv`` - the `results` text of each species written verbatim
          below a pairwise header.

        A progress line is rewritten on stdout after each species.

        Parameters
        ----------
        num_species : int
            Total number of species expected; only used for the progress
            message.
        output_dir : str
            Directory receiving the two output tables.
        writer_queue : queue-like
            Object whose ``get(block=True, timeout=None)`` yields the result
            tuples described above.
        """

        species_file = os.path.join(output_dir, 'ani_species.tsv')
        pairwise_file = os.path.join(output_dir, 'ani.tsv')

        # context managers guarantee both tables are flushed and closed even
        # if reading the queue or formatting a row raises
        with open(species_file, 'w') as fout, open(pairwise_file, 'w') as fout_pw:
            fout.write('Species\tNo. Sampled Genomes\tMean ANI\tMedian ANI\t5th Percentile\t95th Percentile')
            fout.write('\tMean AF\tMedian AF\t5th Percentile\t95th Percentile')
            fout.write('\tSampled Genomes\n')

            fout_pw.write('Species\tGenome 1\tGenome 2\tANI(1->2)\tANI(2->1)\tAF(1->2)\tAF(2->1)\n')

            processed = 0
            while True:
                species, ani, af, genome_ids, results = writer_queue.get(block=True, timeout=None)
                # a species of None is the sentinel that ends the stream
                if species is None:
                    break

                processed += 1
                statusStr = 'Finished processing %d of %d (%.2f%%) species.' % (processed,
                                                                                num_species,
                                                                                float(processed) * 100 / num_species)
                sys.stdout.write('%s\r' % statusStr)
                sys.stdout.flush()

                fout_pw.write(results)

                row = '%s\t%d' % (species, len(genome_ids))

                mean_ani = np_mean(ani)
                p5, median, p95 = np_percentile(ani, [5, 50, 95])
                row += '\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_ani, median, p5, p95)

                mean_af = np_mean(af)
                p5, median, p95 = np_percentile(af, [5, 50, 95])
                row += '\t%.2f\t%.2f\t%.2f\t%.2f' % (mean_af * 100,
                                                        median * 100,
                                                        p5 * 100,
                                                        p95 * 100)

                fout.write('%s\t%s\n' % (row, ','.join(genome_ids)))

            # terminate the carriage-return progress line
            sys.stdout.write('\n')
Example #4
0
    def _gene_distribution(self, seq_file):
        """Summarize the lengths of the sequences in a sequence file.

        Sequences are obtained from ``seq_io.read_seq(seq_file)``.

        Returns
        -------
        tuple
            ``(mean, max, min, p10, p50, p90)`` of the sequence lengths,
            where ``p10``, ``p50`` and ``p90`` are the 10th, 50th and 90th
            percentiles.
        """

        lengths = [
            len(sequence) for _seq_id, sequence in seq_io.read_seq(seq_file)
        ]

        p10, p50, p90 = np_percentile(lengths, [10, 50, 90])

        mean_len = np_mean(lengths)
        longest = max(lengths)
        shortest = min(lengths)

        return mean_len, longest, shortest, p10, p50, p90
Example #5
0
    def _render_collapsed_rectangular(self, node, collapsed_group):
        """Draw a collapsed lineage as a four-point polygon in a rectangular tree.

        The two far corners of the polygon are placed at the
        ``collapse_branch1_percentile`` and ``collapse_branch2_percentile``
        percentiles of the leaf-to-node distances, rescaled by
        ``self.height / self.deepest_node``. With the 'TRIANGLE' display
        method the second corner is pulled back to the node itself. The
        polygon, and optionally a text label, are added to `collapsed_group`.
        """

        # distance from every leaf below the node back to the node
        leaf_dists = [
            dist_to_ancestor(leaf, node)
            for leaf in node.preorder_iter(lambda n: n.is_leaf())
        ]

        requested = [
            self.collapse_branch1_percentile, self.collapse_branch2_percentile
        ]
        branch1, branch2 = np_percentile(leaf_dists, requested)
        branch1 = (branch1 / self.deepest_node) * self.height
        branch2 = (branch2 / self.deepest_node) * self.height

        if self.collapse_display_method == 'TRIANGLE':
            branch2 = 0

        # look up how the collapsed lineage should be drawn
        _support, taxon, _aux_info = parse_label(node.label)
        style = self.collapse_map[node]
        lineage_name, color, alpha, stroke_width, stroke_color = style

        half_height = 0.5 * node.collapsed_height
        upper_y = node.y + half_height
        lower_y = node.y - half_height
        pts = [
            (node.x, upper_y),
            (node.x, lower_y),
            (node.x + branch1, lower_y),
            (node.x + branch2, upper_y),
        ]

        polygon = self.dwg.polygon(points=pts)
        polygon.fill(color=color, opacity=alpha)
        polygon.stroke(color=stroke_color, width=stroke_width)
        collapsed_group.add(polygon)

        if not self.collapse_show_labels:
            return

        if self.collapse_label_position == 'INTERNAL':
            label_x = node.x + 0.01 * self.inch
        elif self.collapse_label_position == 'EXTERNAL':
            label_x = max(node.x + branch1, node.x + branch2)

        label = lineage_name
        if self.collapse_show_leaf_count:
            label += ' [%d]' % len(leaf_dists)

        render_label(self.dwg, label_x, node.y, 0, label,
                     self.collapse_font_size, self.collapse_font_color,
                     collapsed_group)
Example #6
0
    def run(self, input_tree, trusted_taxa_file, min_children, taxonomy_file,
            output_dir):
        """Calculate distribution of branch lengths at each taxonomic rank.

        Three tab-separated tables are written to `output_dir`, each named
        after the base name of `input_tree`: ``<name>.taxa_bl_dist.tsv``
        (per named taxon), ``<name>.rank_bl_dist.tsv`` (per rank) and
        ``<name>.node_bl_dist.tsv`` (written by ``self._write_bl_dist``).
        A per-rank count of taxa is also printed to stdout.

        Parameters
        ----------
        input_tree : str
            Name of input tree.
        trusted_taxa_file : str
            File specifying trusted taxa to consider when inferring distribution. Set to None to consider all taxa.
        min_children : int
            Only consider taxa with at least the specified number of children taxa when inferring distribution.
        taxonomy_file : str
            File containing taxonomic information for leaf nodes (if NULL, read taxonomy from tree).
        output_dir : str
            Desired output directory.
        """

        tree = dendropy.Tree.get_from_path(input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        input_tree_name = os.path.splitext(os.path.basename(input_tree))[0]

        # pull taxonomy from tree
        if not taxonomy_file:
            self.logger.info('Reading taxonomy from tree.')
            taxonomy_file = os.path.join(output_dir,
                                         '%s.taxonomy.tsv' % input_tree_name)
            taxonomy = Taxonomy().read_from_tree(input_tree)
            Taxonomy().write(taxonomy, taxonomy_file)
        else:
            self.logger.info('Reading taxonomy from file.')
            taxonomy = Taxonomy().read(taxonomy_file)

        # read trusted taxa
        # NOTE(review): trusted_taxa is read but never used below; the
        # filter call is given an empty set instead - confirm this is
        # intended.
        trusted_taxa = None
        if trusted_taxa_file:
            trusted_taxa = read_taxa_file(trusted_taxa_file)

        # determine taxa to be used for inferring distribution
        taxa_for_dist_inference = filter_taxa_for_dist_inference(
            tree, taxonomy, set(), min_children, -1)

        # determine branch lengths to leaves for named lineages
        rank_bl_dist = defaultdict(list)
        taxa_bl_dist = defaultdict(list)
        taxa_at_rank = defaultdict(list)
        for node in tree.postorder_node_iter():
            if node.is_leaf() or not node.label:
                continue

            _support, taxon, _auxiliary_info = parse_label(node.label)
            if not taxon:
                continue

            # get most specific rank in multi-rank taxa string
            taxa = [t.strip() for t in taxon.split(';')]
            taxon = taxa[-1]

            # the first three characters of the taxon are its rank prefix
            most_specific_rank = taxon[0:3]
            taxa_at_rank[Taxonomy.rank_index[most_specific_rank]].append(taxon)

            for n in node.leaf_iter():
                dist_to_node = self._dist_to_ancestor(n, node)

                # every taxon named on this node receives the distance
                for t in taxa:
                    taxa_bl_dist[t].append(dist_to_node)

            rank = Taxonomy.rank_labels[
                Taxonomy.rank_index[most_specific_rank]]
            if rank != 'species' or Taxonomy().validate_species_name(taxon):
                if taxon in taxa_for_dist_inference:
                    rank_bl_dist[rank].append(np_mean(taxa_bl_dist[taxon]))

        # report number of taxa at each rank
        print('')
        print('Rank\tTaxa\tTaxa for Inference')
        for rank, taxa in taxa_at_rank.items():
            taxa_for_inference = [
                x for x in taxa if x in taxa_for_dist_inference
            ]
            print('%s\t%d\t%d' % (Taxonomy.rank_labels[rank], len(taxa),
                                  len(taxa_for_inference)))
        print('')

        # report results sorted by rank, then alphabetically within a rank
        sorted_taxon = []
        for rank_prefix in Taxonomy.rank_prefixes:
            sorted_taxon += sorted(
                taxon for taxon in taxa_bl_dist
                if taxon.startswith(rank_prefix))

        # report results for each named group; the context manager closes
        # the table even if a statistic or format call raises
        taxa_file = os.path.join(output_dir,
                                 '%s.taxa_bl_dist.tsv' % input_tree_name)
        with open(taxa_file, 'w') as fout:
            fout.write(
                'Taxa\tUsed for Inference\tMean\tStd\t5th\t10th\t50th\t90th\t95th\n'
            )
            for taxon in sorted_taxon:
                dist = taxa_bl_dist[taxon]

                p = np_percentile(dist, [5, 10, 50, 90, 95])
                fout.write(
                    '%s\t%s\t%g\t%g\t%g\t%g\t%g\t%g\t%g\n' %
                    (taxon, str(taxon in taxa_for_dist_inference),
                     np_mean(dist), np_std(dist), p[0], p[1], p[2], p[3],
                     p[4]))

        # report results for each taxonomic rank
        rank_file = os.path.join(output_dir,
                                 '%s.rank_bl_dist.tsv' % input_tree_name)
        with open(rank_file, 'w') as fout:
            fout.write('Rank\tMean\tStd\t5th\t10th\t50th\t90th\t95th\n')
            for rank in Taxonomy.rank_labels:
                dist = rank_bl_dist[rank]
                p = np_percentile(dist, [5, 10, 50, 90, 95])
                fout.write('%s\t%g\t%g\t%g\t%g\t%g\t%g\t%g\n' %
                           (rank, np_mean(dist), np_std(dist), p[0], p[1],
                            p[2], p[3], p[4]))

        # report results for each node
        output_bl_file = os.path.join(output_dir,
                                      '%s.node_bl_dist.tsv' % input_tree_name)
        self._write_bl_dist(tree, output_bl_file)
Example #7
0
    def _distribution_summary_plot(self, phylum_rel_dists, taxa_for_dist_inference, plot_file):
        """Summary plot showing the distribution of taxa at each taxonomic rank under different rootings.

        Each rank occupies one horizontal band of the plot (y = rank position).
        Within a band the method draws percentile and boundary marker lines,
        per-category histograms and one scatter point per taxon. The figure
        is saved both as an interactive HTML page (same path as `plot_file`
        with the extension replaced by '.html') and as `plot_file` itself.

        NOTE: relies on `dict.iteritems()` and `xrange`, i.e. Python 2.

        Parameters
        ----------
        phylum_rel_dists: phylum_rel_dists[phylum][rank_index][taxon] -> relative divergences
            Relative divergence of taxon at each rank for different phylum-level rootings.
        taxa_for_dist_inference : iterable
            Taxa to be considered when inferring distributions.
        plot_file : str
            Desired name of output plot.
        """

        self.fig.clear()
        self.fig.set_size_inches(12, 6)
        ax = self.fig.add_subplot(111)

        # determine median relative distance for each taxa
        medians_for_taxa = self.taxa_median_rd(phylum_rel_dists)

        # create percentile and classification boundary lines
        # (the `percentiles` dict is filled in but not read again in this method)
        percentiles = {}
        for i, rank in enumerate(sorted(medians_for_taxa.keys())):
            # only taxa selected for inference contribute to the percentiles
            v = [np_median(dists) for taxon, dists in medians_for_taxa[rank].iteritems() if taxon in taxa_for_dist_inference]
            p10, p50, p90 = np_percentile(v, [10, 50, 90])
            # the median marker is drawn taller than the 10th/90th markers
            ax.plot((p10, p10), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
            ax.plot((p50, p50), (i, i + 0.5), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
            ax.plot((p90, p90), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)

            # boundary lines at fixed offsets from the median; only those
            # falling strictly inside (0, 1) are drawn
            for b in [-0.2, -0.1, 0.1, 0.2]:
                boundary = p50 + b
                if boundary < 1.0 and boundary > 0.0:
                    if abs(b) == 0.1:
                        c = (1.0, 0.65, 0.0)  # orange
                    else:
                        c = (1.0, 0.0, 0.0)  # red
                    ax.plot((boundary, boundary), (i, i + 0.5), c=c, lw=2, zorder=2)

            percentiles[i] = [p10, p50, p90]

        # create scatter plot and results table
        x = []
        y = []
        c = []
        labels = []
        rank_labels = []
        for i, rank in enumerate(sorted(medians_for_taxa.keys())):
            rank_label = Taxonomy.rank_labels[rank]
            rank_labels.append(rank_label + ' (%d)' % len(medians_for_taxa[rank]))

            # median distances split by category: inference taxa (mono),
            # numerically-suffixed taxa (poly) and the remainder
            mono = []
            poly = []
            no_inference = []
            for clade_label, dists in medians_for_taxa[rank].iteritems():
                md = np_median(dists)
                x.append(md)
                y.append(i)
                labels.append(clade_label)

                if is_integer(clade_label.split('^')[-1]):
                    # taxa with a numerical suffix after a caret indicate
                    # polyphyletic groups when decorated with tax2tree
                    c.append((1.0, 0.0, 0.0))
                    poly.append(md)
                elif clade_label not in taxa_for_dist_inference:
                    c.append((0.3, 0.3, 0.3))
                    no_inference.append(md)
                else:
                    c.append((0.0, 0.0, 1.0))
                    mono.append(md)

            # histogram for each rank
            mono = np_array(mono)
            no_inference = np_array(no_inference)
            poly = np_array(poly)
            binwidth = 0.025
            bins = np_arange(0, 1.0 + binwidth, binwidth)

            # weights rescale the histogram by its own largest bin count
            # NOTE(review): mono_max_count is 0 when `mono` is empty, making
            # the reciprocal below a division by zero - confirm callers
            # always supply at least one inference taxon per rank.
            mono_max_count = max(np_histogram(mono, bins=bins)[0])
            mono_weights = np_ones_like(mono) * (1.0 / mono_max_count)

            # fraction of this rank's taxa that fall in the `mono` category
            w = float(len(mono)) / (len(mono) + len(poly) + len(no_inference))
            n, b, p = ax.hist(mono, bins=bins,
                      color=(0.0, 0.0, 1.0),
                      alpha=0.25,
                      weights=0.9 * w * mono_weights,
                      bottom=i,
                      lw=0,
                      zorder=0)

            # the remaining histograms are stacked on top of the `mono` bars
            # by using the mono bar heights `n` as their bottom offset
            if len(no_inference) > 0:
                no_inference_max_count = max(np_histogram(no_inference, bins=bins)[0])
                no_inference_weights = np_ones_like(no_inference) * (1.0 / no_inference_max_count)

                ax.hist(no_inference, bins=bins,
                          color=(0.3, 0.3, 0.3),
                          alpha=0.25,
                          weights=0.9 * (1.0 - w) * no_inference_weights,
                          bottom=i + n,
                          lw=0,
                          zorder=0)

            if len(poly) > 0:
                poly_max_count = max(np_histogram(poly, bins=bins)[0])
                poly_weights = np_ones_like(poly) * (1.0 / poly_max_count)

                ax.hist(poly, bins=bins,
                          color=(1.0, 0.0, 0.0),
                          alpha=0.25,
                          weights=0.9 * (1.0 - w) * poly_weights,
                          bottom=i + n,
                          lw=0,
                          zorder=0)

        scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

        # set plot elements
        ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

        ax.set_xlabel('relative distance')
        ax.set_xticks(np_arange(0, 1.05, 0.1))
        ax.set_xlim([-0.01, 1.01])

        ax.set_ylabel('rank (no. taxa)')
        ax.set_yticks(xrange(0, len(medians_for_taxa)))
        ax.set_ylim([-0.2, len(medians_for_taxa) - 0.01])
        ax.set_yticklabels(rank_labels)

        self.prettify(ax)

        # make plot interactive
        mpld3.plugins.clear(self.fig)
        mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
        mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
        # HTML version is written next to plot_file, with its extension swapped
        mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

        self.fig.tight_layout(pad=1)
        self.fig.savefig(plot_file, dpi=self.dpi)
Example #8
0
    def _distribution_plot(self, rel_dists, rel_dist_thresholds,
                           taxa_for_dist_inference, distribution_table,
                           plot_file):
        """Create plot showing the distribution of taxa at each taxonomic rank.

        For every rank a fitted normal curve, 10th/50th/90th percentile
        lines and one scatter point per taxon are drawn. A tab-separated
        table flagging rank and percentile outliers is written to
        `distribution_table`. The figure is saved as `plot_file` and as an
        interactive HTML page with the same path and a '.html' extension.

        The caller's `rel_dist_thresholds` list is not modified.

        NOTE: relies on `dict.iteritems()` and `xrange`, i.e. Python 2.

        Parameters
        ----------
        rel_dists: d[rank_index][taxon] -> relative divergence
            Relative divergence of taxa at each rank.
        rel_dist_thresholds: list
            Relative distances cutoffs for defining ranks.
        taxa_for_dist_inference : iterable
            Taxa to be considered when inferring distributions.
        distribution_table : str
            Desired name of output table with distribution information.
        plot_file : str
            Desired name of output plot.
        """

        self.fig.clear()
        self.fig.set_size_inches(12, 6)
        ax = self.fig.add_subplot(111)

        # create normal distributions
        for i, rank in enumerate(sorted(rel_dists.keys())):
            v = [
                dist for taxa, dist in rel_dists[rank].iteritems()
                if taxa in taxa_for_dist_inference
            ]
            u = np_mean(v)
            rv = norm(loc=u, scale=np_std(v))
            x = np_linspace(rv.ppf(0.001), rv.ppf(0.999), 1000)
            nd = rv.pdf(x)
            # curve is rescaled to a peak height of 0.75 within the rank's band
            ax.plot(x, 0.75 * (nd / max(nd)) + i, 'b-', alpha=0.6, zorder=2)
            ax.plot((u, u), (i, i + 0.5), 'b-', zorder=2)

        # create percentile lines
        percentiles = {}
        for i, rank in enumerate(sorted(rel_dists.keys())):
            v = [
                dist for taxa, dist in rel_dists[rank].iteritems()
                if taxa in taxa_for_dist_inference
            ]
            p10, p50, p90 = np_percentile(v, [10, 50, 90])
            ax.plot((p10, p10), (i, i + 0.5), 'r-', zorder=2)
            ax.plot((p50, p50), (i, i + 0.5), 'r-', zorder=2)
            ax.plot((p90, p90), (i, i + 0.5), 'r-', zorder=2)

            percentiles[i] = [p10, p50, p90]

        # append boundary for species on a copy, so the list owned by the
        # caller does not grow on every call
        rel_dist_thresholds = list(rel_dist_thresholds) + [1.0]

        # create scatter plot and results table
        x = []
        y = []
        c = []
        labels = []
        rank_labels = []
        # the context manager closes the table even if formatting raises
        with open(distribution_table, 'w') as fout:
            fout.write(
                'Taxa\tRelative Distance\tRank cutoff\tRank outlier\tP10\tMedian\tP90\tPercentile outlier\n'
            )
            for i, rank in enumerate(sorted(rel_dists.keys())):
                rank_label = Taxonomy.rank_labels[rank]
                rank_labels.append(rank_label + ' (%d)' % len(rel_dists[rank]))

                for clade_label, dist in rel_dists[rank].iteritems():
                    x.append(dist)
                    y.append(i)
                    labels.append(clade_label)

                    if clade_label in taxa_for_dist_inference:
                        c.append((0.0, 0.0, 0.5))
                    else:
                        c.append((0.5, 0.5, 0.5))

                    p10, p50, p90 = percentiles[i]
                    percentile_outlier = not (dist >= p10 and dist <= p90)

                    rank_cutoff = rel_dist_thresholds[i]
                    if i == 0:
                        # first rank has no lower cutoff to compare against
                        rank_outlier = dist > rank_cutoff
                    else:
                        upper_rank_cutoff = rel_dist_thresholds[i - 1]
                        rank_outlier = not (dist >= upper_rank_cutoff
                                            and dist <= rank_cutoff)

                    row = [clade_label, dist, rank_cutoff, str(rank_outlier)]
                    row += percentiles[i] + [str(percentile_outlier)]
                    fout.write('%s\t%.2f\t%.2f\t%s\t%.2f\t%.2f\t%.2f\t%s\n' %
                               tuple(row))

        scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

        # set plot elements
        ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

        ax.set_xlabel('relative distance')
        ax.set_xticks(np_arange(0, 1.05, 0.1))
        ax.set_xlim([-0.05, 1.05])

        ax.set_ylabel('rank (no. taxa)')
        ax.set_yticks(xrange(0, len(rel_dists)))
        ax.set_ylim([-0.2, len(rel_dists) - 0.01])
        ax.set_yticklabels(rank_labels)

        self.prettify(ax)

        # plot relative divergence threshold lines
        y_min, y_max = ax.get_ylim()
        for threshold in rel_dist_thresholds[
                0:-1]:  # don't draw species boundary
            ax.plot((threshold, threshold), (y_min, y_max), color='r', ls='--')
            ax.text(threshold + 0.001,
                    y_max,
                    '%.3f' % threshold,
                    horizontalalignment='center')

        # make plot interactive
        mpld3.plugins.connect(
            self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
        mpld3.plugins.connect(self.fig,
                              mpld3.plugins.MousePosition(fontsize=10))
        mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

        self.fig.tight_layout(pad=1)
        self.fig.savefig(plot_file, dpi=96)
    def _distribution_plot(self, rel_dists, rel_dist_thresholds, taxa_for_dist_inference, distribution_table, plot_file):
        """Create plot showing the distribution of taxa at each taxonomic rank.

        For every rank a fitted normal curve, 10th/50th/90th percentile
        lines and one scatter point per taxon are drawn. A tab-separated
        table flagging rank and percentile outliers is written to
        `distribution_table`. The figure is saved as `plot_file` and as an
        interactive HTML page with the same path and a '.html' extension.

        The caller's `rel_dist_thresholds` list is not modified.

        NOTE: relies on `dict.iteritems()` and `xrange`, i.e. Python 2.

        Parameters
        ----------
        rel_dists: d[rank_index][taxon] -> relative divergence
            Relative divergence of taxa at each rank.
        rel_dist_thresholds: list
            Relative distances cutoffs for defining ranks.
        taxa_for_dist_inference : iterable
            Taxa to be considered when inferring distributions.
        distribution_table : str
            Desired name of output table with distribution information.
        plot_file : str
            Desired name of output plot.
        """

        self.fig.clear()
        self.fig.set_size_inches(12, 6)
        ax = self.fig.add_subplot(111)

        # create normal distributions
        for i, rank in enumerate(sorted(rel_dists.keys())):
            v = [dist for taxa, dist in rel_dists[rank].iteritems() if taxa in taxa_for_dist_inference]
            u = np_mean(v)
            rv = norm(loc=u, scale=np_std(v))
            x = np_linspace(rv.ppf(0.001), rv.ppf(0.999), 1000)
            nd = rv.pdf(x)
            # curve is rescaled to a peak height of 0.75 within the rank's band
            ax.plot(x, 0.75 * (nd / max(nd)) + i, 'b-', alpha=0.6, zorder=2)
            ax.plot((u, u), (i, i + 0.5), 'b-', zorder=2)

        # create percentile lines
        percentiles = {}
        for i, rank in enumerate(sorted(rel_dists.keys())):
            v = [dist for taxa, dist in rel_dists[rank].iteritems() if taxa in taxa_for_dist_inference]
            p10, p50, p90 = np_percentile(v, [10, 50, 90])
            ax.plot((p10, p10), (i, i + 0.5), 'r-', zorder=2)
            ax.plot((p50, p50), (i, i + 0.5), 'r-', zorder=2)
            ax.plot((p90, p90), (i, i + 0.5), 'r-', zorder=2)

            percentiles[i] = [p10, p50, p90]

        # append boundary for species on a copy, so the list owned by the
        # caller does not grow on every call
        rel_dist_thresholds = list(rel_dist_thresholds) + [1.0]

        # create scatter plot and results table
        x = []
        y = []
        c = []
        labels = []
        rank_labels = []
        # the context manager closes the table even if formatting raises
        with open(distribution_table, 'w') as fout:
            fout.write('Taxa\tRelative Distance\tRank cutoff\tRank outlier\tP10\tMedian\tP90\tPercentile outlier\n')
            for i, rank in enumerate(sorted(rel_dists.keys())):
                rank_label = Taxonomy.rank_labels[rank]
                rank_labels.append(rank_label + ' (%d)' % len(rel_dists[rank]))

                for clade_label, dist in rel_dists[rank].iteritems():
                    x.append(dist)
                    y.append(i)
                    labels.append(clade_label)

                    if clade_label in taxa_for_dist_inference:
                        c.append((0.0, 0.0, 0.5))
                    else:
                        c.append((0.5, 0.5, 0.5))

                    p10, p50, p90 = percentiles[i]
                    percentile_outlier = not (dist >= p10 and dist <= p90)

                    rank_cutoff = rel_dist_thresholds[i]
                    if i == 0:
                        # first rank has no lower cutoff to compare against
                        rank_outlier = dist > rank_cutoff
                    else:
                        upper_rank_cutoff = rel_dist_thresholds[i - 1]
                        rank_outlier = not (dist >= upper_rank_cutoff and dist <= rank_cutoff)

                    row = [clade_label, dist, rank_cutoff, str(rank_outlier)]
                    row += percentiles[i] + [str(percentile_outlier)]
                    fout.write('%s\t%.2f\t%.2f\t%s\t%.2f\t%.2f\t%.2f\t%s\n' % tuple(row))

        scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

        # set plot elements
        ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

        ax.set_xlabel('relative distance')
        ax.set_xticks(np_arange(0, 1.05, 0.1))
        ax.set_xlim([-0.05, 1.05])

        ax.set_ylabel('rank (no. taxa)')
        ax.set_yticks(xrange(0, len(rel_dists)))
        ax.set_ylim([-0.2, len(rel_dists) - 0.01])
        ax.set_yticklabels(rank_labels)

        self.prettify(ax)

        # plot relative divergence threshold lines
        y_min, y_max = ax.get_ylim()
        for threshold in rel_dist_thresholds[0:-1]:  # don't draw species boundary
            ax.plot((threshold, threshold), (y_min, y_max), color='r', ls='--')
            ax.text(threshold + 0.001, y_max, '%.3f' % threshold, horizontalalignment='center')

        # make plot interactive
        mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
        mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
        mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

        self.fig.tight_layout(pad=1)
        self.fig.savefig(plot_file, dpi=96)
Example #10
0
def feature_extract(inputs, complex_graphs, test_complex_graphs, G):
    """Build (or load) labelled feature matrices for train and test complexes.

    Side effects:

    * pickles a recommended maximum complex size - the smaller of the
      largest training size and ``ceil(q3 + 4.5 * (q3 - q1))`` of the
      training sizes - to
      ``inputs['dir_nm'] + inputs['train_test_files_dir'] + "/res_max_size_search"``;
    * saves two box plots of complex sizes (training only, and training
      plus test) under the ``inputs['dir_nm'] + inputs['out_comp_nm']``
      prefix.

    When ``inputs['model_type']`` is "tpot" and ``inputs['mode']`` is
    "non_gen" the matrices are read with ``read_from_csv`` from
    ``inputs['train_feat_mat']`` / ``inputs['test_feat_mat']``; otherwise
    they are produced by ``create_feat_mat`` and ``extract_features``.

    Returns
    -------
    tuple
        ``(max_size_train, max_size_test, X_pos_test, X_neg_test, X_test,
        y_test, X_pos, y_pos, X, y, X_neg, y_neg)``
    """
    G_nodes = G.nodes()
    n_feats = inputs['feats']
    out_comp_nm = inputs['dir_nm'] + inputs['out_comp_nm']
    mode = inputs['mode']

    def save_size_boxplot(size_values, title, file_suffix):
        # Box plot of sizes to know the outliers (for setting step size in
        # sampling)
        fig = plt.figure()
        sns_boxplot(size_values)
        plt.xlabel("Size")
        plt.title(title)
        plt.savefig(out_comp_nm + file_suffix)
        plt.close(fig)

    sizes = list(map(len, complex_graphs))

    # quartile-based ceiling on complex size, ignoring outliers
    lower_quartile = np_percentile(sizes, 25)
    upper_quartile = np_percentile(sizes, 75)
    max_wo_outliers = math_ceil(upper_quartile + 4.5 *
                                (upper_quartile - lower_quartile))

    max_size_train = max(sizes)
    recommended_max_size = min(max_size_train, max_wo_outliers)

    max_size_path = (inputs['dir_nm'] + inputs['train_test_files_dir'] +
                     "/res_max_size_search")
    with open(max_size_path, 'wb') as handle:
        pickle_dump(recommended_max_size, handle)

    sizes_test = list(map(len, test_complex_graphs))
    max_size_test = max(sizes_test)

    save_size_boxplot(sizes, "Size distribution of training complexes",
                      "_known_train_size_dist_box_plot")
    save_size_boxplot(sizes + sizes_test,
                      "Size distribution of known complexes",
                      "_known_size_dist_box_plot")

    read_precomputed = inputs['model_type'] == "tpot" and mode == "non_gen"

    if not read_precomputed:
        logging_info("Feature extraction...")

        feat_list = [
            "dens", "nodes", "degree_max", "degree_mean", "degree_median",
            "degree_var", "CC_max", "CC_mean", "CC_var", "edge_wt_mean",
            "edge_wt_max", "edge_wt_var", "DC_mean", "DC_var", "DC_max", "sv1",
            "sv2", "sv3", "complex"
        ]

        X_pos = create_feat_mat(complex_graphs, n_feats)
        X_pos_test = create_feat_mat(test_complex_graphs, n_feats)

        # positives of both splits are stacked and handed to each extraction
        X_allpos = np_vstack((X_pos, X_pos_test))
        n_allpos = len(X_allpos)

        train_result = extract_features(
            out_comp_nm, 'train', max_size_train, inputs, G_nodes, feat_list,
            X_pos, X_allpos, n_allpos, sizes)
        y, X, X_pos, y_pos, X_neg, y_neg = train_result

        test_result = extract_features(
            out_comp_nm, 'test', max_size_test, inputs, G_nodes, feat_list,
            X_pos_test, X_allpos, n_allpos, sizes_test)
        y_test, X_test, X_pos_test, y_pos_test, X_neg_test, y_neg_test = test_result

        logging_info("Finished Feature extraction")
    else:
        # NOTE: the original author flagged X_pos / y_pos in this branch as
        # something to change later
        logging_info("Reading labeled feature matrix from file...")

        train_result = read_from_csv(inputs['train_feat_mat'])
        y, X, X_pos, y_pos, X_neg, y_neg = train_result

        test_result = read_from_csv(inputs['test_feat_mat'])
        y_test, X_test, X_pos_test, y_pos_test, X_neg_test, y_neg_test = test_result

        logging_info("Finished reading feature matrix")

    return (max_size_train, max_size_test, X_pos_test, X_neg_test, X_test,
            y_test, X_pos, y_pos, X, y, X_neg, y_neg)
Example #11
0
def _do_combine(hdu_no: int, progress: float, progress_step: float,
                data_width: int, data_height: int,
                input_data: List[Union[pyfits.HDUList,
                                       Tuple[ndarray, pyfits.Header]]],
                mode: str = 'average', scaling: Optional[str] = None,
                rejection: Optional[str] = None, min_keep: int = 2,
                percentile: float = 50.0,
                lo: Optional[float] = None, hi: Optional[float] = None,
                max_mem_mb: float = 100.0,
                callback: Optional[callable] = None) \
        -> Tuple[Union[ndarray, ma.MaskedArray], float]:
    """
    Combine the given HDUs from all input images; used by :func:`combine` to
    get a stack of either all input images or, if lucky imaging is enabled,
    of their subset

    The function works in three stages: (1) if `scaling` is set, one scaling
    factor per input image is computed and a reference factor is chosen;
    (2) the images are walked in chunks of rows sized from `max_mem_mb`;
    (3) within each chunk, data are optionally scaled, outliers are
    optionally masked, and the chunk is reduced along the image axis (axis 0
    of the per-chunk data cube). The per-chunk results are finally stacked
    vertically.

    :param hdu_no: index of the HDU to read from inputs that are
        :class:`pyfits.HDUList` instances; not used for tuple inputs, whose
        first item is taken as the data
    :param progress: base progress value reported to `callback`
    :param progress_step: progress range covered by this call; when `scaling`
        is set, the first half is reported while computing scaling factors
        and the second half while stacking
    :param data_width: added once per input image to the per-row memory
        estimate when rejection is enabled or the image is masked
        (presumably one mask byte per pixel -- TODO confirm)
    :param data_height: number of image rows; chunks cover
        ``range(0, data_height, chunksize)``
    :param input_data: input images, each either an HDU list or
        a (data, header) pair
    :param mode: stacking mode: "average", "sum", or "percentile"
    :param scaling: optional scaling mode: "average", "percentile", or "mode"
    :param rejection: optional outlier rejection mode: "chauvenet", "iraf",
        "minmax", or "sigclip"
    :param min_keep: passed to :func:`chauvenet` as `min_vals`; for "iraf",
        the minimum of ``n - (lo + hi)`` accepted without error; for
        "sigclip", rejection stops at pixels where ``n - min_keep`` values
        are already masked
    :param percentile: percentile used for `scaling` = "percentile" and for
        `mode` = "percentile"; the value 50 is routed to the median functions
    :param lo: "iraf": number of smallest values to mask per pixel (default
        1); "minmax": values below `lo` are masked (only if `hi` is also
        given); "sigclip": lower limit in units of sigma (default 3)
    :param hi: "iraf": number of largest values to mask per pixel (default
        1); "minmax": values above `hi` are masked (only if `lo` is also
        given); "sigclip": upper limit in units of sigma (default 3)
    :param max_mem_mb: memory budget in megabytes used to derive the number
        of rows processed per chunk
    :param callback: optional callable invoked with a single progress value

    :return: image stack data and rejection percent; NOTE(review): the second
        item is accumulated from ``datacube.mask.sum()`` over all chunks,
        i.e. it is a count of masked values rather than a percentage --
        presumably normalized by the caller, confirm

    :raises ValueError: for an unknown scaling, rejection, or stacking mode,
        for "iraf" limits that would keep fewer than `min_keep` values, for
        ``lo > hi`` with "minmax", and for negative "sigclip" limits
    """
    n = len(input_data)

    # Calculate scaling factors
    k_ref, k = None, []
    if scaling:
        for data_no, f in enumerate(input_data):
            if isinstance(f, pyfits.HDUList):
                data = f[hdu_no].data
            else:
                data = f[0]
            if scaling == 'average':
                k.append(data.mean())
            elif scaling == 'percentile':
                if percentile == 50:
                    k.append(
                        median(data) if not isinstance(data, ma.MaskedArray)
                        else ma.median(data))
                else:
                    k.append(
                        np_percentile(data, percentile)
                        if not isinstance(data, ma.MaskedArray)
                        else np_percentile(data.compressed(), percentile))
            elif scaling == 'mode':
                # Compute modal values from histograms; convert to integer
                # and assume 2 x 16-bit data range
                if isinstance(data, ma.MaskedArray):
                    data = data.compressed()
                else:
                    data = data.ravel()
                # initial=0 caps min_val at 0, so data are shifted only when
                # they contain negative values
                min_val = data.min(initial=0)
                k.append(
                    argmax(bincount(
                        (data - min_val).clip(0, 2*0x10000 - 1)
                        .astype(int32))) + min_val)
            else:
                raise ValueError(
                    'Unknown scaling mode "{}"'.format(scaling))
            if callback is not None:
                # Scaling accounts for the first half of progress_step
                callback(progress + (data_no + 1)/n/2*progress_step)

        # Normalize to the first frame with non-zero average; keep images
        # with zero or same average as is
        k_ref = k[0]
        if not k_ref:
            for ki in k[1:]:
                if ki:
                    k_ref = ki
                    break

    # Process data in chunks to fit in the maximum amount of RAM allowed
    # rowsize is the estimated number of bytes needed to hold one row of
    # every input image (plus data_width per image when a mask is involved)
    rowsize = 0
    for data in input_data:
        if isinstance(data, pyfits.HDUList):
            data = data[hdu_no].data
        else:
            data = data[0]
        rowsize += data[0].nbytes
        if rejection or isinstance(data, ma.MaskedArray):
            rowsize += data_width
    # Number of rows per chunk, clamped to [1, data_height]
    chunksize = min(max(int(max_mem_mb*(1 << 20)/rowsize), 1), data_height)
    while chunksize > 1:
        # Use as small chunks as possible but keep their total number
        if len(list(range(0, data_height, chunksize - 1))) > \
                len(list(range(0, data_height, chunksize))):
            break
        chunksize -= 1
    chunks = []
    rej_percent = 0
    for chunk in range(0, data_height, chunksize):
        # One row slice per input image for the current chunk
        datacube = [
            f[hdu_no].data[chunk:chunk + chunksize]
            if isinstance(f, pyfits.HDUList) else f[0][chunk:chunk + chunksize]
            for f in input_data
        ]
        if k_ref:
            # Scale data
            # NOTE(review): the items of datacube are slices of the input
            # arrays, so this in-place multiplication presumably modifies
            # the caller's data -- confirm that this is intended
            for data, ki in zip(datacube, k):
                if ki not in (0, k_ref):
                    data *= k_ref/ki

        # Reject outliers
        if rejection or any(isinstance(data, ma.MaskedArray)
                            for data in datacube):
            datacube = ma.masked_array(datacube)
            if not datacube.mask.shape:
                # No initially masked data, but we'll need an array instead
                # of mask=False to do slicing operations
                datacube.mask = full(datacube.shape, datacube.mask)
        else:
            datacube = array(datacube)

        if rejection == 'chauvenet':
            datacube.mask = chauvenet(datacube, min_vals=min_keep)
        elif rejection == 'iraf':
            # Defaults are assigned to lo/hi themselves, so they persist for
            # the subsequent chunks
            if lo is None:
                lo = 1
            if hi is None:
                hi = 1
            if n - (lo + hi) < min_keep:
                raise ValueError(
                    'IRAF rejection with lo={}, hi={} would keep less than '
                    '{} values for a {}-image set'.format(lo, hi, min_keep, n))
            if lo or hi:
                # Mask "lo" smallest values and "hi" largest values along
                # the 0th axis
                order = datacube.argsort(0)
                mg = tuple(i.ravel() for i in indices(datacube.shape[1:]))
                # j = -hi..-1 addresses the end of the sort order,
                # j = 0..lo-1 addresses its beginning
                for j in range(-hi, lo):
                    datacube.mask[(order[j].ravel(),) + mg] = True
                del order, mg
        elif rejection == 'minmax':
            # Nothing is rejected unless both limits are given
            if lo is not None and hi is not None:
                if lo > hi:
                    raise ValueError(
                        'lo={} > hi={} for minmax rejection'.format(lo, hi))
                datacube.mask[((datacube < lo) |
                               (datacube > hi)).nonzero()] = True
                if datacube.mask.all(0).any():
                    logging.warning(
                        '%d completely masked pixels left after minmax '
                        'rejection', datacube.mask.all(0).sum())
        elif rejection == 'sigclip':
            if lo is None:
                lo = 3
            if hi is None:
                hi = 3
            if lo < 0 or hi < 0:
                raise ValueError(
                    'Lower and upper limits for sigma clipping must be '
                    'positive, got lo={}, hi={}'.format(lo, hi))
            # Maximum number of values that may be masked at a single pixel
            max_rej = n - min_keep
            # Iterate until no new outliers are found
            while True:
                avg = datacube.mean(0)
                sigma = datacube.std(0)
                resid = datacube - avg
                # A value is an outlier if its pixel has fewer than max_rej
                # masked values, non-zero sigma, and a residual outside
                # [-lo*sigma, hi*sigma]
                outliers = (datacube.mask.sum(0) < max_rej) & \
                    (sigma > 0) & ((resid < -lo*sigma) | (resid > hi*sigma))
                if not outliers.any():
                    del avg, sigma, resid, outliers
                    break
                datacube.mask[outliers.nonzero()] = True
        elif rejection:
            raise ValueError(
                'Unknown rejection mode "{}"'.format(rejection))

        if isinstance(datacube, ma.MaskedArray):
            if datacube.mask is None or not datacube.mask.any():
                # Nothing was rejected
                datacube = datacube.data
            else:
                # Accumulate the number of masked values in this chunk (a
                # count, despite the variable name)
                rej_percent += datacube.mask.sum()

        # Combine data
        if mode == 'average':
            res = datacube.mean(0)
        elif mode == 'sum':
            res = datacube.sum(0)
        elif mode == 'percentile':
            if percentile == 50:
                if isinstance(datacube, ma.MaskedArray):
                    res = ma.median(datacube, 0)
                else:
                    res = median(datacube, 0)
            else:
                if isinstance(datacube, ma.MaskedArray):
                    # Masked values are replaced by NaN so that
                    # nanpercentile skips them
                    res = nanpercentile(
                        datacube.filled(nan), percentile, 0)
                else:
                    res = np_percentile(datacube, percentile, 0)
        else:
            raise ValueError('Unknown stacking mode "{}"'.format(mode))
        chunks.append(res)

        if callback is not None:
            # With scaling, stacking covers the second half of
            # progress_step; without scaling, it covers all of it
            callback(
                progress +
                ((0.5 if scaling else 0) +
                 min(chunk + chunksize, data_height)/data_height /
                 (2 if scaling else 1))*progress_step)

    if len(chunks) > 1:
        res = ma.vstack(chunks)
    else:
        res = chunks[0]
    # Return a plain array when the result carries no masked values
    if isinstance(res, ma.MaskedArray) and (
            res.mask is None or not res.mask.any()):
        res = res.data
    return res, rej_percent
def _make_dir_if_missing(path):
    """Create the single directory level ``path`` unless it already exists."""
    if not os_path.exists(path):
        os_mkdir(path)


def _write_seed_partitions(seed_nodes, file_prefix, n_partitions):
    """Pickle ``seed_nodes`` into ``n_partitions`` files.

    File ``i`` is named ``file_prefix + str(i)``. The first
    ``n_partitions - 1`` files each receive
    ``len(seed_nodes) // n_partitions`` consecutive seeds; the last file
    receives everything that is left.
    """
    each_ptn = len(seed_nodes) // n_partitions
    for i in range(n_partitions - 1):
        with open(file_prefix + str(i), 'wb') as f:
            pickle_dump(seed_nodes[i * each_ptn:(i + 1) * each_ptn], f)
    with open(file_prefix + str(n_partitions - 1), 'wb') as f:
        pickle_dump(seed_nodes[(n_partitions - 1) * each_ptn:], f)


def main():
    """Select seed nodes for the complex search and split them into partitions.

    Steps performed:

    1. Parse command-line options and load the YAML input file; the
       ``--seed_mode`` and ``--max_size_thres`` options override the
       corresponding YAML entries.
    2. Create the output, train/test and graph directories (one level each)
       if they are missing, dump the effective inputs to
       ``<out>_input_sample_partition.yaml`` and configure logging to
       ``<out>_logs.yaml``.
    3. Determine the maximum number of growth steps: read it from
       ``res_max_size_search`` if that file exists, otherwise derive it from
       the sizes of the known complexes (capped at ``q3 + 4.5 * IQR``), and
       finally cap it at ``max_size_thres``. The value is appended to
       ``<out>_metrics.out`` and pickled to ``res_max_size_search_par``.
    4. Build the list of seeds according to ``seed_mode`` ("all_nodes",
       "n_nodes", "all_nodes_known_comp" or "cliques").
    5. Pickle the seeds into ``--n_pts`` partition files. For "n_nodes" the
       files are always rewritten next to the other outputs; for the other
       modes they are written into a per-mode directory only if that
       directory does not exist yet.

    Raises:
        ValueError: if ``seed_mode`` is not one of the supported values.
    """
    parser = argparse_ArgumentParser("Input parameters")
    parser.add_argument("--input_file_name", default="input_toy.yaml", help="Input parameters file name")
    parser.add_argument("--out_dir_name", default="/results", help="Output directory name")
    parser.add_argument("--train_test_files_dir", default="", help="Train test file path")
    parser.add_argument("--graph_files_dir", default="", help="Graph files' folder path")
    parser.add_argument("--seed_mode", help="Seed mode - specify 'cliques' for the cliques algo")
    parser.add_argument("--max_size_thres", help="Max size threshold")
    parser.add_argument("--n_pts", default=1, help="number of partitions (computers)")
    args = parser.parse_args()

    with open(args.input_file_name, 'r') as f:
        inputs = yaml_load(f, yaml_Loader)

    # Command-line values take precedence over the YAML file
    if args.seed_mode:
        inputs['seed_mode'] = args.seed_mode
    if args.max_size_thres:
        inputs['max_size_thres'] = int(args.max_size_thres)

    dir_nm = inputs['dir_nm']

    # Override output directory name if same as gen
    # NOTE: --out_dir_name has a non-empty default, so this branch is taken
    # unless an empty value is passed explicitly
    if args.out_dir_name or inputs['out_comp_nm'] == "/results/res":
        _make_dir_if_missing(dir_nm + args.out_dir_name)
        inputs['out_comp_nm'] = args.out_dir_name + "/res"

    inputs['train_test_files_dir'] = ''
    if args.train_test_files_dir:
        _make_dir_if_missing(dir_nm + args.train_test_files_dir)
        inputs['train_test_files_dir'] = args.train_test_files_dir

    inputs['graph_files_dir'] = ''
    if args.graph_files_dir:
        _make_dir_if_missing(dir_nm + args.graph_files_dir)
        inputs['graph_files_dir'] = args.graph_files_dir

    out_comp_nm = dir_nm + inputs['out_comp_nm']
    train_test_dir = dir_nm + inputs['train_test_files_dir']
    graph_dir = dir_nm + inputs['graph_files_dir']

    with open(out_comp_nm + "_input_sample_partition.yaml", 'w') as outfile:
        yaml_dump(inputs, outfile, default_flow_style=False)

    logging_basicConfig(filename=out_comp_nm + "_logs.yaml", level=logging_INFO)

    neig_dicts_folder = graph_dir + "/neig_dicts"

    num_comp = inputs['num_comp']
    max_size_thres = inputs['max_size_thres']

    # NOTE: pickle files are presumably produced by earlier steps of this
    # pipeline; never point these paths at untrusted data.
    # The value read here is superseded by either branch below; the read is
    # kept so that a missing file still fails at this point.
    with open(train_test_dir + "/res_max_size_train", 'rb') as f:
        max_size_train = pickle_load(f)
    max_size = max_size_train

    max_sizeF_feat = train_test_dir + "/res_max_size_search"
    if os_path.exists(max_sizeF_feat):
        with open(max_sizeF_feat, 'rb') as f:
            max_size = pickle_load(f)
    else:
        with open(dir_nm + inputs['comf_nm']) as f:
            sizes = [len(line.rstrip().split()) for line in f]
        max_size = max(sizes)
        q1 = np_percentile(sizes, 25)
        q3 = np_percentile(sizes, 75)
        max_wo_outliers = math_ceil(q3 + 4.5*(q3-q1))  # Maximum after removing outliers
        max_size = min(max_size, max_wo_outliers)

    if max_size >= max_size_thres:
        max_size = max_size_thres

    with open(out_comp_nm + '_metrics.out', "a") as fid:
        print("Max number of steps for complex growth = ", max_size, file=fid)  # NOT actual max size since you merge later

    with open(train_test_dir + "/res_max_size_search_par", 'wb') as f:
        pickle_dump(max_size, f)

    seed_mode = inputs['seed_mode']

    if seed_mode == "all_nodes":
        seed_nodes = rand_perm(os_listdir(neig_dicts_folder))
    elif seed_mode == "n_nodes":
        seed_nodes = rand_perm(os_listdir(neig_dicts_folder))[:num_comp]
    elif seed_mode == "all_nodes_known_comp":
        with open(train_test_dir + "/res_protlist", 'rb') as f:
            prot_list = pickle_load(f)
        seed_nodes = list(prot_list)
    elif seed_mode == "cliques":
        with open(graph_dir + "/res_myGraph", 'rb') as f:
            myGraph = pickle_load(f)
        # Keep maximal cliques with more than 2 nodes that are smaller than
        # max_size (single pass instead of repeated list.remove calls).
        # Duplicates are removed later.
        seed_nodes = [clique for clique in nx_find_cliques(myGraph)
                      if 2 < len(clique) < max_size]
    else:
        # Previously an unknown mode left seed_nodes unbound and surfaced
        # later as a NameError
        raise ValueError(
            "Unknown seed_mode '{}'; expected one of 'all_nodes', 'n_nodes', "
            "'all_nodes_known_comp', 'cliques'".format(seed_mode))

    # partition
    ptns = int(args.n_pts)

    if seed_mode == 'n_nodes':
        _write_seed_partitions(seed_nodes, out_comp_nm + "_seed_nodes", ptns)
    else:
        seed_nodes_dir = graph_dir + "/" + seed_mode + "_n_pts_" + str(ptns)

        # An existing directory is treated as an already written partition
        # set and is left untouched
        if not os_path.exists(seed_nodes_dir):
            os_mkdir(seed_nodes_dir)
            _write_seed_partitions(seed_nodes,
                                   seed_nodes_dir + "/res_seed_nodes", ptns)
Example #13
0
    def run(self, input_tree, trusted_taxa_file, min_children, taxonomy_file, output_dir):
        """Calculate distribution of branch lengths at each taxonomic rank.

        For every labelled internal node, the distance from each descendant
        leaf to the node is collected for all taxa named in the node label.
        A summary of the number of taxa per rank is printed, and two
        tab-separated tables are written to `output_dir`:
        ``taxa_bl_dist.tsv`` (statistics per named taxon) and
        ``rank_bl_dist.tsv`` (statistics per taxonomic rank).

        Parameters
        ----------
        input_tree : str
            Name of input tree.
        trusted_taxa_file : str
            File specifying trusted taxa to consider when inferring distribution. Set to None to consider all taxa.
        min_children : int
            Only consider taxa with at least the specified number of children taxa when inferring distribution.
        taxonomy_file : str
            File containing taxonomic information for leaf nodes (if NULL, read taxonomy from tree).
        output_dir : str
            Desired output directory.
        """

        tree = dendropy.Tree.get_from_path(input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        # pull taxonomy from tree
        if not taxonomy_file:
            self.logger.info('Reading taxonomy from tree.')
            taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
            taxonomy = Taxonomy().read_from_tree(input_tree)
            Taxonomy().write(taxonomy, taxonomy_file)
        else:
            self.logger.info('Reading taxonomy from file.')
            taxonomy = Taxonomy().read(taxonomy_file)

        # read trusted taxa
        # NOTE(review): the trusted taxa are read but an empty set is passed
        # to filter_taxa_for_dist_inference below, so they currently have no
        # effect on the result -- confirm whether this is intended
        trusted_taxa = None
        if trusted_taxa_file:
            trusted_taxa = read_taxa_file(trusted_taxa_file)

        # determine taxa to be used for inferring distribution
        taxa_for_dist_inference = filter_taxa_for_dist_inference(tree, taxonomy, set(), min_children, -1)

        # determine branch lengths to leaves for named lineages
        rank_bl_dist = defaultdict(list)
        taxa_bl_dist = defaultdict(list)
        taxa_at_rank = defaultdict(list)
        for node in tree.postorder_node_iter():
            if node.is_leaf() or not node.label:
                continue

            _support, taxon, _auxiliary_info = parse_label(node.label)
            if not taxon:
                continue

            # get most specific rank in multi-rank taxa string
            taxa = [t.strip() for t in taxon.split(';')]
            taxon = taxa[-1]

            # the first three characters of a taxon name are its rank prefix
            most_specific_rank = taxon[0:3]
            rank_index = Taxonomy.rank_index[most_specific_rank]
            taxa_at_rank[rank_index].append(taxon)

            for leaf in node.leaf_iter():
                # sum edge lengths while walking from the leaf up to the node
                dist_to_node = 0
                n = leaf
                while n != node:
                    dist_to_node += n.edge_length
                    n = n.parent_node

                for t in taxa:
                    taxa_bl_dist[t].append(dist_to_node)

            rank = Taxonomy.rank_labels[rank_index]
            if rank != 'species' or Taxonomy().validate_species_name(taxon):
                if taxon in taxa_for_dist_inference:
                    rank_bl_dist[rank].append(np_mean(taxa_bl_dist[taxon]))

        # report number of taxa at each rank
        # (single-argument print calls behave identically on Python 2 and 3)
        print('')
        print('Rank\tTaxa\tTaxa for Inference')
        for rank_index, rank_taxa in taxa_at_rank.items():
            taxa_for_inference = [x for x in rank_taxa if x in taxa_for_dist_inference]
            print('%s\t%d\t%d' % (Taxonomy.rank_labels[rank_index], len(rank_taxa), len(taxa_for_inference)))
        print('')

        # report results sorted by rank
        sorted_taxon = []
        for rank_prefix in Taxonomy.rank_prefixes:
            sorted_taxon += sorted(t for t in taxa_bl_dist if t.startswith(rank_prefix))

        # report results for each named group
        taxa_file = os.path.join(output_dir, 'taxa_bl_dist.tsv')
        with open(taxa_file, 'w') as fout:
            fout.write('Taxa\tUsed for Inference\tMean\tStd\t5th\t10th\t50th\t90th\t95th\n')
            for taxon in sorted_taxon:
                dist = taxa_bl_dist[taxon]

                p = np_percentile(dist, [5, 10, 50, 90, 95])
                fout.write('%s\t%s\t%g\t%g\t%g\t%g\t%g\t%g\t%g\n' % (taxon,
                                                                    str(taxon in taxa_for_dist_inference),
                                                                    np_mean(dist),
                                                                    np_std(dist),
                                                                    p[0], p[1], p[2], p[3], p[4]))

        # report results for each taxonomic rank
        rank_file = os.path.join(output_dir, 'rank_bl_dist.tsv')
        with open(rank_file, 'w') as fout:
            fout.write('Rank\tMean\tStd\t5th\t10th\t50th\t90th\t95th\n')
            for rank in Taxonomy.rank_labels:
                # NOTE(review): dist is empty for ranks without any taxon
                # used for inference; the result of np_mean/np_percentile on
                # empty input depends on the numpy version -- confirm
                dist = rank_bl_dist[rank]
                p = np_percentile(dist, [5, 10, 50, 90, 95])
                fout.write('%s\t%g\t%g\t%g\t%g\t%g\t%g\t%g\n' % (rank,
                                                                np_mean(dist),
                                                                np_std(dist),
                                                                p[0], p[1], p[2], p[3], p[4]))
        
Example #14
0
    def _distribution_summary_plot(self, 
                                    phylum_rel_dists, 
                                    taxa_for_dist_inference, 
                                    highlight_polyphyly,
                                    highlight_taxa,
                                    fmeasure,
                                    fmeasure_mono,
                                    plot_file):
        """Summary plot showing the distribution of taxa at each taxonomic rank under different rootings.

        Each rank occupies one row: a scatter point per taxon at its median
        relative divergence, a stacked histogram of those medians, a line at
        the median over the taxa used for inference, and boundary lines at
        +/- 0.1 around that median. The figure is saved to `plot_file`, to
        the same name with '.png' replaced by '.svg', and as an interactive
        HTML page next to it.

        Parameters
        ----------
        phylum_rel_dists: phylum_rel_dists[phylum][rank_index][taxon] -> relative divergences
            Relative divergence of taxon at each rank for different phylum-level rootings.
        taxa_for_dist_inference : iterable
            Taxa to considered when inferring distributions.
        highlight_polyphyly : boolean
            If true, taxa are coloured according to their F-measure.
        highlight_taxa : container
            Taxa that are always drawn in the highlight (red) colour.
        fmeasure : d[taxon] -> F-measure
            F-measure of each taxon; compared against `fmeasure_mono` and 1.0.
        fmeasure_mono : float
            Taxa with an F-measure below this value are drawn in the highlight colour.
        plot_file : str
            Desired name of output plot.
        """

        mono_color = (152.0 / 255, 223.0 / 255, 138.0 / 255)
        near_mono_color = (255.0 / 255, 187.0 / 255, 120.0 / 255)
        poly_color = (1.0, 0.0, 0.0)

        self.fig.clear()
        self.fig.set_size_inches(12, 6)
        ax = self.fig.add_subplot(111)

        # determine median relative distance for each taxa
        medians_for_taxa = self.taxa_median_rd(phylum_rel_dists)
        ranks = sorted(medians_for_taxa.keys())

        # create median and classification boundary lines
        for i, rank in enumerate(ranks):
            v = [np_median(dists)
                 for taxon, dists in medians_for_taxa[rank].items()
                 if taxon in taxa_for_dist_inference]
            if not v:
                # not taxa at rank suitable for creating classification boundaries
                continue

            p50 = np_percentile(v, 50)
            ax.plot((p50, p50), (i, i + 0.5), c=(0.0, 0.0, 1.0), lw=2, zorder=2)

            for b in [-0.1, 0.1]:
                boundary = p50 + b
                if boundary < 1.0 and boundary > 0.0:
                    ax.plot((boundary, boundary), (i, i + 0.5), c=(0.0, 0.0, 0.0), lw=2, zorder=2)

        # create scatter plot
        x = []
        y = []
        c = []
        labels = []
        rank_labels = []
        for i, rank in enumerate(ranks):
            rank_label = Taxonomy.rank_labels[rank]
            rank_labels.append(rank_label.capitalize() + ' (%d)' % len(medians_for_taxa[rank]))

            mono = []
            poly = []
            near_mono = []
            for clade_label, dists in medians_for_taxa[rank].items():
                md = np_median(dists)
                x.append(md)
                y.append(i)
                labels.append(clade_label)

                if ((highlight_polyphyly and fmeasure[clade_label] < fmeasure_mono) or clade_label in highlight_taxa):
                    c.append(poly_color)
                    poly.append(md)
                elif (highlight_polyphyly and fmeasure[clade_label] != 1.0):
                    c.append(near_mono_color)
                    near_mono.append(md)
                else:
                    c.append(mono_color)
                    mono.append(md)

            # histogram for each rank
            binwidth = 0.025
            bins = np_arange(0, 1.0 + binwidth, binwidth)
            max_bin_count = max(np_histogram(mono + near_mono + poly, bins=bins)[0])

            # stack the three groups on top of each other; bars are scaled so
            # that the fullest bin of the rank reaches a height of 0.9
            stack_bottom = i
            for values, color in ((np_array(mono), mono_color),
                                  (np_array(near_mono), near_mono_color),
                                  (np_array(poly), poly_color)):
                if len(values) == 0:
                    continue

                counts, _edges, _patches = ax.hist(values, bins=bins,
                                                   color=color,
                                                   alpha=0.5,
                                                   weights=0.9 * (1.0 / max_bin_count) * np_ones_like(values),
                                                   bottom=stack_bottom,
                                                   lw=0,
                                                   zorder=0)
                stack_bottom = stack_bottom + counts

        scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

        # set plot elements
        ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

        ax.set_xlabel('Relative Evolutionary Divergence')
        ax.set_xticks(np_arange(0, 1.05, 0.1))
        ax.set_xlim([-0.01, 1.01])

        ax.set_ylabel('Rank (no. taxa)')
        ax.set_yticks(range(0, len(medians_for_taxa)))
        ax.set_ylim([-0.2, len(medians_for_taxa) - 0.01])
        ax.set_yticklabels(rank_labels)

        self.prettify(ax)

        # make plot interactive
        mpld3.plugins.clear(self.fig)
        mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
        mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
        mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

        self.fig.tight_layout(pad=1)
        self.fig.savefig(plot_file, dpi=self.dpi)
        self.fig.savefig(plot_file.replace('.png', '.svg'), dpi=self.dpi)
Example #15
0
    def _distribution_plot(self, rel_dists, taxa_for_dist_inference,
                           highlight_polyphyly, highlight_taxa,
                           distribution_table, fmeasure, fmeasure_mono,
                           plot_file, viral):
        """Create plot showing the distribution of taxa at each taxonomic rank.

        Each rank occupies one row: a scatter point per taxon at its relative
        divergence, a stacked histogram of those values, a line at the median
        over the taxa used for inference, and boundary lines at +/- 0.1
        around that median. One row per taxon is also written to
        `distribution_table`. The figure is saved to `plot_file`, to the same
        name with '.png' replaced by '.svg', and, unless `self.skip_mpld3` is
        set, as an interactive HTML page next to it.

        Parameters
        ----------
        rel_dists: d[rank_index][taxon] -> relative divergence
            Relative divergence of taxa at each rank.
        taxa_for_dist_inference : iterable
            Taxa to considered when inferring distributions.
        highlight_polyphyly : boolean
            If true, taxa are coloured according to their F-measure.
        highlight_taxa : container
            Taxa that are always drawn in the polyphyletic colour.
        distribution_table : str
            Desired name of output table with distribution information.
        fmeasure : d[taxon] -> F-measure
            F-measure of each taxon; compared against `fmeasure_mono` and 1.0.
        fmeasure_mono : float
            Taxa with an F-measure below this value are drawn in the polyphyletic colour.
        plot_file : str
            Desired name of output plot.
        viral : boolean
            If true, rank labels are taken from VIRAL_RANK_LABELS instead of Taxonomy.rank_labels.
        """

        self.fig.clear()
        self.fig.set_size_inches(12, 6)
        ax = self.fig.add_subplot(111)

        ranks = sorted(rel_dists.keys())

        # create percentile and classification boundary lines
        percentiles = {}
        for i, rank in enumerate(ranks):
            v = [
                dist for taxa, dist in rel_dists[rank].items()
                if taxa in taxa_for_dist_inference
            ]
            if len(v) == 0:
                continue

            p10, p50, p90 = np_percentile(v, [10, 50, 90])
            ax.plot((p50, p50), (i, i + 0.5),
                    c=self.median_color,
                    lw=2,
                    zorder=2)

            for b in [-0.1, 0.1]:
                boundary = p50 + b
                if boundary < 1.0 and boundary > 0.0:
                    ax.plot((boundary, boundary), (i, i + 0.25),
                            c=(0.0, 0.0, 0.0),
                            lw=2,
                            zorder=2)

            percentiles[i] = [p10, p50, p90]

        # create scatter plot and results table
        x = []
        y = []
        c = []
        labels = []
        rank_labels = []
        # the context manager closes the table even if plotting raises
        with open(distribution_table, 'w') as fout:
            fout.write(
                'Taxa\tRelative Distance\tP10\tMedian\tP90\tPercentile outlier\n')
            for i, rank in enumerate(ranks):
                if viral:
                    rank_label = VIRAL_RANK_LABELS[rank]
                else:
                    rank_label = Taxonomy.rank_labels[rank]
                rank_labels.append(rank_label.capitalize() +
                                   ' ({:,})'.format(len(rel_dists[rank])))

                mono = []
                poly = []
                nearly_mono = []
                for clade_label, dist in rel_dists[rank].items():
                    x.append(dist)
                    y.append(i)
                    labels.append(clade_label)

                    if ((highlight_polyphyly
                         and fmeasure[clade_label] < fmeasure_mono)
                            or clade_label in highlight_taxa):
                        c.append(self.poly_color)
                        poly.append(dist)
                    elif (highlight_polyphyly
                          and fmeasure[clade_label] != 1.0):
                        c.append(self.near_mono_color)
                        nearly_mono.append(dist)
                    else:
                        c.append(self.mono_color)
                        mono.append(dist)

                    # report results
                    v = [clade_label, dist]
                    if i in percentiles:
                        p10, p50, p90 = percentiles[i]
                        percentile_outlier = not (dist >= p10 and dist <= p90)
                        v += percentiles[i] + [str(percentile_outlier)]
                    else:
                        # -1 marks percentiles that could not be computed
                        percentile_outlier = 'Insufficent data to calculate percentiles'
                        v += [-1, -1, -1] + [str(percentile_outlier)]

                    fout.write('%s\t%.2f\t%.2f\t%.2f\t%.2f\t%s\n' % tuple(v))

                # histogram for each rank
                binwidth = 0.025
                bins = np_arange(0, 1.0 + binwidth, binwidth)
                max_bin_count = max(
                    np_histogram(mono + nearly_mono + poly, bins=bins)[0])

                num_taxa = len(mono) + len(poly) + len(nearly_mono)
                if num_taxa == 0:
                    # NOTE(review): this stops processing of all remaining
                    # ranks, leaving rank_labels shorter than the y ticks set
                    # below; `continue` may have been intended -- confirm
                    break

                # stack the three groups on top of each other; bars are
                # scaled so that the fullest bin of the rank reaches 0.9
                stack_bottom = i
                for values, color in ((np_array(mono), self.mono_color),
                                      (np_array(nearly_mono),
                                       self.near_mono_color),
                                      (np_array(poly), self.poly_color)):
                    if len(values) == 0:
                        continue

                    counts, _edges, _patches = ax.hist(
                        values,
                        bins=bins,
                        color=color,
                        alpha=0.5,
                        weights=0.9 * (1.0 / max_bin_count) *
                        np_ones_like(values),
                        bottom=stack_bottom,
                        lw=0,
                        zorder=0)
                    stack_bottom = stack_bottom + counts

        # overlay scatter plot elements
        scatter = ax.scatter(x,
                             y,
                             alpha=0.5,
                             s=48,
                             c=c,
                             zorder=1,
                             lw=1,
                             edgecolors='black')

        # set plot elements
        ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

        ax.set_xlabel('Relative Evolutionary Divergence')
        ax.set_xticks(np_arange(0, 1.05, 0.1))
        ax.set_xlim([-0.05, 1.05])

        ax.set_ylabel('Rank (no. taxa)')
        ax.set_yticks(range(0, len(rel_dists)))
        ax.set_ylim([-0.2, len(rel_dists) - 0.01])
        ax.set_yticklabels(rank_labels)

        self.prettify(ax)

        # make plot interactive
        if not self.skip_mpld3:
            mpld3.plugins.clear(self.fig)
            mpld3.plugins.connect(
                self.fig,
                mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
            mpld3.plugins.connect(self.fig,
                                  mpld3.plugins.MousePosition(fontsize=10))
            mpld3.plugins.connect(self.fig, AxisReplacer(rank_labels))
            mpld3.save_html(self.fig,
                            plot_file[0:plot_file.rfind('.')] + '.html')

        self.fig.tight_layout(pad=1)
        self.fig.savefig(plot_file, dpi=self.dpi)
        self.fig.savefig(plot_file.replace('.png', '.svg'), dpi=self.dpi)
Example #16
0
    def _distribution_summary_plot(self, phylum_rel_dists,
                                   taxa_for_dist_inference, plot_file):
        """Summary plot showing the distribution of taxa at each taxonomic rank under different rootings.

        For each rank the plot contains: percentile markers (10th, 50th,
        90th) of the per-taxon medians restricted to the taxa used for
        inference, boundary lines at +/-0.1 and +/-0.2 around the median,
        a scatter point for every taxon, and stacked histograms of the
        monophyletic, no-inference and polyphyletic taxa. The figure is
        written to `plot_file` and an interactive HTML version is written
        alongside it (same path with the extension replaced by '.html').

        Parameters
        ----------
        phylum_rel_dists: phylum_rel_dists[phylum][rank_index][taxon] -> relative divergences
            Relative divergence of taxon at each rank for different phylum-level rootings.
        taxa_for_dist_inference : iterable
            Taxa to considered when inferring distributions.
        plot_file : str
            Desired name of output plot.
        """

        self.fig.clear()
        self.fig.set_size_inches(12, 6)
        ax = self.fig.add_subplot(111)

        # determine median relative distance for each taxa
        medians_for_taxa = self.taxa_median_rd(phylum_rel_dists)
        ranks = sorted(medians_for_taxa.keys())

        # create percentile and classification boundary lines
        grey = (0.3, 0.3, 0.3)
        for i, rank in enumerate(ranks):
            v = [
                np_median(dists)
                for taxon, dists in medians_for_taxa[rank].items()
                if taxon in taxa_for_dist_inference
            ]
            if not v:
                # no taxa at rank suitable for creating classification
                # boundaries
                continue

            p10, p50, p90 = np_percentile(v, [10, 50, 90])
            for value, height in ((p10, 0.25), (p50, 0.5), (p90, 0.25)):
                ax.plot((value, value), (i, i + height),
                        c=grey,
                        lw=2,
                        zorder=2)

            for b in [-0.2, -0.1, 0.1, 0.2]:
                boundary = p50 + b
                if 1.0 > boundary > 0.0:
                    if abs(b) == 0.1:
                        boundary_color = (1.0, 0.65, 0.0)  # orange
                    else:
                        boundary_color = (1.0, 0.0, 0.0)  # red
                    ax.plot((boundary, boundary), (i, i + 0.5),
                            c=boundary_color,
                            lw=2,
                            zorder=2)

        # Histogram bins are identical for every rank, so build them once.
        # They must exist even when a rank has no monophyletic taxa.
        binwidth = 0.025
        bins = np_arange(0, 1.0 + binwidth, binwidth)

        def _scaled_hist(values, color, scale, bottom):
            # Histogram normalised so the tallest bin has height 0.9 * scale.
            max_count = max(np_histogram(values, bins=bins)[0])
            weights = np_ones_like(values) * (1.0 / max_count)
            return ax.hist(values,
                           bins=bins,
                           color=color,
                           alpha=0.25,
                           weights=0.9 * scale * weights,
                           bottom=bottom,
                           lw=0,
                           zorder=0)

        # create scatter plot and per-rank histograms
        x = []
        y = []
        colors = []
        labels = []
        rank_labels = []
        for i, rank in enumerate(ranks):
            rank_label = Taxonomy.rank_labels[rank]
            rank_labels.append(rank_label +
                               ' (%d)' % len(medians_for_taxa[rank]))

            mono = []
            poly = []
            no_inference = []
            for clade_label, dists in medians_for_taxa[rank].items():
                md = np_median(dists)
                x.append(md)
                y.append(i)
                labels.append(clade_label)

                if self._is_integer(clade_label.split('^')[-1]):
                    # taxa with a numerical suffix after a caret indicate
                    # polyphyletic groups when decorated with tax2tree
                    colors.append((1.0, 0.0, 0.0))
                    poly.append(md)
                elif clade_label not in taxa_for_dist_inference:
                    colors.append((0.3, 0.3, 0.3))
                    no_inference.append(md)
                else:
                    colors.append((0.0, 0.0, 1.0))
                    mono.append(md)

            total = len(mono) + len(poly) + len(no_inference)
            if total == 0:
                # nothing to draw for this rank
                continue

            mono = np_array(mono)
            no_inference = np_array(no_inference)
            poly = np_array(poly)

            # Fraction of taxa at this rank that are monophyletic; computed
            # for every rank so it is never undefined or stale when a rank
            # has no monophyletic taxa.
            w = float(len(mono)) / total

            # bin heights of the monophyletic histogram; the other two
            # histograms are stacked on top of it
            n = 0
            if len(mono) > 0:
                n, _edges, _patches = _scaled_hist(mono, (0.0, 0.0, 1.0), w, i)

            if len(no_inference) > 0:
                _scaled_hist(no_inference, (0.3, 0.3, 0.3), 1.0 - w, i + n)

            if len(poly) > 0:
                _scaled_hist(poly, (1.0, 0.0, 0.0), 1.0 - w, i + n)

        scatter = ax.scatter(x, y, alpha=0.5, s=48, c=colors, zorder=1)

        # set plot elements
        ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

        ax.set_xlabel('relative distance')
        ax.set_xticks(np_arange(0, 1.05, 0.1))
        ax.set_xlim([-0.01, 1.01])

        ax.set_ylabel('rank (no. taxa)')
        ax.set_yticks(list(range(0, len(medians_for_taxa))))
        ax.set_ylim([-0.2, len(medians_for_taxa) - 0.01])
        ax.set_yticklabels(rank_labels)

        self.prettify(ax)

        # make plot interactive
        mpld3.plugins.clear(self.fig)
        mpld3.plugins.connect(
            self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
        mpld3.plugins.connect(self.fig,
                              mpld3.plugins.MousePosition(fontsize=10))
        mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

        self.fig.tight_layout(pad=1)
        self.fig.savefig(plot_file, dpi=self.dpi)
Example #17
0
    def _percent_correct_plot(self, rel_dists, taxa_for_dist_inference, output_prefix):
        """Plot classification accuracy against candidate relative divergence thresholds.

        For every pair of adjacent ranks, 100 evenly spaced thresholds
        between the parent-rank median and the child-rank median are
        evaluated. A parent taxon counts as correct when its divergence is
        at or below the threshold, a child taxon when it is above it. The
        threshold reported for the pair is the mean of all thresholds that
        achieve the best average of the two fractions. One PNG is saved
        per rank pair and a summary line is printed for each.

        Parameters
        ----------
        rel_dists : d[rank_index][taxon] -> relative divergence
            Relative divergence of taxa at each rank.
        taxa_for_dist_inference : iterable
            Taxa to consider when inferring relative divergence thresholds.
        output_prefix : str
            Prefix for plots.

        Returns
        -------
        list
            Selected threshold for each pair of adjacent ranks, in rank order.
        """

        print('')
        print('  Relative divergence thresholds (rank, threshold, parent taxa, child taxa):')

        ranks = sorted(rel_dists.keys())
        rel_dist_thresholds = []
        for parent_rank in xrange(ranks[0], ranks[-1]):
            child_rank = parent_rank + 1

            # gather divergences of the inference taxa at both ranks; the
            # candidate thresholds lie between the two medians
            parent_rds = [value
                          for name, value in rel_dists[parent_rank].iteritems()
                          if name in taxa_for_dist_inference]
            parent_p50 = np_percentile(parent_rds, 50)

            child_rds = [value
                         for name, value in rel_dists[child_rank].iteritems()
                         if name in taxa_for_dist_inference]
            child_p50 = np_percentile(child_rds, 50)

            r = []
            y_parent = []
            y_child = []
            y_mean_corr = []
            for test_r in np_linspace(parent_p50, child_p50, 100):
                n_parent_ok = sum(1 for value in parent_rds if value <= test_r)
                n_child_ok = sum(1 for value in child_rds if value > test_r)
                parent_cor = float(n_parent_ok) / len(parent_rds)
                child_cor = float(n_child_ok) / len(child_rds)

                r.append(test_r)
                y_parent.append(parent_cor)
                y_child.append(child_cor)
                y_mean_corr.append(0.5 * parent_cor + 0.5 * child_cor)

            # create plot of correctly classified taxa
            self.fig.clear()
            self.fig.set_size_inches(6, 6)
            ax = self.fig.add_subplot(111)

            parent_label = Taxonomy.rank_labels[parent_rank]
            child_label = Taxonomy.rank_labels[child_rank]
            ax.plot(r, y_parent, 'k--', label=parent_label)
            ax.plot(r, y_child, 'k:', label=child_label)
            ax.plot(r, y_mean_corr, 'r-', label='mean')

            legend = ax.legend(loc='upper left')
            legend.draw_frame(False)

            # find maximum of mean correct classification; averaging the
            # tied thresholds is only meaningful when they are contiguous
            max_mean = max(y_mean_corr)
            best_indices = [idx
                            for idx, score in enumerate(y_mean_corr)
                            if score == max_mean]
            r_max_value = np_mean([r[idx] for idx in best_indices])
            print('    %s\t%.3f\t%d\t%d' % (parent_label, r_max_value, len(parent_rds), len(child_rds)))

            # check that there is a single local maximum: every tied index
            # must directly follow the previous one
            for prev_idx, next_idx in zip(best_indices, best_indices[1:]):
                if prev_idx != next_idx - 1:
                    print('[Warning] There are multiple local maxima, so estimated relative divergence threshold will be invalid.')

            rel_dist_thresholds.append(r_max_value)

            y_min, _y_max = ax.get_ylim()
            ax.axvline(x=r_max_value, ymin=0, ymax=1, color='r', ls='--')
            ax.text(r_max_value + 0.001, y_min + 0.01, '%.3f' % r_max_value, horizontalalignment='left')

            ax.set_xlabel('relative distance')
            ax.set_ylabel('% taxa correctly classified')

            self.prettify(ax)

            self.fig.tight_layout(pad=1)
            plot_name = output_prefix + '.%s_%s.png' % (parent_label, child_label)
            self.fig.savefig(plot_name, dpi=96)

        print('')

        return rel_dist_thresholds
Example #18
0
    def _distribution_plot(self, rel_dists, taxa_for_dist_inference, distribution_table, plot_file):
        """Create plot showing the distribution of taxa at each taxonomic rank.

        For each rank the plot contains: percentile markers (10th, 50th,
        90th) of the taxa used for inference, boundary lines at +/-0.1 and
        +/-0.2 around the median, a scatter point for every taxon, and
        stacked histograms of the monophyletic, no-inference and
        polyphyletic taxa. A tab-separated table with one row per taxon is
        written to `distribution_table`, the figure is written to
        `plot_file`, and an interactive HTML version is written alongside
        it (same path with the extension replaced by '.html').

        Parameters
        ----------
        rel_dists: d[rank_index][taxon] -> relative divergence
            Relative divergence of taxa at each rank.
        taxa_for_dist_inference : iterable
            Taxa to considered when inferring distributions.
        distribution_table : str
            Desired name of output table with distribution information.
        plot_file : str
            Desired name of output plot.
        """

        self.fig.clear()
        self.fig.set_size_inches(12, 6)
        ax = self.fig.add_subplot(111)

        ranks = sorted(rel_dists.keys())

        # create percentile and classification boundary lines
        grey = (0.3, 0.3, 0.3)
        percentiles = {}
        for i, rank in enumerate(ranks):
            v = [dist for taxa, dist in rel_dists[rank].items() if taxa in taxa_for_dist_inference]
            if len(v) == 0:
                continue

            p10, p50, p90 = np_percentile(v, [10, 50, 90])
            for value, height in ((p10, 0.25), (p50, 0.5), (p90, 0.25)):
                ax.plot((value, value), (i, i + height), c=grey, lw=2, zorder=2)

            for b in [-0.2, -0.1, 0.1, 0.2]:
                boundary = p50 + b
                if 0.0 < boundary < 1.0:
                    if abs(b) == 0.1:
                        boundary_color = (1.0, 0.65, 0.0)  # orange
                    else:
                        boundary_color = (1.0, 0.0, 0.0)  # red
                    ax.plot((boundary, boundary), (i, i + 0.5), c=boundary_color, lw=2, zorder=2)

            percentiles[i] = [p10, p50, p90]

        # histogram bins are identical for every rank
        binwidth = 0.025
        bins = np_arange(0, 1.0 + binwidth, binwidth)

        def _scaled_hist(values, color, scale, bottom):
            # Histogram normalised so the tallest bin has height 0.9 * scale.
            max_count = max(np_histogram(values, bins=bins)[0])
            weights = np_ones_like(values) * (1.0 / max_count)
            return ax.hist(values, bins=bins,
                           color=color,
                           alpha=0.25,
                           weights=0.9 * scale * weights,
                           bottom=bottom,
                           lw=0,
                           zorder=0)

        # create scatter plot and results table
        x = []
        y = []
        colors = []
        labels = []
        rank_labels = []
        # the context manager guarantees the table is closed even if
        # plotting raises part-way through
        with open(distribution_table, 'w') as fout:
            fout.write('Taxa\tRelative Distance\tP10\tMedian\tP90\tPercentile outlier\n')
            for i, rank in enumerate(ranks):
                rank_label = Taxonomy.rank_labels[rank]
                rank_labels.append(rank_label + ' (%d)' % len(rel_dists[rank]))

                mono = []
                poly = []
                no_inference = []
                for clade_label, dist in rel_dists[rank].items():
                    x.append(dist)
                    y.append(i)
                    labels.append(clade_label)

                    if is_integer(clade_label.split('^')[-1]):
                        # taxa with a numerical suffix after a caret indicate
                        # polyphyletic groups when decorated with tax2tree
                        colors.append((1.0, 0.0, 0.0))
                        poly.append(dist)
                    elif clade_label not in taxa_for_dist_inference:
                        colors.append((0.3, 0.3, 0.3))
                        no_inference.append(dist)
                    else:
                        colors.append((0.0, 0.0, 1.0))
                        mono.append(dist)

                    # report results
                    if i in percentiles:
                        p10, p50, p90 = percentiles[i]
                        percentile_outlier = not (p10 <= dist <= p90)
                        row = [clade_label, dist, p10, p50, p90, str(percentile_outlier)]
                    else:
                        # -1 marks percentile columns that could not be computed
                        row = [clade_label, dist, -1, -1, -1,
                               'Insufficent data to calculate percentiles']

                    fout.write('%s\t%.2f\t%.2f\t%.2f\t%.2f\t%s\n' % tuple(row))

                # histogram for each rank
                d = len(mono) + len(poly) + len(no_inference)
                if d == 0:
                    # An empty rank has nothing to draw, but later ranks
                    # must still be processed so their points, labels and
                    # table rows are not lost.
                    continue

                mono = np_array(mono)
                no_inference = np_array(no_inference)
                poly = np_array(poly)

                # fraction of taxa at this rank that are monophyletic
                w = float(len(mono)) / d

                # bin heights of the monophyletic histogram; the other two
                # histograms are stacked on top of it
                n = 0
                if len(mono) > 0:
                    n, _edges, _patches = _scaled_hist(mono, (0.0, 0.0, 1.0), w, i)

                if len(no_inference) > 0:
                    _scaled_hist(no_inference, (0.3, 0.3, 0.3), 1.0 - w, i + n)

                if len(poly) > 0:
                    _scaled_hist(poly, (1.0, 0.0, 0.0), 1.0 - w, i + n)

        # overlay scatter plot elements
        scatter = ax.scatter(x, y, alpha=0.5, s=48, c=colors, zorder=1)

        # set plot elements
        ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

        ax.set_xlabel('relative distance')
        ax.set_xticks(np_arange(0, 1.05, 0.1))
        ax.set_xlim([-0.05, 1.05])

        ax.set_ylabel('rank (no. taxa)')
        ax.set_yticks(range(0, len(rel_dists)))
        ax.set_ylim([-0.2, len(rel_dists) - 0.01])
        ax.set_yticklabels(rank_labels)

        self.prettify(ax)

        # make plot interactive
        mpld3.plugins.clear(self.fig)
        mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
        mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
        mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

        self.fig.tight_layout(pad=1)
        self.fig.savefig(plot_file, dpi=self.dpi)
Example #19
0
    def run(self, rank, input_tree_dir, full_tree_file, derep_tree_file,
            taxonomy_file, output_prefix, min_children, title):
        """Compare RED of named groups across a set of rooted trees.

        Groups at the requested rank are selected when they have at least
        `min_children` distinct child taxa in the taxonomy file and appear
        as a named clade in the full tree. Their RED is computed on the
        full tree, the dereplicated tree and every '*.rooted.tree' file in
        `input_tree_dir`. Results are written to '<output_prefix>.tsv' and
        plotted to '<output_prefix>.png'.

        Parameters
        ----------
        rank : int
            Index into the semicolon-separated taxonomy string of the rank
            to assess.
        input_tree_dir : str
            Directory scanned for files ending in '.rooted.tree'.
        full_tree_file : str
            Newick tree used to identify named clades and compute RED.
        derep_tree_file : str
            Newick tree of the dereplicated genome set.
        taxonomy_file : str
            File with one genome per line: genome identifier followed by
            its semicolon-separated taxonomy string.
        output_prefix : str
            Prefix of the output table and plot.
        min_children : int
            Minimum number of distinct child taxa a group must have.
        title : str
            Plot title; no title is set when this is empty or None.
        """

        # determine named clades in full tree
        named_clades = set()
        tree = dendropy.Tree.get_from_path(full_tree_file,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        for node in tree.preorder_node_iter():
            if node.label:
                taxonomy = node.label.split(';')
                named_clades.add(taxonomy[-1].strip().split(':')[-1])

        print('Identified %d named clades in full tree.' % len(named_clades))

        # determine named groups with at least the specified number of children
        print('Determining taxa with sufficient named children lineages.')
        taxon_children = defaultdict(set)
        groups = defaultdict(list)
        print(taxonomy_file)
        with open(taxonomy_file) as taxonomy_in:
            for line in taxonomy_in:
                line_split = line.replace('; ', ';').split()
                if len(line_split) < 2:
                    # blank line or line without a taxonomy string
                    continue

                genome_id = line_split[0]
                taxonomy = [x.strip() for x in line_split[1].split(';')]

                if len(taxonomy) > rank + 1:
                    taxon_children[taxonomy[rank]].add(taxonomy[rank + 1])

                if len(taxonomy) > rank:
                    groups[taxonomy[rank]].append(genome_id)

        groups_to_consider = set()
        for taxon, children_taxa in taxon_children.iteritems():
            if len(children_taxa) >= min_children and taxon in named_clades:
                groups_to_consider.add(taxon)

        print('Assessing distribution over %d groups.' % len(
            groups_to_consider))

        # calculate RED for full tree
        print('')
        print('Calculating RED over full tree.')
        tree = dendropy.Tree.get_from_path(full_tree_file,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)
        full_rel_dist, _full_dist_components, polyphyletic = self.rel_dist_to_specified_groups(
            tree, groups_to_consider, groups)
        if len(polyphyletic) > 0:
            print('')
            print('[Warning] Full tree contains polyphyletic groups.')

        # calculate RED for dereplicated tree
        print('')
        print('Calculating RED over dereplicated tree.')
        tree = dendropy.Tree.get_from_path(derep_tree_file,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        derep_rel_dist, derep_dist_components, polyphyletic = self.rel_dist_to_specified_groups(
            tree, groups_to_consider, groups)

        # NOTE(review): only groups reported as polyphyletic for the
        # dereplicated tree are removed here; the set returned for the full
        # tree has been overwritten - confirm this is intended.
        groups_to_consider = groups_to_consider - polyphyletic
        print('Assessing distriubtion over %d groups after removing polyphyletic groups in original trees.' % len(
            groups_to_consider))

        # calculate RED to each group in each tree
        print('')
        rel_dists = defaultdict(list)
        dist_components = defaultdict(list)
        for f in os.listdir(input_tree_dir):
            if not f.endswith('.rooted.tree'):
                continue

            print(f)

            tree_file = os.path.join(input_tree_dir, f)
            tree = dendropy.Tree.get_from_path(tree_file,
                                               schema='newick',
                                               rooting='force-rooted',
                                               preserve_underscores=True)

            # calculate relative distance to named taxa
            rel_dist, components, _polyphyletic = self.rel_dist_to_specified_groups(
                tree, groups_to_consider, groups)

            for taxon, dist in rel_dist.iteritems():
                rel_dists[taxon].append(dist)
                dist_components[taxon].append(components[taxon])

        # create scatter plot
        x = []
        y = []
        xDerep = []
        yDerep = []
        xFull = []
        yFull = []
        perc10 = []
        perc90 = []
        labels = []
        # the context manager guarantees the table is closed even if a
        # taxon is missing from one of the reference results
        with open(output_prefix + '.tsv', 'w') as fout:
            fout.write(
                'Taxon\tP10\tP90\tP90-P10\tMean RED\tMean dist to parent\tMean dist to leaves\tOriginal RED\tOrigial dist to parent\tOriginal dist to leaves\n'
            )
            for i, taxon in enumerate(sorted(rel_dists.keys(), reverse=True)):
                labels.append(taxon + ' (%d)' % (len(rel_dists[taxon])))

                rd = rel_dists[taxon]
                for d in rd:
                    x.append(d)
                    # offset so per-tree points sit above the reference points
                    y.append(i + 0.2)

                p10, p90 = np_percentile(rd, [10, 90])
                perc10.append(p10)
                perc90.append(p90)

                print('%s %s' % (taxon, p90 - p10))
                mean_x, mean_a, mean_b = np_mean(dist_components[taxon], axis=0)
                derep_x, derep_a, derep_b = derep_dist_components[taxon]
                fout.write(
                    '%s\t%.2f\t%.2f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n' %
                    (taxon, p10, p90, p90 - p10, mean_x, mean_a, mean_b, derep_x,
                     derep_a, derep_b))

                xDerep.append(derep_rel_dist[taxon])
                yDerep.append(i)

                xFull.append(full_rel_dist[taxon])
                yFull.append(i)

        self.fig.clear()
        self.fig.set_size_inches(8, len(rel_dists) * 0.4)
        ax = self.fig.add_subplot(111)

        ax.scatter(x, y, alpha=0.5, s=24, c=(0.5, 0.5, 0.5), marker='s')
        ax.scatter(xDerep,
                   yDerep,
                   alpha=1.0,
                   s=24,
                   c=(1.0, 0.0, 0.0),
                   marker='s')
        ax.scatter(xFull,
                   yFull,
                   alpha=1.0,
                   s=24,
                   c=(0.0, 0.0, 1.0),
                   marker='*')

        # mark the 10th and 90th percentiles of each taxon
        for i, (p10, p90) in enumerate(zip(perc10, perc90)):
            ax.plot((p10, p10), (i, i + 0.4), 'r-')
            ax.plot((p90, p90), (i, i + 0.4), 'r-')

        # set plot elements
        ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')
        if title:
            ax.set_title(title, size=12)

        ax.set_xlabel('relative distance')
        ax.set_xticks(np_arange(0, 1.05, 0.1))
        ax.set_xlim([-0.05, 1.05])

        ax.set_ylabel('taxa')
        ax.set_yticks(xrange(0, len(rel_dists)))
        ax.set_ylim([-0.2, len(rel_dists) - 0.01])
        ax.set_yticklabels(labels)

        self.prettify(ax)

        self.fig.tight_layout(pad=1)
        self.fig.savefig(output_prefix + '.png', dpi=300)
Example #20
0
    def run(self, rank, input_tree_dir, full_tree_file, derep_tree_file, taxonomy_file, output_prefix, min_children, title):
        """Compare relative distance of named groups across a set of rooted trees.

        Groups at the requested rank are selected when they have at least
        `min_children` distinct child taxa in the taxonomy file and appear
        as a named clade in the full tree. Their relative distance is
        computed on the full tree, the dereplicated tree and every
        '*.rooted.tree' file in `input_tree_dir`. Results are written to
        '<output_prefix>.tsv' and plotted to '<output_prefix>.png'.

        Parameters
        ----------
        rank : int
            Index into the semicolon-separated taxonomy string of the rank
            to assess.
        input_tree_dir : str
            Directory scanned for files ending in '.rooted.tree'.
        full_tree_file : str
            Newick tree used to identify named clades and compute distances.
        derep_tree_file : str
            Newick tree of the dereplicated genome set.
        taxonomy_file : str
            File with one genome per line: genome identifier followed by
            its semicolon-separated taxonomy string.
        output_prefix : str
            Prefix of the output table and plot.
        min_children : int
            Minimum number of distinct child taxa a group must have.
        title : str
            Plot title; no title is set when this is empty or None.
        """

        tree_options = dict(schema='newick',
                            rooting='force-rooted',
                            preserve_underscores=True)

        # determine named clades in full tree
        named_clades = set()
        tree = dendropy.Tree.get_from_path(full_tree_file, **tree_options)

        for node in tree.preorder_node_iter():
            if node.label:
                taxonomy = node.label.split(';')
                named_clades.add(taxonomy[-1].strip().split(':')[-1])

        print('Identified %d named clades in full tree.' % len(named_clades))

        # determine named groups with at least the specified number of children
        print('Determining taxa with sufficient named children lineages.')
        taxon_children = defaultdict(set)
        groups = defaultdict(list)
        print(taxonomy_file)
        with open(taxonomy_file) as taxonomy_in:
            for line in taxonomy_in:
                line_split = line.replace('; ', ';').split()
                if len(line_split) < 2:
                    # blank line or line without a taxonomy string
                    continue

                genome_id = line_split[0]
                taxonomy = [x.strip() for x in line_split[1].split(';')]

                if len(taxonomy) > rank + 1:
                    taxon_children[taxonomy[rank]].add(taxonomy[rank + 1])

                if len(taxonomy) > rank:
                    groups[taxonomy[rank]].append(genome_id)

        groups_to_consider = set()
        for taxon, children_taxa in taxon_children.iteritems():
            if len(children_taxa) >= min_children and taxon in named_clades:
                groups_to_consider.add(taxon)

        print('Assessing distribution over %d groups.' % len(groups_to_consider))

        # calculate relative distance for full tree
        print('')
        print('Calculating relative distance over full tree.')
        tree = dendropy.Tree.get_from_path(full_tree_file, **tree_options)
        full_rel_dist, _full_dist_components, polyphyletic = self.rel_dist_to_specified_groups(tree, groups_to_consider, groups)
        if len(polyphyletic) > 0:
            print('')
            print('[Warning] Full tree contains polyphyletic groups.')

        # calculate relative distance for dereplicated tree
        print('')
        print('Calculating relative distance over dereplicated tree.')
        tree = dendropy.Tree.get_from_path(derep_tree_file, **tree_options)

        derep_rel_dist, derep_dist_components, polyphyletic = self.rel_dist_to_specified_groups(tree, groups_to_consider, groups)

        # NOTE(review): only groups reported as polyphyletic for the
        # dereplicated tree are removed here; the set returned for the full
        # tree has been overwritten - confirm this is intended.
        groups_to_consider = groups_to_consider - polyphyletic
        print('Assessing distriubtion over %d groups after removing polyphyletic groups in original trees.' % len(groups_to_consider))

        # calculate relative distance to each group in each tree
        print('')
        rel_dists = defaultdict(list)
        dist_components = defaultdict(list)
        for f in os.listdir(input_tree_dir):
            if not f.endswith('.rooted.tree'):
                continue

            print(f)

            tree_file = os.path.join(input_tree_dir, f)
            tree = dendropy.Tree.get_from_path(tree_file, **tree_options)

            # calculate relative distance to named taxa
            rel_dist, components, _polyphyletic = self.rel_dist_to_specified_groups(tree, groups_to_consider, groups)

            for taxon, dist in rel_dist.iteritems():
                rel_dists[taxon].append(dist)
                dist_components[taxon].append(components[taxon])

        # create scatter plot
        x = []
        y = []
        xDerep = []
        yDerep = []
        xFull = []
        yFull = []
        perc10 = []
        perc90 = []
        labels = []
        # the context manager guarantees the table is closed even if a
        # taxon is missing from one of the reference results
        with open(output_prefix + '.tsv', 'w') as fout:
            fout.write('Taxon\tP10\tP90\tP90-P10\tMean rel. dist\tMean dist to parent\tMean dist to leaves\tOriginal rel. dist.\tOrigial dist to parent\tOriginal dist to leaves\n')
            for i, taxon in enumerate(sorted(rel_dists.keys(), reverse=True)):
                labels.append(taxon + ' (%d)' % (len(rel_dists[taxon])))

                rd = rel_dists[taxon]
                for d in rd:
                    x.append(d)
                    # offset so per-tree points sit above the reference points
                    y.append(i + 0.2)

                p10, p90 = np_percentile(rd, [10, 90])
                perc10.append(p10)
                perc90.append(p90)

                print('%s %s' % (taxon, p90 - p10))
                mean_x, mean_a, mean_b = np_mean(dist_components[taxon], axis=0)
                derep_x, derep_a, derep_b = derep_dist_components[taxon]
                fout.write('%s\t%.2f\t%.2f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n' % (taxon, p10, p90, p90 - p10, mean_x, mean_a, mean_b, derep_x, derep_a, derep_b))

                xDerep.append(derep_rel_dist[taxon])
                yDerep.append(i)

                xFull.append(full_rel_dist[taxon])
                yFull.append(i)

        self.fig.clear()
        self.fig.set_size_inches(8, len(rel_dists) * 0.4)
        ax = self.fig.add_subplot(111)

        ax.scatter(x, y, alpha=0.5, s=24, c=(0.5, 0.5, 0.5), marker='s')
        ax.scatter(xDerep, yDerep, alpha=1.0, s=24, c=(1.0, 0.0, 0.0), marker='s')
        ax.scatter(xFull, yFull, alpha=1.0, s=24, c=(0.0, 0.0, 1.0), marker='*')

        # mark the 10th and 90th percentiles of each taxon
        for i, (p10, p90) in enumerate(zip(perc10, perc90)):
            ax.plot((p10, p10), (i, i + 0.4), 'r-')
            ax.plot((p90, p90), (i, i + 0.4), 'r-')

        # set plot elements
        ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')
        if title:
            ax.set_title(title, size=12)

        ax.set_xlabel('relative distance')
        ax.set_xticks(np_arange(0, 1.05, 0.1))
        ax.set_xlim([-0.05, 1.05])

        ax.set_ylabel('taxa')
        ax.set_yticks(xrange(0, len(rel_dists)))
        ax.set_ylim([-0.2, len(rel_dists) - 0.01])
        ax.set_yticklabels(labels)

        self.prettify(ax)

        self.fig.tight_layout(pad=1)
        self.fig.savefig(output_prefix + '.png', dpi=300)
Example #21
0
    def _percent_correct_plot(self, rel_dists, taxa_for_dist_inference,
                              output_prefix):
        """Plot the fraction of correctly classified taxa between adjacent ranks.

        For each pair of adjacent ranks, 100 candidate thresholds evenly
        spaced between the median relative divergence of the parent rank and
        that of the child rank are evaluated. A parent taxon counts as
        correctly classified when its value is <= the candidate, a child
        taxon when its value is > the candidate. The threshold reported for
        the pair is the mean of all candidates that reach the highest mean
        accuracy. One PNG is saved per pair and one summary line is printed
        per pair.

        Parameters
        ----------
        rel_dists : d[rank_index][taxon] -> relative divergence
            Relative divergence of taxa at each rank. Every integer rank
            between the smallest and the largest key is looked up, so the
            keys are assumed to be contiguous integers.
        taxa_for_dist_inference : iterable
            Taxa to consider when inferring relative divergence thresholds.
        output_prefix : str
            Prefix for plots; each file is named
            '<output_prefix>.<parent label>_<child label>.png'.

        Returns
        -------
        list
            Inferred threshold for each parent/child rank pair, ordered from
            the lowest rank index upwards.
        """

        print('')
        print('  Relative divergence thresholds (rank, threshold, parent taxa, child taxa):')

        rank_indices = sorted(rel_dists.keys())
        rel_dist_thresholds = []
        for parent_rank in range(rank_indices[0], rank_indices[-1]):
            child_rank = parent_rank + 1

            # gather the relative divergence of the taxa used for inference
            # at both ranks, along with the median of each rank
            parent_rds = [rd for taxon, rd in rel_dists[parent_rank].items()
                          if taxon in taxa_for_dist_inference]
            parent_p50 = np_percentile(parent_rds, 50)

            child_rds = [rd for taxon, rd in rel_dists[child_rank].items()
                         if taxon in taxa_for_dist_inference]
            child_p50 = np_percentile(child_rds, 50)

            # sweep candidate thresholds between the two medians
            num_parents = len(parent_rds)
            num_children = len(child_rds)
            candidates = []
            parent_acc = []
            child_acc = []
            mean_acc = []
            for test_r in np_linspace(parent_p50, child_p50, 100):
                parent_hits = sum(1 for rd in parent_rds if rd <= test_r)
                child_hits = sum(1 for rd in child_rds if rd > test_r)
                parent_cor = float(parent_hits) / num_parents
                child_cor = float(child_hits) / num_children

                candidates.append(test_r)
                parent_acc.append(parent_cor)
                child_acc.append(child_cor)
                mean_acc.append(0.5 * parent_cor + 0.5 * child_cor)

            # create plot of correctly classified taxa
            self.fig.clear()
            self.fig.set_size_inches(6, 6)
            ax = self.fig.add_subplot(111)

            parent_label = Taxonomy.rank_labels[parent_rank]
            ax.plot(candidates, parent_acc, 'k--', label=parent_label)
            child_label = Taxonomy.rank_labels[child_rank]
            ax.plot(candidates, child_acc, 'k:', label=child_label)
            ax.plot(candidates, mean_acc, 'r-', label='mean')

            ax.legend(loc='upper left').draw_frame(False)

            # locate every candidate achieving the best mean accuracy and
            # report their mean (only meaningful for a single plateau)
            best_acc = max(mean_acc)
            best_indices = [
                idx for idx, acc in enumerate(mean_acc) if acc == best_acc
            ]
            r_max_value = np_mean([candidates[idx] for idx in best_indices])
            print('    %s\t%.3f\t%d\t%d' % (parent_label, r_max_value,
                                            num_parents, num_children))

            # a gap between consecutive best indices means there is more
            # than one local maximum
            for left_idx, right_idx in zip(best_indices, best_indices[1:]):
                if left_idx != right_idx - 1:
                    print('[Warning] There are multiple local maxima, so estimated relative divergence threshold will be invalid.')

            rel_dist_thresholds.append(r_max_value)

            # mark the selected threshold on the plot
            y_min, _y_max = ax.get_ylim()
            ax.axvline(x=r_max_value, ymin=0, ymax=1, color='r', ls='--')
            ax.text(r_max_value + 0.001,
                    y_min + 0.01,
                    '%.3f' % r_max_value,
                    horizontalalignment='left')

            ax.set_xlabel('relative distance')
            ax.set_ylabel('% taxa correctly classified')

            self.prettify(ax)

            self.fig.tight_layout(pad=1)
            plot_name = output_prefix + '.%s_%s.png' % (parent_label,
                                                        child_label)
            self.fig.savefig(plot_name, dpi=96)

        print('')

        return rel_dist_thresholds
Example #22
0
    def _distribution_plot(self, rel_dists, taxa_for_dist_inference, distribution_table, plot_file):
        """Create plot showing the distribution of taxa at each taxonomic rank.

        Each rank occupies one horizontal band of the plot containing:

        * a scatter point for every taxon at its relative divergence,
          coloured red for polyphyletic taxa, grey for taxa not used for
          inference and blue for all remaining taxa;
        * histograms of those three groups, with the grey and red ones
          drawn on top of the blue one;
        * tick marks at the 10th, 50th and 90th percentile of the taxa used
          for inference, plus boundary lines at median +/- 0.1 (orange) and
          median +/- 0.2 (red) when these fall strictly inside (0, 1).

        The per-taxon values are also written to a tab-separated table, and
        an interactive HTML version of the plot is saved alongside the image
        (same name as `plot_file` with the extension replaced by '.html').

        Parameters
        ----------
        rel_dists: d[rank_index][taxon] -> relative divergence
            Relative divergence of taxa at each rank.
        taxa_for_dist_inference : iterable
            Taxa to consider when inferring distributions.
        distribution_table : str
            Desired name of output table with distribution information.
        plot_file : str
            Desired name of output plot.
        """

        # local import: only needed to derive the name of the HTML file
        import os

        self.fig.clear()
        self.fig.set_size_inches(12, 6)
        ax = self.fig.add_subplot(111)

        sorted_ranks = sorted(rel_dists.keys())

        # create percentile and classification boundary lines
        percentiles = {}
        for i, rank in enumerate(sorted_ranks):
            v = [dist for taxa, dist in rel_dists[rank].items() if taxa in taxa_for_dist_inference]
            if not v:
                # no taxa to infer from; the table reports -1 for this rank
                continue

            p10, p50, p90 = np_percentile(v, [10, 50, 90])
            ax.plot((p10, p10), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
            ax.plot((p50, p50), (i, i + 0.5), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
            ax.plot((p90, p90), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)

            for b in [-0.2, -0.1, 0.1, 0.2]:
                boundary = p50 + b
                if 0.0 < boundary < 1.0:
                    if abs(b) == 0.1:
                        boundary_color = (1.0, 0.65, 0.0)  # orange
                    else:
                        boundary_color = (1.0, 0.0, 0.0)  # red
                    ax.plot((boundary, boundary), (i, i + 0.5), c=boundary_color, lw=2, zorder=2)

            percentiles[i] = [p10, p50, p90]

        # create scatter plot and results table
        x = []
        y = []
        colors = []
        labels = []
        rank_labels = []
        binwidth = 0.025
        bins = np_arange(0, 1.0 + binwidth, binwidth)

        # the context manager guarantees the table is closed even if
        # plotting or formatting raises part-way through
        with open(distribution_table, 'w') as fout:
            fout.write('Taxa\tRelative Distance\tP10\tMedian\tP90\tPercentile outlier\n')
            for i, rank in enumerate(sorted_ranks):
                rank_label = Taxonomy.rank_labels[rank]
                rank_labels.append(rank_label + ' (%d)' % len(rel_dists[rank]))

                mono = []
                poly = []
                no_inference = []
                for clade_label, dist in rel_dists[rank].items():
                    x.append(dist)
                    y.append(i)
                    labels.append(clade_label)

                    if is_integer(clade_label.split('^')[-1]):
                        # taxa with a numerical suffix after a caret indicate
                        # polyphyletic groups when decorated with tax2tree
                        colors.append((1.0, 0.0, 0.0))
                        poly.append(dist)
                    elif clade_label not in taxa_for_dist_inference:
                        colors.append((0.3, 0.3, 0.3))
                        no_inference.append(dist)
                    else:
                        colors.append((0.0, 0.0, 1.0))
                        mono.append(dist)

                    # report results
                    v = [clade_label, dist]
                    if i in percentiles:
                        p10, p50, p90 = percentiles[i]
                        percentile_outlier = not (p10 <= dist <= p90)
                        v += percentiles[i] + [str(percentile_outlier)]
                    else:
                        percentile_outlier = 'Insufficent data to calculate percentiles'
                        v += [-1,-1,-1] + [str(percentile_outlier)]

                    fout.write('%s\t%.2f\t%.2f\t%.2f\t%.2f\t%s\n' % tuple(v))

                # histogram for each rank
                total = len(mono) + len(poly) + len(no_inference)
                if total == 0:
                    # rank without any taxa: nothing to draw, and the weight
                    # below would otherwise divide by zero
                    continue

                mono = np_array(mono)
                no_inference = np_array(no_inference)
                poly = np_array(poly)

                # fraction of the band's height given to the blue histogram;
                # the grey and red histograms are scaled by the remainder
                w = float(len(mono)) / total

                # per-bin heights of the blue histogram, used as the base
                # for the other two (0 when there is no blue histogram)
                n = 0
                if len(mono) > 0:
                    mono_max_count = max(np_histogram(mono, bins=bins)[0])
                    mono_weights = np_ones_like(mono) * (1.0 / mono_max_count)

                    n, _edges, _patches = ax.hist(mono, bins=bins,
                              color=(0.0, 0.0, 1.0),
                              alpha=0.25,
                              weights=0.9 * w * mono_weights,
                              bottom=i,
                              lw=0,
                              zorder=0)

                if len(no_inference) > 0:
                    no_inference_max_count = max(np_histogram(no_inference, bins=bins)[0])
                    no_inference_weights = np_ones_like(no_inference) * (1.0 / no_inference_max_count)

                    ax.hist(no_inference, bins=bins,
                              color=(0.3, 0.3, 0.3),
                              alpha=0.25,
                              weights=0.9 * (1.0 - w) * no_inference_weights,
                              bottom=i + n,
                              lw=0,
                              zorder=0)

                if len(poly) > 0:
                    poly_max_count = max(np_histogram(poly, bins=bins)[0])
                    poly_weights = np_ones_like(poly) * (1.0 / poly_max_count)

                    ax.hist(poly, bins=bins,
                              color=(1.0, 0.0, 0.0),
                              alpha=0.25,
                              weights=0.9 * (1.0 - w) * poly_weights,
                              bottom=i + n,
                              lw=0,
                              zorder=0)

        # overlay scatter plot elements
        scatter = ax.scatter(x, y, alpha=0.5, s=48, c=colors, zorder=1)

        # set plot elements
        ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

        ax.set_xlabel('relative distance')
        ax.set_xticks(np_arange(0, 1.05, 0.1))
        ax.set_xlim([-0.05, 1.05])

        ax.set_ylabel('rank (no. taxa)')
        ax.set_yticks(range(len(rel_dists)))
        ax.set_ylim([-0.2, len(rel_dists) - 0.01])
        ax.set_yticklabels(rank_labels)

        self.prettify(ax)

        # make plot interactive; splitext leaves the name intact when
        # plot_file has no extension instead of chopping its last character
        html_file = os.path.splitext(plot_file)[0] + '.html'
        mpld3.plugins.clear(self.fig)
        mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
        mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
        mpld3.save_html(self.fig, html_file)

        self.fig.tight_layout(pad=1)
        self.fig.savefig(plot_file, dpi=self.dpi)