def plot(self, plot_num, data, xlabel, ylabel, txt_position=None, center_xticks=False):
        """Create histogram for statistic."""
        
        self.axis = self.fig.add_subplot(self.rows, self.cols, plot_num)
        
        align = 'mid'
        if center_xticks:
            # not intuitive, but setting the alignment to 'left'
            # puts labels in the middle of each bar
            align = 'left' 
                        

        weights = np_ones_like(data)/float(len(data))
        num_bins = min(20, len(set(data))-1)
        counts, bins, patches = self.axis.hist(data, 
                                                bins=num_bins, 
                                                rwidth=0.9, 
                                                weights=weights, 
                                                color='#fdae6b',
                                                align=align)

        self.axis.set_xlabel(xlabel)
        self.axis.set_ylabel(ylabel)
        
        self.axis.xaxis.set_major_locator(MaxNLocator(integer=True))
        self.axis.yaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.1%}'.format(y)))
        
        # report summary statistics
        stat_txt = f'median = {np_median(data):.1f}\n'
        stat_txt += f'mean = {np_mean(data):.1f}\n'
        stat_txt += f'std = {np_std(data):.1f}'
        if txt_position == 'left':
            self.axis.text(0.05, 0.95, 
                            stat_txt, 
                            transform=self.axis.transAxes,
                            fontsize=self.options.tick_font_size,
                            verticalalignment='top')
        elif txt_position == 'right':
            self.axis.text(0.95, 0.95, 
                            stat_txt, 
                            transform=self.axis.transAxes,
                            fontsize=self.options.tick_font_size,
                            verticalalignment='top',
                            horizontalalignment='right')

        self.prettify(self.axis)
        for loc, spine in self.axis.spines.items():
            if loc in ['right', 'top']:
                spine.set_color('none')

        self.fig.tight_layout(pad=0.1, w_pad=1.0, h_pad=1.0)
        self.draw()
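These examples rely on module-level aliases for the NumPy and Matplotlib helpers they call (np_ones_like, np_mean, MaxNLocator, and so on). A minimal import block that would satisfy this first example, assuming the aliasing convention implied by the names; the original module presumably defines its own equivalents:

# Assumed aliases; not reproduced from the original module.
from numpy import ones_like as np_ones_like, mean as np_mean, median as np_median, std as np_std
from matplotlib.ticker import MaxNLocator, FuncFormatter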
Example 2
    def _distribution_plot(self, rel_dists, taxa_for_dist_inference, distribution_table, plot_file):
        """Create plot showing the distribution of taxa at each taxonomic rank.

        Parameters
        ----------
        rel_dists: d[rank_index][taxon] -> relative divergence
            Relative divergence of taxa at each rank.
        taxa_for_dist_inference : iterable
            Taxa to consider when inferring distributions.
        distribution_table : str
            Desired name of output table with distribution information.
        plot_file : str
            Desired name of output plot.
        """

        self.fig.clear()
        self.fig.set_size_inches(12, 6)
        ax = self.fig.add_subplot(111)
        
        
        # create normal distributions
        for i, rank in enumerate(sorted(rel_dists.keys())):
            v = [dist for taxa, dist in rel_dists[rank].items() if taxa in taxa_for_dist_inference]
            if len(v) < 2:
                continue
                
            u = np_mean(v)
            rv = norm(loc=u, scale=np_std(v))
            x = np_linspace(rv.ppf(0.001), rv.ppf(0.999), 1000)
            nd = rv.pdf(x)
            # ax.plot(x, 0.75 * (nd / max(nd)) + i, 'b-', alpha=0.6, zorder=2)
            # ax.plot((u, u), (i, i + 0.5), 'b-', zorder=2)

        # create percentile and classification boundary lines
        percentiles = {}
        for i, rank in enumerate(sorted(rel_dists.keys())):
            v = [dist for taxa, dist in rel_dists[rank].items() if taxa in taxa_for_dist_inference]
            if len(v) == 0:
                continue
                
            p10, p50, p90 = np_percentile(v, [10, 50, 90])
            ax.plot((p10, p10), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
            ax.plot((p50, p50), (i, i + 0.5), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
            ax.plot((p90, p90), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)

            for b in [-0.2, -0.1, 0.1, 0.2]:
                boundary = p50 + b
                if boundary < 1.0 and boundary > 0.0:
                    if abs(b) == 0.1:
                        c = (1.0, 0.65, 0.0)  # orange
                    else:
                        c = (1.0, 0.0, 0.0)
                    ax.plot((boundary, boundary), (i, i + 0.5), c=c, lw=2, zorder=2)

            percentiles[i] = [p10, p50, p90]

    
        # create scatter plot and results table
        fout = open(distribution_table, 'w')
        fout.write('Taxa\tRelative Distance\tP10\tMedian\tP90\tPercentile outlier\n')
        x = []
        y = []
        c = []
        labels = []
        rank_labels = []
        for i, rank in enumerate(sorted(rel_dists.keys())):
            rank_label = Taxonomy.rank_labels[rank]
            rank_labels.append(rank_label + ' (%d)' % len(rel_dists[rank]))
            
            mono = []
            poly = []
            no_inference = []
            for clade_label, dist in rel_dists[rank].items():
                x.append(dist)
                y.append(i)
                labels.append(clade_label)

                if is_integer(clade_label.split('^')[-1]):
                    # taxa with a numerical suffix after a caret indicate 
                    # polyphyletic groups when decorated with tax2tree
                    c.append((1.0, 0.0, 0.0))
                    poly.append(dist)
                elif clade_label not in taxa_for_dist_inference:
                    c.append((0.3, 0.3, 0.3))
                    no_inference.append(dist)
                else:
                    c.append((0.0, 0.0, 1.0))
                    mono.append(dist)
            
                # report results
                v = [clade_label, dist]
                if i in percentiles:
                    p10, p50, p90 = percentiles[i]
                    percentile_outlier = not (dist >= p10 and dist <= p90)
                    v += percentiles[i] + [str(percentile_outlier)]
                else:
                    percentile_outlier = 'Insufficient data to calculate percentiles'
                    v += [-1,-1,-1] + [str(percentile_outlier)]
                
                fout.write('%s\t%.2f\t%.2f\t%.2f\t%.2f\t%s\n' % tuple(v))
        
            # histogram for each rank
            mono = np_array(mono)
            no_inference = np_array(no_inference)
            poly = np_array(poly)
            binwidth = 0.025
            bins = np_arange(0, 1.0 + binwidth, binwidth)

            d = len(mono) + len(poly) + len(no_inference)
            if d == 0:
                break
                
            w = float(len(mono)) / d
            n = 0
            if len(mono) > 0:
                mono_max_count = max(np_histogram(mono, bins=bins)[0])
                mono_weights = np_ones_like(mono) * (1.0 / mono_max_count)

                n, b, p = ax.hist(mono, bins=bins,
                          color=(0.0, 0.0, 1.0),
                          alpha=0.25,
                          weights=0.9 * w * mono_weights,
                          bottom=i,
                          lw=0,
                          zorder=0)
                      
            if len(no_inference) > 0:
                no_inference_max_count = max(np_histogram(no_inference, bins=bins)[0])
                no_inference_weights = np_ones_like(no_inference) * (1.0 / no_inference_max_count)

                ax.hist(no_inference, bins=bins,
                          color=(0.3, 0.3, 0.3),
                          alpha=0.25,
                          weights=0.9 * (1.0 - w) * no_inference_weights,
                          bottom=i + n,
                          lw=0,
                          zorder=0)

            if len(poly) > 0:
                poly_max_count = max(np_histogram(poly, bins=bins)[0])
                poly_weights = np_ones_like(poly) * (1.0 / poly_max_count)

                ax.hist(poly, bins=bins,
                          color=(1.0, 0.0, 0.0),
                          alpha=0.25,
                          weights=0.9 * (1.0 - w) * poly_weights,
                          bottom=i + n,
                          lw=0,
                          zorder=0)
                          
        fout.close()

    
        # overlay scatter plot elements
        scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

        # set plot elements
        ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

        ax.set_xlabel('relative distance')
        ax.set_xticks(np_arange(0, 1.05, 0.1))
        ax.set_xlim([-0.05, 1.05])

        ax.set_ylabel('rank (no. taxa)')
        ax.set_yticks(range(0, len(rel_dists)))
        ax.set_ylim([-0.2, len(rel_dists) - 0.01])
        ax.set_yticklabels(rank_labels)

        self.prettify(ax)

        # make plot interactive
        mpld3.plugins.clear(self.fig)
        mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
        mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
        mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

        self.fig.tight_layout(pad=1)
        self.fig.savefig(plot_file, dpi=self.dpi)
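A hedged sketch of the inputs _distribution_plot expects, following the d[rank_index][taxon] -> relative divergence layout described in the docstring; the taxon names, values, and the plotter instance below are illustrative only:

# Illustrative input only: rank indices map to {taxon: relative divergence}.
rel_dists = {
    0: {'p__Proteobacteria': 0.21, 'p__Firmicutes': 0.24},
    1: {'c__Gammaproteobacteria': 0.43, 'c__Bacilli': 0.46},
}
taxa_for_dist_inference = {'p__Proteobacteria', 'p__Firmicutes',
                           'c__Gammaproteobacteria', 'c__Bacilli'}
# plotter._distribution_plot(rel_dists, taxa_for_dist_inference,
#                            'rank_distribution.tsv', 'rank_distribution.png')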
Example 3
    def _distribution_summary_plot(self, phylum_rel_dists,
                                   taxa_for_dist_inference, plot_file):
        """Summary plot showing the distribution of taxa at each taxonomic rank under different rootings.

        Parameters
        ----------
        phylum_rel_dists: phylum_rel_dists[phylum][rank_index][taxon] -> relative divergences
            Relative divergence of taxon at each rank for different phylum-level rootings.
        taxa_for_dist_inference : iterable
            Taxa to consider when inferring distributions.
        plot_file : str
            Desired name of output plot.
        """

        self.fig.clear()
        self.fig.set_size_inches(12, 6)
        ax = self.fig.add_subplot(111)

        # determine median relative distance for each taxa
        medians_for_taxa = self.taxa_median_rd(phylum_rel_dists)

        # create percentile and classification boundary lines
        percentiles = {}
        for i, rank in enumerate(sorted(medians_for_taxa.keys())):
            v = [
                np_median(dists)
                for taxon, dists in medians_for_taxa[rank].items()
                if taxon in taxa_for_dist_inference
            ]
            if not v:
                # no taxa at this rank suitable for creating classification
                # boundaries
                continue

            p10, p50, p90 = np_percentile(v, [10, 50, 90])
            ax.plot((p10, p10), (i, i + 0.25),
                    c=(0.3, 0.3, 0.3),
                    lw=2,
                    zorder=2)
            ax.plot((p50, p50), (i, i + 0.5),
                    c=(0.3, 0.3, 0.3),
                    lw=2,
                    zorder=2)
            ax.plot((p90, p90), (i, i + 0.25),
                    c=(0.3, 0.3, 0.3),
                    lw=2,
                    zorder=2)

            for b in [-0.2, -0.1, 0.1, 0.2]:
                boundary = p50 + b
                if 1.0 > boundary > 0.0:
                    if abs(b) == 0.1:
                        c = (1.0, 0.65, 0.0)  # orange
                    else:
                        c = (1.0, 0.0, 0.0)
                    ax.plot((boundary, boundary), (i, i + 0.5),
                            c=c,
                            lw=2,
                            zorder=2)

            percentiles[i] = [p10, p50, p90]

        # create scatter plot and results table
        x = []
        y = []
        c = []
        labels = []
        rank_labels = []
        for i, rank in enumerate(sorted(medians_for_taxa.keys())):
            rank_label = Taxonomy.rank_labels[rank]
            rank_labels.append(rank_label +
                               ' (%d)' % len(medians_for_taxa[rank]))

            mono = []
            poly = []
            no_inference = []
            for clade_label, dists in medians_for_taxa[rank].items():
                md = np_median(dists)
                x.append(md)
                y.append(i)
                labels.append(clade_label)

                if self._is_integer(clade_label.split('^')[-1]):
                    # taxa with a numerical suffix after a caret indicate
                    # polyphyletic groups when decorated with tax2tree
                    c.append((1.0, 0.0, 0.0))
                    poly.append(md)
                elif clade_label not in taxa_for_dist_inference:
                    c.append((0.3, 0.3, 0.3))
                    no_inference.append(md)
                else:
                    c.append((0.0, 0.0, 1.0))
                    mono.append(md)

            # histogram for each rank
            mono = np_array(mono)
            no_inference = np_array(no_inference)
            poly = np_array(poly)
            binwidth = 0.025
            bins = np_arange(0, 1.0 + binwidth, binwidth)

            # bins, w, and n are defined before the conditional histogram blocks
            # so the no_inference and poly histograms can still be drawn when
            # there are no monophyletic taxa at this rank
            num_taxa = len(mono) + len(poly) + len(no_inference)
            if num_taxa == 0:
                continue

            w = float(len(mono)) / num_taxa
            n = 0
            if len(mono) > 0:
                mono_max_count = max(np_histogram(mono, bins=bins)[0])
                mono_weights = np_ones_like(mono) * (1.0 / mono_max_count)

                n, b, p = ax.hist(mono,
                                  bins=bins,
                                  color=(0.0, 0.0, 1.0),
                                  alpha=0.25,
                                  weights=0.9 * w * mono_weights,
                                  bottom=i,
                                  lw=0,
                                  zorder=0)

            if len(no_inference) > 0:
                no_inference_max_count = max(
                    np_histogram(no_inference, bins=bins)[0])
                no_inference_weights = np_ones_like(no_inference) * (
                    1.0 / no_inference_max_count)

                ax.hist(no_inference,
                        bins=bins,
                        color=(0.3, 0.3, 0.3),
                        alpha=0.25,
                        weights=0.9 * (1.0 - w) * no_inference_weights,
                        bottom=i + n,
                        lw=0,
                        zorder=0)

            if len(poly) > 0:
                poly_max_count = max(np_histogram(poly, bins=bins)[0])
                poly_weights = np_ones_like(poly) * (1.0 / poly_max_count)

                ax.hist(poly,
                        bins=bins,
                        color=(1.0, 0.0, 0.0),
                        alpha=0.25,
                        weights=0.9 * (1.0 - w) * poly_weights,
                        bottom=i + n,
                        lw=0,
                        zorder=0)

        scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

        # set plot elements
        ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

        ax.set_xlabel('relative distance')
        ax.set_xticks(np_arange(0, 1.05, 0.1))
        ax.set_xlim([-0.01, 1.01])

        ax.set_ylabel('rank (no. taxa)')
        ax.set_yticks(list(range(0, len(medians_for_taxa))))
        ax.set_ylim([-0.2, len(medians_for_taxa) - 0.01])
        ax.set_yticklabels(rank_labels)

        self.prettify(ax)

        # make plot interactive
        mpld3.plugins.clear(self.fig)
        mpld3.plugins.connect(
            self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
        mpld3.plugins.connect(self.fig,
                              mpld3.plugins.MousePosition(fontsize=10))
        mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

        self.fig.tight_layout(pad=1)
        self.fig.savefig(plot_file, dpi=self.dpi)
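The summary variant expects one rank-level dictionary per phylum-level rooting, as the docstring's phylum_rel_dists[phylum][rank_index][taxon] layout indicates. A toy sketch with hypothetical taxon names and values, matching this example's three-argument signature:

# Illustrative input only: one rel_dists-style dictionary per rooting phylum.
phylum_rel_dists = {
    'p__Proteobacteria': {0: {'p__Firmicutes': 0.22}, 1: {'c__Bacilli': 0.45}},
    'p__Firmicutes': {0: {'p__Proteobacteria': 0.23}, 1: {'c__Gammaproteobacteria': 0.44}},
}
# plotter._distribution_summary_plot(phylum_rel_dists,
#                                    {'p__Firmicutes', 'c__Bacilli'},
#                                    'rank_distribution_summary.png')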
Example 4
    def _distribution_summary_plot(self, 
                                    phylum_rel_dists, 
                                    taxa_for_dist_inference, 
                                    highlight_polyphyly,
                                    highlight_taxa,
                                    fmeasure,
                                    fmeasure_mono,
                                    plot_file):
        """Summary plot showing the distribution of taxa at each taxonomic rank under different rootings.

        Parameters
        ----------
        phylum_rel_dists: phylum_rel_dists[phylum][rank_index][taxon] -> relative divergences
            Relative divergence of taxon at each rank for different phylum-level rootings.
        taxa_for_dist_inference : iterable
            Taxa to consider when inferring distributions.
        plot_file : str
            Desired name of output plot.
        """

        self.fig.clear()
        self.fig.set_size_inches(12, 6)
        ax = self.fig.add_subplot(111)

        # determine median relative distance for each taxa
        medians_for_taxa = self.taxa_median_rd(phylum_rel_dists)

        # create percentile and classification boundary lines
        percentiles = {}
        for i, rank in enumerate(sorted(medians_for_taxa.keys())):
            v = [np_median(dists) for taxon, dists in medians_for_taxa[rank].iteritems() if taxon in taxa_for_dist_inference]
            if not v:
                # no taxa at this rank suitable for creating classification boundaries
                continue
            
            p10, p50, p90 = np_percentile(v, [10, 50, 90])
            #ax.plot((p10, p10), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
            ax.plot((p50, p50), (i, i + 0.5), c=(0.0, 0.0, 1.0), lw=2, zorder=2)
            #ax.plot((p90, p90), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)

            for b in [-0.1, 0.1]:
                boundary = p50 + b
                if boundary < 1.0 and boundary > 0.0:
                    if abs(b) == 0.1:
                        c = (0.0, 0.0, 0.0)
                    else:
                        c = (1.0, 0.0, 0.0)
                    ax.plot((boundary, boundary), (i, i + 0.5), c=c, lw=2, zorder=2)

            percentiles[i] = [p10, p50, p90]

        # create scatter plot and results table
        x = []
        y = []
        c = []
        labels = []
        rank_labels = []
        for i, rank in enumerate(sorted(medians_for_taxa.keys())):
            rank_label = Taxonomy.rank_labels[rank]
            rank_labels.append(rank_label.capitalize() + ' (%d)' % len(medians_for_taxa[rank]))

            mono = []
            poly = []
            near_mono = []
            for clade_label, dists in medians_for_taxa[rank].iteritems():
                md = np_median(dists)
                x.append(md)
                y.append(i)
                labels.append(clade_label)

                if ((highlight_polyphyly and fmeasure[clade_label] < fmeasure_mono) or clade_label in highlight_taxa):
                    c.append((1.0,0.0,0.0))
                    poly.append(md)
                elif (highlight_polyphyly and fmeasure[clade_label] != 1.0):
                    c.append((255.0/255,187.0/255,120.0/255))
                    near_mono.append(md)
                else:
                    c.append((152.0/255,223.0/255,138.0/255))
                    mono.append(md)

            # histogram for each rank
            binwidth = 0.025
            bins = np_arange(0, 1.0 + binwidth, binwidth)
            max_bin_count = max(np_histogram(mono + near_mono + poly, bins=bins)[0])

            mono_bottom = 0
            near_mono_bottom = 0
            mono = np_array(mono)
            near_mono = np_array(near_mono)
            poly = np_array(poly)
            if len(mono) > 0:
                mono_bottom, b, p = ax.hist(mono, bins=bins,
                          color=(152.0/255,223.0/255,138.0/255),
                          alpha=0.5,
                          weights=0.9 * (1.0 / max_bin_count) * np_ones_like(mono),
                          bottom=i,
                          lw=0,
                          zorder=0)

            if len(near_mono) > 0:
                near_mono_bottom, b, p = ax.hist(near_mono, bins=bins,
                                              color=(255.0/255,187.0/255,120.0/255),
                                              alpha=0.5,
                                              weights=0.9 * (1.0 / max_bin_count) * np_ones_like(near_mono),
                                              bottom=i + mono_bottom,
                                              lw=0,
                                              zorder=0)

            if len(poly) > 0:
                ax.hist(poly, bins=bins,
                          color=(1.0, 0.0, 0.0),
                          alpha=0.5,
                          weights=0.9 * (1.0 / max_bin_count) * np_ones_like(poly),
                          bottom=i + mono_bottom + near_mono_bottom,
                          lw=0,
                          zorder=0)
                          
        scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

        # set plot elements
        ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

        ax.set_xlabel('Relative Evolutionary Divergence')
        ax.set_xticks(np_arange(0, 1.05, 0.1))
        ax.set_xlim([-0.01, 1.01])

        ax.set_ylabel('Rank (no. taxa)')
        ax.set_yticks(xrange(0, len(medians_for_taxa)))
        ax.set_ylim([-0.2, len(medians_for_taxa) - 0.01])
        ax.set_yticklabels(rank_labels)

        self.prettify(ax)

        # make plot interactive
        mpld3.plugins.clear(self.fig)
        mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
        mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
        mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

        self.fig.tight_layout(pad=1)
        self.fig.savefig(plot_file, dpi=self.dpi)
        self.fig.savefig(plot_file.replace('.png', '.svg'), dpi=self.dpi)
Example 5
    def _distribution_plot(self, rel_dists, taxa_for_dist_inference,
                           highlight_polyphyly, highlight_taxa,
                           distribution_table, fmeasure, fmeasure_mono,
                           plot_file, viral):
        """Create plot showing the distribution of taxa at each taxonomic rank.

        Parameters
        ----------
        rel_dists: d[rank_index][taxon] -> relative divergence
            Relative divergence of taxa at each rank.
        taxa_for_dist_inference : iterable
            Taxa to consider when inferring distributions.
        distribution_table : str
            Desired name of output table with distribution information.
        plot_file : str
            Desired name of output plot.
        """

        self.fig.clear()
        self.fig.set_size_inches(12, 6)
        ax = self.fig.add_subplot(111)

        # create percentile and classification boundary lines
        percentiles = {}
        for i, rank in enumerate(sorted(rel_dists.keys())):
            v = [
                dist for taxa, dist in rel_dists[rank].items()
                if taxa in taxa_for_dist_inference
            ]
            if len(v) == 0:
                continue

            p10, p50, p90 = np_percentile(v, [10, 50, 90])
            ax.plot((p50, p50), (i, i + 0.5),
                    c=self.median_color,
                    lw=2,
                    zorder=2)

            for b in [-0.1, 0.1]:
                boundary = p50 + b
                if boundary < 1.0 and boundary > 0.0:
                    ax.plot((boundary, boundary), (i, i + 0.25),
                            c=(0.0, 0.0, 0.0),
                            lw=2,
                            zorder=2)

            percentiles[i] = [p10, p50, p90]

        # create scatter plot and results table
        fout = open(distribution_table, 'w')
        fout.write(
            'Taxa\tRelative Distance\tP10\tMedian\tP90\tPercentile outlier\n')
        x = []
        y = []
        c = []
        labels = []
        rank_labels = []
        for i, rank in enumerate(sorted(rel_dists.keys())):
            if viral:
                rank_label = VIRAL_RANK_LABELS[rank]
            else:
                rank_label = Taxonomy.rank_labels[rank]
            rank_labels.append(rank_label.capitalize() +
                               ' ({:,})'.format(len(rel_dists[rank])))

            mono = []
            poly = []
            nearly_mono = []
            for clade_label, dist in rel_dists[rank].items():
                x.append(dist)
                y.append(i)
                labels.append(clade_label)

                if ((highlight_polyphyly
                     and fmeasure[clade_label] < fmeasure_mono)
                        or clade_label in highlight_taxa):
                    c.append(self.poly_color)
                    poly.append(dist)
                elif (highlight_polyphyly and fmeasure[clade_label] != 1.0):
                    c.append(self.near_mono_color)
                    nearly_mono.append(dist)
                else:
                    c.append(self.mono_color)
                    mono.append(dist)

                # report results
                v = [clade_label, dist]
                if i in percentiles:
                    p10, p50, p90 = percentiles[i]
                    percentile_outlier = not (dist >= p10 and dist <= p90)
                    v += percentiles[i] + [str(percentile_outlier)]
                else:
                    percentile_outlier = 'Insufficient data to calculate percentiles'
                    v += [-1, -1, -1] + [str(percentile_outlier)]

                fout.write('%s\t%.2f\t%.2f\t%.2f\t%.2f\t%s\n' % tuple(v))

            # histogram for each rank
            binwidth = 0.025
            bins = np_arange(0, 1.0 + binwidth, binwidth)
            max_bin_count = max(
                np_histogram(mono + nearly_mono + poly, bins=bins)[0])

            num_taxa = len(mono) + len(poly) + len(nearly_mono)
            if num_taxa == 0:
                break

            mono = np_array(mono)
            nearly_mono = np_array(nearly_mono)
            poly = np_array(poly)

            bottom_mono = 0
            if len(mono) > 0:
                bottom_mono, b, p = ax.hist(
                    mono,
                    bins=bins,
                    color=self.mono_color,
                    alpha=0.5,
                    weights=0.9 * (1.0 / max_bin_count) * np_ones_like(mono),
                    bottom=i,
                    lw=0,
                    zorder=0)

            bottom_nearly_mono = 0
            if len(nearly_mono) > 0:
                bottom_nearly_mono, b, p = ax.hist(nearly_mono,
                                                   bins=bins,
                                                   color=self.near_mono_color,
                                                   alpha=0.5,
                                                   weights=0.9 *
                                                   (1.0 / max_bin_count) *
                                                   np_ones_like(nearly_mono),
                                                   bottom=i + bottom_mono,
                                                   lw=0,
                                                   zorder=0)

            if len(poly) > 0:
                ax.hist(poly,
                        bins=bins,
                        color=self.poly_color,
                        alpha=0.5,
                        weights=0.9 * (1.0 / max_bin_count) *
                        np_ones_like(poly),
                        bottom=i + bottom_mono + bottom_nearly_mono,
                        lw=0,
                        zorder=0)
        fout.close()

        # overlay scatter plot elements
        scatter = ax.scatter(x,
                             y,
                             alpha=0.5,
                             s=48,
                             c=c,
                             zorder=1,
                             lw=1,
                             edgecolors='black')

        # set plot elements
        ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

        ax.set_xlabel('Relative Evolutionary Divergence')
        ax.set_xticks(np_arange(0, 1.05, 0.1))
        ax.set_xlim([-0.05, 1.05])

        ax.set_ylabel('Rank (no. taxa)')
        ax.set_yticks(range(0, len(rel_dists)))
        ax.set_ylim([-0.2, len(rel_dists) - 0.01])
        ax.set_yticklabels(rank_labels)

        self.prettify(ax)

        # make plot interactive
        if not self.skip_mpld3:
            mpld3.plugins.clear(self.fig)
            mpld3.plugins.connect(
                self.fig,
                mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
            mpld3.plugins.connect(self.fig,
                                  mpld3.plugins.MousePosition(fontsize=10))
            mpld3.plugins.connect(self.fig, AxisReplacer(rank_labels))
            mpld3.save_html(self.fig,
                            plot_file[0:plot_file.rfind('.')] + '.html')

        self.fig.tight_layout(pad=1)
        self.fig.savefig(plot_file, dpi=self.dpi)
        self.fig.savefig(plot_file.replace('.png', '.svg'), dpi=self.dpi)
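This variant references colour attributes (self.mono_color, self.near_mono_color, self.poly_color, self.median_color) and VIRAL_RANK_LABELS that are defined elsewhere in the original module. A sketch of plausible definitions, reusing the RGB tuples that appear directly in Example 4 and guessing ICTV-style rank names; treat all of these as assumptions rather than the module's actual values:

# Assumed definitions; the real values live elsewhere in the original package.
VIRAL_RANK_LABELS = ['realm', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
self.mono_color = (152.0 / 255, 223.0 / 255, 138.0 / 255)       # green, as in Example 4
self.near_mono_color = (255.0 / 255, 187.0 / 255, 120.0 / 255)  # pale orange, as in Example 4
self.poly_color = (1.0, 0.0, 0.0)                                # red
self.median_color = (0.0, 0.0, 1.0)                              # blue, as in Example 4's median line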
Example 6
def r3_dnn_apply_keras(target_dirname,
                       old_stft_obj=None,
                       cuda=False,
                       saving_to_disk=True):
    '''
    r3_dnn_apply takes an old_stft object (or loads it from disk as a side effect)
    and saves a new_stft object.
    '''
    LOGGER.info(
        '{}: r3: Denoising original stft with neural network model...'.format(
            target_dirname))
    scan_battery_dirname = os_path_dirname(target_dirname)
    model_dirname = os_path_dirname(os_path_dirname(scan_battery_dirname))

    # load stft data
    if old_stft_obj is None:
        old_stft_fpath = os_path_join(target_dirname, 'old_stft.mat')
        with h5py_File(old_stft_fpath, 'r') as f:
            stft = np_concatenate(
                [f['old_stft_real'][:], f['old_stft_imag'][:]], axis=1)
    else:
        stft = np_concatenate(
            [old_stft_obj['old_stft_real'], old_stft_obj['old_stft_imag']],
            axis=1)

    N_beams, N_elements_2, N_segments, N_fft = stft.shape
    N_elements = N_elements_2 // 2

    # combine stft_real and stft_imag

    # move element position axis
    stft = np_moveaxis(stft, 1, 2)  # TODO: Duplicate?

    # reshape the to flatten first two axes
    stft = np_reshape(
        stft, [N_beams * N_segments, N_elements_2, N_fft])  # TODO: Duplicate?

    # process stft with networks
    k_mask = list(range(3, 6))
    for frequency in k_mask:
        process_each_frequency_keras(model_dirname, stft, frequency)

    # reshape the stft data
    stft = np_reshape(
        stft, [N_beams, N_segments, N_elements_2, N_fft])  # TODO: Duplicate?

    # set zero outside analysis frequency range
    discard_mask = np_ones_like(stft, dtype=bool)
    discard_mask[:, :, :, k_mask] = False  # pylint: disable=E1137
    stft[discard_mask] = 0
    del discard_mask

    # mirror data to negative frequencies using conjugate symmetry
    end_index = N_fft // 2
    stft[:, :, :, end_index + 1:] = np_flip(stft[:, :, :, 1:end_index], axis=3)
    stft[:, :, N_elements:2 * N_elements, end_index +
         1:] = -1 * stft[:, :, N_elements:2 * N_elements, end_index + 1:]

    # move element position axis
    stft = np_moveaxis(stft, 1, 2)  # TODO: Duplicate?

    # change variable names
    # new_stft_real = stft[:, :N_elements, :, :]
    new_stft_real = stft[:, :N_elements, :, :].transpose()
    # new_stft_imag = stft[:, N_elements:, :, :]
    new_stft_imag = stft[:, N_elements:, :, :].transpose()

    del stft

    # change dimensions
    # new_stft_real = new_stft_real.transpose()
    # new_stft_imag = new_stft_imag.transpose()

    # save new stft data
    new_stft_obj = {
        'new_stft_real': new_stft_real,
        'new_stft_imag': new_stft_imag
    }
    if saving_to_disk is True:
        new_stft_fname = os_path_join(target_dirname, 'new_stft.mat')
        savemat(new_stft_fname, new_stft_obj)
    LOGGER.info('{}: r3 Done.'.format(target_dirname))
    return new_stft_obj
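The STFT-denoising example uses the same aliased-import convention plus a few h5py and SciPy helpers. A plausible import block, assuming the names map one-to-one onto the standard libraries (LOGGER and process_each_frequency_keras come from the original package and are not reproduced here):

# Assumed aliases for the denoising example above.
from os.path import join as os_path_join, dirname as os_path_dirname
from h5py import File as h5py_File
from scipy.io import savemat
from numpy import (concatenate as np_concatenate, moveaxis as np_moveaxis,
                   reshape as np_reshape, ones_like as np_ones_like, flip as np_flip)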
Example 7
    def _distribution_summary_plot(self, phylum_rel_dists, taxa_for_dist_inference, plot_file):
        """Summary plot showing the distribution of taxa at each taxonomic rank under different rootings.

        Parameters
        ----------
        phylum_rel_dists: phylum_rel_dists[phylum][rank_index][taxon] -> relative divergences
            Relative divergence of taxon at each rank for different phylum-level rootings.
        taxa_for_dist_inference : iterable
            Taxa to consider when inferring distributions.
        plot_file : str
            Desired name of output plot.
        """

        self.fig.clear()
        self.fig.set_size_inches(12, 6)
        ax = self.fig.add_subplot(111)

        # determine median relative distance for each taxa
        medians_for_taxa = self.taxa_median_rd(phylum_rel_dists)

        # create percentile and classification boundary lines
        percentiles = {}
        for i, rank in enumerate(sorted(medians_for_taxa.keys())):
            v = [np_median(dists) for taxon, dists in medians_for_taxa[rank].iteritems() if taxon in taxa_for_dist_inference]
            if not v:
                # no taxa at this rank suitable for creating classification boundaries
                continue

            p10, p50, p90 = np_percentile(v, [10, 50, 90])
            ax.plot((p10, p10), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
            ax.plot((p50, p50), (i, i + 0.5), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
            ax.plot((p90, p90), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)

            for b in [-0.2, -0.1, 0.1, 0.2]:
                boundary = p50 + b
                if boundary < 1.0 and boundary > 0.0:
                    if abs(b) == 0.1:
                        c = (1.0, 0.65, 0.0)  # orange
                    else:
                        c = (1.0, 0.0, 0.0)
                    ax.plot((boundary, boundary), (i, i + 0.5), c=c, lw=2, zorder=2)

            percentiles[i] = [p10, p50, p90]

        # create scatter plot and results table
        x = []
        y = []
        c = []
        labels = []
        rank_labels = []
        for i, rank in enumerate(sorted(medians_for_taxa.keys())):
            rank_label = Taxonomy.rank_labels[rank]
            rank_labels.append(rank_label + ' (%d)' % len(medians_for_taxa[rank]))

            mono = []
            poly = []
            no_inference = []
            for clade_label, dists in medians_for_taxa[rank].iteritems():
                md = np_median(dists)
                x.append(md)
                y.append(i)
                labels.append(clade_label)

                if is_integer(clade_label.split('^')[-1]):
                    # taxa with a numerical suffix after a caret indicate 
                    # polyphyletic groups when decorated with tax2tree
                    c.append((1.0, 0.0, 0.0))
                    poly.append(md)
                elif clade_label not in taxa_for_dist_inference:
                    c.append((0.3, 0.3, 0.3))
                    no_inference.append(md)
                else:
                    c.append((0.0, 0.0, 1.0))
                    mono.append(md)

            # histogram for each rank
            mono = np_array(mono)
            no_inference = np_array(no_inference)
            poly = np_array(poly)
            binwidth = 0.025
            bins = np_arange(0, 1.0 + binwidth, binwidth)

            w = float(len(mono)) / (len(mono) + len(poly) + len(no_inference))
            n = 0
            if len(mono) > 0:
                mono_max_count = max(np_histogram(mono, bins=bins)[0])
                mono_weights = np_ones_like(mono) * (1.0 / mono_max_count)

                n, b, p = ax.hist(mono, bins=bins,
                          color=(0.0, 0.0, 1.0),
                          alpha=0.25,
                          weights=0.9 * w * mono_weights,
                          bottom=i,
                          lw=0,
                          zorder=0)
                      
            if len(no_inference) > 0:
                no_inference_max_count = max(np_histogram(no_inference, bins=bins)[0])
                no_inference_weights = np_ones_like(no_inference) * (1.0 / no_inference_max_count)

                ax.hist(no_inference, bins=bins,
                          color=(0.3, 0.3, 0.3),
                          alpha=0.25,
                          weights=0.9 * (1.0 - w) * no_inference_weights,
                          bottom=i + n,
                          lw=0,
                          zorder=0)

            if len(poly) > 0:
                poly_max_count = max(np_histogram(poly, bins=bins)[0])
                poly_weights = np_ones_like(poly) * (1.0 / poly_max_count)

                ax.hist(poly, bins=bins,
                          color=(1.0, 0.0, 0.0),
                          alpha=0.25,
                          weights=0.9 * (1.0 - w) * poly_weights,
                          bottom=i + n,
                          lw=0,
                          zorder=0)

        scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

        # set plot elements
        ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

        ax.set_xlabel('relative distance')
        ax.set_xticks(np_arange(0, 1.05, 0.1))
        ax.set_xlim([-0.01, 1.01])

        ax.set_ylabel('rank (no. taxa)')
        ax.set_yticks(xrange(0, len(medians_for_taxa)))
        ax.set_ylim([-0.2, len(medians_for_taxa) - 0.01])
        ax.set_yticklabels(rank_labels)

        self.prettify(ax)

        # make plot interactive
        mpld3.plugins.clear(self.fig)
        mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
        mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
        mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

        self.fig.tight_layout(pad=1)
        self.fig.savefig(plot_file, dpi=self.dpi)
Example 8
    def _distribution_plot(self, rel_dists, taxa_for_dist_inference, distribution_table, plot_file):
        """Create plot showing the distribution of taxa at each taxonomic rank.

        Parameters
        ----------
        rel_dists: d[rank_index][taxon] -> relative divergence
            Relative divergence of taxa at each rank.
        taxa_for_dist_inference : iterable
            Taxa to consider when inferring distributions.
        distribution_table : str
            Desired name of output table with distribution information.
        plot_file : str
            Desired name of output plot.
        """

        self.fig.clear()
        self.fig.set_size_inches(12, 6)
        ax = self.fig.add_subplot(111)
        
        
        # create normal distributions
        for i, rank in enumerate(sorted(rel_dists.keys())):
            v = [dist for taxa, dist in rel_dists[rank].iteritems() if taxa in taxa_for_dist_inference]
            if len(v) < 2:
                continue
                
            u = np_mean(v)
            rv = norm(loc=u, scale=np_std(v))
            x = np_linspace(rv.ppf(0.001), rv.ppf(0.999), 1000)
            nd = rv.pdf(x)
            # ax.plot(x, 0.75 * (nd / max(nd)) + i, 'b-', alpha=0.6, zorder=2)
            # ax.plot((u, u), (i, i + 0.5), 'b-', zorder=2)

        # create percentile and classification boundary lines
        percentiles = {}
        for i, rank in enumerate(sorted(rel_dists.keys())):
            v = [dist for taxa, dist in rel_dists[rank].iteritems() if taxa in taxa_for_dist_inference]
            if len(v) == 0:
                continue
                
            p10, p50, p90 = np_percentile(v, [10, 50, 90])
            ax.plot((p10, p10), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
            ax.plot((p50, p50), (i, i + 0.5), c=(0.3, 0.3, 0.3), lw=2, zorder=2)
            ax.plot((p90, p90), (i, i + 0.25), c=(0.3, 0.3, 0.3), lw=2, zorder=2)

            for b in [-0.2, -0.1, 0.1, 0.2]:
                boundary = p50 + b
                if boundary < 1.0 and boundary > 0.0:
                    if abs(b) == 0.1:
                        c = (1.0, 0.65, 0.0)  # orange
                    else:
                        c = (1.0, 0.0, 0.0)
                    ax.plot((boundary, boundary), (i, i + 0.5), c=c, lw=2, zorder=2)

            percentiles[i] = [p10, p50, p90]

    
        # create scatter plot and results table
        fout = open(distribution_table, 'w')
        fout.write('Taxa\tRelative Distance\tP10\tMedian\tP90\tPercentile outlier\n')
        x = []
        y = []
        c = []
        labels = []
        rank_labels = []
        for i, rank in enumerate(sorted(rel_dists.keys())):
            rank_label = Taxonomy.rank_labels[rank]
            rank_labels.append(rank_label + ' (%d)' % len(rel_dists[rank]))
            
            mono = []
            poly = []
            no_inference = []
            for clade_label, dist in rel_dists[rank].iteritems():
                x.append(dist)
                y.append(i)
                labels.append(clade_label)

                if is_integer(clade_label.split('^')[-1]):
                    # taxa with a numerical suffix after a caret indicate 
                    # polyphyletic groups when decorated with tax2tree
                    c.append((1.0, 0.0, 0.0))
                    poly.append(dist)
                elif clade_label not in taxa_for_dist_inference:
                    c.append((0.3, 0.3, 0.3))
                    no_inference.append(dist)
                else:
                    c.append((0.0, 0.0, 1.0))
                    mono.append(dist)
            
                # report results
                v = [clade_label, dist]
                if i in percentiles:
                    p10, p50, p90 = percentiles[i]
                    percentile_outlier = not (dist >= p10 and dist <= p90)
                    v += percentiles[i] + [str(percentile_outlier)]
                else:
                    percentile_outlier = 'Insufficient data to calculate percentiles'
                    v += [-1,-1,-1] + [str(percentile_outlier)]
                
                fout.write('%s\t%.2f\t%.2f\t%.2f\t%.2f\t%s\n' % tuple(v))
        
            # histogram for each rank
            mono = np_array(mono)
            no_inference = np_array(no_inference)
            poly = np_array(poly)
            binwidth = 0.025
            bins = np_arange(0, 1.0 + binwidth, binwidth)

            w = float(len(mono)) / (len(mono) + len(poly) + len(no_inference))
            n = 0
            if len(mono) > 0:
                mono_max_count = max(np_histogram(mono, bins=bins)[0])
                mono_weights = np_ones_like(mono) * (1.0 / mono_max_count)

                n, b, p = ax.hist(mono, bins=bins,
                          color=(0.0, 0.0, 1.0),
                          alpha=0.25,
                          weights=0.9 * w * mono_weights,
                          bottom=i,
                          lw=0,
                          zorder=0)
                      
            if len(no_inference) > 0:
                no_inference_max_count = max(np_histogram(no_inference, bins=bins)[0])
                no_inference_weights = np_ones_like(no_inference) * (1.0 / no_inference_max_count)

                ax.hist(no_inference, bins=bins,
                          color=(0.3, 0.3, 0.3),
                          alpha=0.25,
                          weights=0.9 * (1.0 - w) * no_inference_weights,
                          bottom=i + n,
                          lw=0,
                          zorder=0)

            if len(poly) > 0:
                poly_max_count = max(np_histogram(poly, bins=bins)[0])
                poly_weights = np_ones_like(poly) * (1.0 / poly_max_count)

                ax.hist(poly, bins=bins,
                          color=(1.0, 0.0, 0.0),
                          alpha=0.25,
                          weights=0.9 * (1.0 - w) * poly_weights,
                          bottom=i + n,
                          lw=0,
                          zorder=0)
                          
        fout.close()

    
        # overlay scatter plot elements
        scatter = ax.scatter(x, y, alpha=0.5, s=48, c=c, zorder=1)

        # set plot elements
        ax.grid(color=(0.8, 0.8, 0.8), linestyle='dashed')

        ax.set_xlabel('relative distance')
        ax.set_xticks(np_arange(0, 1.05, 0.1))
        ax.set_xlim([-0.05, 1.05])

        ax.set_ylabel('rank (no. taxa)')
        ax.set_yticks(xrange(0, len(rel_dists)))
        ax.set_ylim([-0.2, len(rel_dists) - 0.01])
        ax.set_yticklabels(rank_labels)

        self.prettify(ax)

        # make plot interactive
        mpld3.plugins.clear(self.fig)
        mpld3.plugins.connect(self.fig, mpld3.plugins.PointLabelTooltip(scatter, labels=labels))
        mpld3.plugins.connect(self.fig, mpld3.plugins.MousePosition(fontsize=10))
        mpld3.save_html(self.fig, plot_file[0:plot_file.rfind('.')] + '.html')

        self.fig.tight_layout(pad=1)
        self.fig.savefig(plot_file, dpi=self.dpi)