Example #1
    def compatible(self, scaffolds_of_interest,
                        scaffold_stats,
                        genome_stats,
                        gc_per, td_per,
                        cov_corr, cov_perc,
                        report_type, output_file):
        """Identify scaffolds with compatible genomic characteristics.

        Compatible scaffolds are identified based on GC content,
        tetranucleotide signatures, coverage profile correlation, and
        mean absolute percent error of coverage profile. The coverage
        correlation check is ignored if the coverage profile consists of a
        single value.

        Every scaffold of interest is compared against every genome and one
        tab-separated row is written to `output_file` for each
        (scaffold, genome) pair that satisfies `report_type`.

        scaffolds_of_interest : d[scaffold_id] -> [no. genes, perc. genes with homology]
            Scaffolds to consider for compatibility.
        scaffold_stats : ScaffoldStats
            Statistics for individual scaffolds to check.
        genome_stats : GenomeStats
            Statistics for individual genomes.
        gc_per : int
            Percentile for identifying GC outliers.
        td_per : int
            Percentile for identifying TD outliers.
        cov_corr : int
            Correlation for identifying divergent coverage profiles.
        cov_perc : int
            Mean absolute percent error for identifying divergent coverage profiles.
        report_type : str
            Report scaffolds that are compatible in 'all' or 'any' distribution.
        output_file : str
            Name of output file.
        """

        # read reference distributions from file
        self.logger.info('Reading reference distributions.')
        self.gc_dist = self._read_distribution('gc_dist')
        self.td_dist = self._read_distribution('td_dist')

        num_scaffolds = len(scaffold_stats.stats)

        # the context manager guarantees the report is flushed and closed,
        # even if one of the distribution lookups below raises
        with open(output_file, 'w') as fout:
            fout.write('Scaffold id\tGenome id\tScaffold length (bp)\tCompatible distributions')
            fout.write('\tScaffold GC\tMedian genome GC\tLower GC bound (%s%%)\tUpper GC bound (%s%%)' % (gc_per, gc_per))
            fout.write('\tScaffold TD\tMedian genome TD\tUpper TD bound (%s%%)' % td_per)
            fout.write('\tScaffold coverage\tMedian genome coverage\tCoverage correlation\tCoverage error')
            fout.write('\t# genes\t% genes with homology\n')

            genomic_signature = GenomicSignature(0)

            self.logger.info('Identifying scaffolds compatible with bins.')
            processed_scaffolds = 0
            for scaffold_id, ss in scaffold_stats.stats.items():
                processed_scaffolds += 1
                if not self.logger.is_silent:
                    # the format string has three fields: count, total, percentage
                    sys.stdout.write('  Processed {:,} of {:,} ({:.1f}%) scaffolds.\r'.format(
                                        processed_scaffolds,
                                        num_scaffolds,
                                        processed_scaffolds * 100.0 / num_scaffolds))
                    sys.stdout.flush()

                if scaffold_id not in scaffolds_of_interest:
                    continue

                for genome_id, gs in genome_stats.items():
                    # find keys into GC and TD distributions
                    # gc -> [mean GC][scaffold length][percentile]
                    # td -> [scaffold length][percentile]
                    closest_gc = find_nearest(list(self.gc_dist.keys()), gs.median_gc / 100.0)
                    sample_seq_len = list(self.gc_dist[closest_gc].keys())[0]
                    d = self.gc_dist[closest_gc][sample_seq_len]
                    gc_lower_bound_key = find_nearest(list(d.keys()), (100 - gc_per) / 2.0)
                    gc_upper_bound_key = find_nearest(list(d.keys()), (100 + gc_per) / 2.0)

                    td_bound_key = find_nearest(list(self.td_dist[list(self.td_dist.keys())[0]].keys()), td_per)

                    # find GC and TD bounds
                    closest_seq_len = find_nearest(list(self.gc_dist[closest_gc].keys()), ss.length)
                    gc_lower_bound = self.gc_dist[closest_gc][closest_seq_len][gc_lower_bound_key]
                    gc_upper_bound = self.gc_dist[closest_gc][closest_seq_len][gc_upper_bound_key]

                    closest_seq_len = find_nearest(list(self.td_dist.keys()), ss.length)
                    td_bound = self.td_dist[closest_seq_len][td_bound_key]

                    # find changes from median GC and mean signature
                    delta_gc = (ss.gc - gs.median_gc) / 100.0
                    delta_td = genomic_signature.manhattan(ss.signature, gs.mean_signature)

                    # determine if scaffold compatible; each satisfied
                    # criterion contributes one label to the report column
                    compatible_dists = []
                    if gc_lower_bound <= delta_gc <= gc_upper_bound:
                        compatible_dists.append('GC')

                    if delta_td <= td_bound:
                        compatible_dists.append('TD')

                    # correlation is undefined for a single-sample profile,
                    # so the check is skipped and 1.0 is reported
                    corr_r = 1.0
                    if len(gs.median_coverage) > 1:
                        corr_r, _corr_p = pearsonr(gs.median_coverage, ss.coverage)
                        if corr_r >= cov_corr:
                            compatible_dists.append('COV_CORR')

                    # only samples where the genome has sufficient coverage
                    # contribute to the mean absolute percent error
                    perc_errors = [abs(cov_genome - cov_scaffold) * 100.0 / cov_genome
                                   for cov_genome, cov_scaffold in zip(gs.median_coverage, ss.coverage)
                                   if cov_genome >= self.min_required_coverage]

                    # an empty list yields NaN, which never passes the
                    # threshold below (same outcome as averaging an empty
                    # list, without the runtime warning)
                    mean_cp = np_mean(perc_errors) if perc_errors else float('nan')
                    if mean_cp <= cov_perc:
                        compatible_dists.append('COV_PERC')

                    # report compatible scaffolds; 'all' requires three
                    # criteria since the correlation check may be skipped
                    if (report_type == 'any' and len(compatible_dists) >= 1) or (report_type == 'all' and len(compatible_dists) >= 3):
                        fout.write('%s\t%s\t%s\t%s' % (scaffold_id, genome_id, ss.length, ','.join(compatible_dists)))
                        fout.write('\t%.2f\t%.2f\t%.2f\t%.2f' % (ss.gc, gs.median_gc, gs.median_gc + gc_lower_bound * 100, gs.median_gc + gc_upper_bound * 100))
                        fout.write('\t%.3f\t%.3f\t%.3f' % (delta_td, gs.median_td, td_bound))
                        fout.write('\t%.2f\t%.2f\t%.2f\t%.2f' % (np_mean(ss.coverage), np_mean(gs.median_coverage), corr_r, mean_cp))
                        # terminate the row so each pair is on its own line
                        fout.write('\t%d\t%.1f\n' % (scaffolds_of_interest[scaffold_id][0], scaffolds_of_interest[scaffold_id][1]))

        if not self.logger.is_silent:
            # move past the carriage-return progress line
            sys.stdout.write('\n')
    def plot_on_axes(self, figure,
                     genome_scaffold_stats,
                     highlight_scaffold_ids, link_scaffold_ids,
                     mean_signature, td_dist, percentiles_to_plot,
                     axes_hist, axes_scatter, tooltip_plugin):
        """Create histogram and scatterplot.

        figure : matplotlib.figure
            Figure on which to render axes.
        genome_scaffold_stats : d[scaffold_id] -> namedtuple of scaffold stats
            Statistics for scaffolds in genome.
        highlight_scaffold_ids : d[scaffold_id] -> color
            Scaffolds in genome to highlight.
        link_scaffold_ids : list of scaffold pairs
            Pairs of scaffolds to link together.
        mean_signature : float
            Mean tetranucleotide signature of genome.
        td_dist : d[length][percentile] -> critical value
            TD distribution.
        percentiles_to_plot : iterable
            Percentile values to mark on plot.
        axes_hist : axes or None
            Axes for the histogram; the histogram is skipped when falsy.
        axes_scatter : axes
            Axes for the scatterplot.
        tooltip_plugin : bool
            If true, connect an mpld3 tooltip to the scatterplot.

        Returns the scatter object produced by self.scatter().
        """

        # tetranucleotide distance of each scaffold to the genome signature
        genomic_signature = GenomicSignature(0)
        delta_tds = {}
        for scaffold_id, stats in genome_scaffold_stats.items():
            delta_tds[scaffold_id] = genomic_signature.manhattan(stats.signature, mean_signature)

        # histogram plot
        if axes_hist:
            axes_hist.hist(list(delta_tds.values()), bins=20, color=(0.5, 0.5, 0.5))
            axes_hist.set_xlabel('tetranucleotide distance')
            axes_hist.set_ylabel('# scaffolds (out of %d)' % len(delta_tds))

        # scatterplot
        xlabel = 'tetranucleotide distance'
        ylabel = 'Scaffold length (kbp)'

        scaffold_stats = {}
        for scaffold_id, stats in genome_scaffold_stats.items():
            scaffold_stats[scaffold_id] = (delta_tds[scaffold_id], stats.length / 1000.0)

        scatter, labels = self.scatter(axes_scatter,
                                         scaffold_stats,
                                         highlight_scaffold_ids,
                                         link_scaffold_ids,
                                         xlabel, ylabel)

        # remember the limits chosen for the data so the reference
        # curves drawn below cannot rescale the axes
        _, ymax = axes_scatter.get_ylim()
        xmin, xmax = axes_scatter.get_xlim()

        # plot reference distributions
        for percentile in percentiles_to_plot:
            # find closest distribution values
            sample_len = list(td_dist.keys())[0]
            td_bound_key = find_nearest(list(td_dist[sample_len].keys()), percentile)

            x = []
            y = []
            for window_size in td_dist:
                x.append(td_dist[window_size][td_bound_key])
                y.append(window_size / 1000.0)

            # sort by y-values
            sort_indexY = np.argsort(y)
            x = np.array(x)[sort_indexY]
            y = np.array(y)[sort_indexY]

            # make sure x-values never increase as y increases
            # as this is conservative and visually satisfying
            last = len(x) - 1
            for i in range(last):
                for j in range(i + 1, len(x)):
                    if x[j] > x[i]:
                        if j == last:
                            # no right-hand neighbour to interpolate with
                            x[j] = x[i]
                        else:
                            x[j] = (x[j - 1] + x[j + 1]) / 2.0  # interpolate values from neighbours

                        if x[j] > x[i]:
                            x[j] = x[i]

            axes_scatter.plot(x, y, 'r--', lw=1.0, zorder=0)

        # ensure y-axis include zero and covers all sequences
        axes_scatter.set_ylim([0, ymax])

        # ensure x-axis is set appropriately for sequences
        axes_scatter.set_xlim([xmin, xmax])

        # NOTE(review): the original had a "prettify scatterplot" comment here
        # with no code after it -- confirm whether a styling call is missing.

        # tooltips plugin
        if tooltip_plugin:
            tooltip = Tooltip(scatter, labels=labels, hoffset=5, voffset=-15)
            mpld3.plugins.connect(figure, tooltip)

        return scatter
    def outlier_info(self,
        # Collects, for each id in scaffold_ids, an OutlierInfo record and the
        # list of distributions in which the scaffold is an outlier relative to
        # the genome genome_id; returns (outlying_stats, outlying_dists).
        #
        # NOTE(review): the parameter list is cut off after `self,` and the
        # closing `):` is missing. The body reads genome_id, scaffold_ids,
        # scaffold_stats, genome_stats, gc_per, td_per, cov_corr and cov_perc,
        # so those presumably belong here -- restore from the upstream file.

        genomic_signature = GenomicSignature(0)
        # make sure distributions have been loaded
        # NOTE(review): no statement follows the comment above, yet
        # self.gc_dist and self.td_dist are read below -- a loader call
        # appears to be missing.
        # find keys into GC and TD distributions
        # gc -> [mean GC][scaffold length][percentile]
        # td -> [scaffold length][percentile]
        gs = genome_stats[genome_id]
        closest_gc = find_nearest(list(self.gc_dist.keys()), gs.median_gc / 100.0)
        sample_seq_len = list(self.gc_dist[closest_gc].keys())[0]
        d = self.gc_dist[closest_gc][sample_seq_len]
        gc_lower_bound_key = find_nearest(list(d.keys()), (100 - gc_per) / 2.0)
        gc_upper_bound_key = find_nearest(list(d.keys()), (100 + gc_per) / 2.0)

        td_bound_key = find_nearest(list(self.td_dist[list(self.td_dist.keys())[0]].keys()), td_per)
        # outlying_stats: scaffold id -> OutlierInfo record
        # outlying_dists: scaffold id -> list (defaults to empty)
        outlying_stats = {}
        outlying_dists = defaultdict(list)
        for scaffold_id in scaffold_ids:
            # drop everything from the last '-#' onwards so that the
            # statistics lookup uses the base scaffold id
            base_scaffold_id = scaffold_id
            if '-#' in scaffold_id:
                base_scaffold_id = base_scaffold_id[0:base_scaffold_id.rfind('-#')]
            stats = scaffold_stats.stats[base_scaffold_id]

            # find GC and TD bounds
            closest_seq_len = find_nearest(list(self.gc_dist[closest_gc].keys()), stats.length)
            gc_lower_bound = self.gc_dist[closest_gc][closest_seq_len][gc_lower_bound_key]
            gc_upper_bound = self.gc_dist[closest_gc][closest_seq_len][gc_upper_bound_key]

            closest_seq_len = find_nearest(list(self.td_dist.keys()), stats.length)
            td_bound = self.td_dist[closest_seq_len][td_bound_key]

            # find changes from median
            delta_gc = (stats.gc - gs.median_gc) / 100.0
            delta_td = genomic_signature.manhattan(stats.signature, gs.mean_signature)

            # determine if scaffold is an outlier
            # NOTE(review): the two `if` statements below have no body;
            # presumably each recorded an entry in outlying_dists[scaffold_id]
            # -- confirm against the upstream file.
            if delta_gc < gc_lower_bound or delta_gc > gc_upper_bound:

            if delta_td > td_bound:
            # care is required for coverage, since this information
            # is not always provided
            if len(gs.median_coverage) >= 1: 
                # there is coverage information
                mean_genome_cov = np_mean(gs.median_coverage)
                if len(stats.coverage) == 0:
                    # however, this scaffold has no reported 
                    # coverage so flag it as a likely outlier
                    mean_scaffold_cov = 0
                    corr_r = -1000
                    mean_cp_err = -1000
                    # NOTE(review): the next assignment overwrites the 0 set
                    # above, so an `else:` (and possibly the flagging
                    # statements) appears to be missing before it.
                    mean_scaffold_cov = np_mean(stats.coverage)

                    corr_r = 1.0
                    if len(gs.median_coverage) > 1:
                            # NOTE(review): these lines are indented one level
                            # deeper than the `if` above and the warnings read
                            # like an error handler, so a `try:` / `except`
                            # pair and the body of `if corr_r < cov_corr:`
                            # appear to be missing.
                            corr_r, _corr_p = pearsonr(gs.median_coverage, stats.coverage)
                            if corr_r < cov_corr:
                            self.logger.warning('Failed to calculate Pearson correlation for %s.' % scaffold_id)
                            if sum(gs.median_coverage) == 0:
                                self.logger.warning('Median coverage of %s is zero across all samples.' % genome_id)
                            if sum(stats.coverage) == 0:
                                self.logger.warning('Contig %s has zero coverage across all samples.' % scaffold_id)

                    # mean absolute percent error; the denominator is floored
                    # at self.min_required_coverage
                    mean_cp_err = []
                    for cov_genome, cov_scaffold in zip(gs.median_coverage, stats.coverage):
                        mean_cp_err.append(abs(cov_scaffold - cov_genome) * 100.0 / max(cov_genome, self.min_required_coverage))
                    mean_cp_err = np_mean(mean_cp_err)                        
                    # NOTE(review): this `if` has no body, and the comment
                    # below suggests an `else:` for the outer coverage check
                    # is missing before the default assignments.
                    if mean_cp_err > cov_perc:
                # no coverage information was provided
                mean_genome_cov = 0
                mean_scaffold_cov = 0
                corr_r = 1.0
                mean_cp_err = 0.0
            # NOTE(review): the OutlierInfo(...) call below is cut off after
            # its third argument; the remaining arguments and the closing
            # parenthesis must be restored from the upstream file.
            outlying_stats[scaffold_id] = self.OutlierInfo(stats.length,
                                                            gs.median_gc + gc_lower_bound * 100,
                                                            gs.median_gc + gc_upper_bound * 100,
        return outlying_stats, outlying_dists
    def plot_on_axes(self, figure,
                     genome_scaffold_stats,
                     highlight_scaffold_ids, link_scaffold_ids,
                     mean_gc, gc_dist, percentiles_to_plot,
                     axes_hist, axes_scatter, tooltip_plugin):
        """Create histogram and scatterplot.

        figure : matplotlib.figure
            Figure on which to render axes.
        genome_scaffold_stats : d[scaffold_id] -> namedtuple of scaffold stats
            Statistics for scaffolds in genome.
        highlight_scaffold_ids : d[scaffold_id] -> color
            Scaffolds in genome to highlight.
        link_scaffold_ids : list of scaffold pairs
            Pairs of scaffolds to link together.
        mean_gc : float
            Mean GC of genome.
        gc_dist : d[gc][length][percentile] -> critical value
            GC distribution.
        percentiles_to_plot : iterable
            Percentile values to mark on plot.
        axes_hist : axes or None
            Axes for the histogram; the histogram is skipped when falsy.
        axes_scatter : axes
            Axes for the scatterplot.
        tooltip_plugin : bool
            If true, connect an mpld3 tooltip to the scatterplot.

        Returns the scatter object produced by self.scatter().
        """

        # histogram plot
        if axes_hist:
            scaffold_gc = [stats.gc for stats in genome_scaffold_stats.values()]
            ylabel = '# scaffolds (out of %d)' % len(scaffold_gc)
            self.histogram(axes_hist, scaffold_gc, 20, 80, 2, '% GC', ylabel)

        # scatterplot
        xlabel = 'deviation in GC (mean GC = %.1f%%)' % mean_gc
        ylabel = 'Scaffold length (kbp)'

        scaffold_stats = {}
        for scaffold_id, stats in genome_scaffold_stats.items():
            scaffold_stats[scaffold_id] = (stats.gc - mean_gc, stats.length / 1000.0)

        scatter, labels = self.scatter(axes_scatter,
                                         scaffold_stats,
                                         highlight_scaffold_ids,
                                         link_scaffold_ids,
                                         xlabel, ylabel)

        # remember the limits chosen for the data so the reference
        # curves drawn below cannot rescale the axes
        _, ymax = axes_scatter.get_ylim()
        xmin, xmax = axes_scatter.get_xlim()

        # draw vertical line at x=0
        axes_scatter.plot([0, 0], [0, ymax], linestyle='dashed', color=self.axes_colour, lw=1.0, zorder=0)

        # plot reference distributions; dict views are materialised as lists
        # so that indexing and array conversion also work on Python 3
        closest_gc = find_nearest(np.array(list(gc_dist.keys())), mean_gc / 100.0)
        length_dists = gc_dist[closest_gc]
        for percentile in percentiles_to_plot:
            # find closest distribution values
            temp_scaffold_len = list(length_dists.keys())[0]
            percentile_keys = list(length_dists[temp_scaffold_len].keys())
            gc_lower_bound_key = find_nearest(percentile_keys, (100 - percentile) / 2.0)
            gc_upper_bound_key = find_nearest(percentile_keys, (100 + percentile) / 2.0)

            xL = []
            xU = []
            y = []
            for window_size in length_dists:
                # critical values are fractions; the x-axis is in percent
                xL.append(length_dists[window_size][gc_lower_bound_key] * 100)
                xU.append(length_dists[window_size][gc_upper_bound_key] * 100)
                y.append(window_size / 1000.0)

            # sort by y-values
            sort_indexY = np.argsort(y)
            xL = np.array(xL)[sort_indexY]
            xU = np.array(xU)[sort_indexY]
            y = np.array(y)[sort_indexY]
            axes_scatter.plot(xL, y, 'r--', lw=1.0, zorder=0)
            axes_scatter.plot(xU, y, 'r--', lw=1.0, zorder=0)

        # ensure y-axis include zero and covers all sequences
        axes_scatter.set_ylim([0, ymax])

        # ensure x-axis is set appropriately for sequences
        axes_scatter.set_xlim([xmin, xmax])

        # tooltips plugin
        if tooltip_plugin:
            tooltip = Tooltip(scatter, labels=labels, hoffset=5, voffset=-15)
            mpld3.plugins.connect(figure, tooltip)

        return scatter
    def identify(self, scaffold_stats, genome_stats,
                        gc_per, td_per,
                        cov_corr, cov_perc,
                        report_type, output_file):
        """Identify scaffolds with divergent genomic characteristics.

        Outliers are identified independently based on GC content,
        tetranucleotide signatures, coverage profile correlation, and
        mean absolute percent error of coverage profile. The coverage
        correlation check is ignored if the coverage profile consists of a
        single value.

        One tab-separated row is written to `output_file` for each scaffold
        that satisfies `report_type`.

        scaffold_stats : ScaffoldStats
            Statistics for individual scaffolds.
        genome_stats : GenomeStats
            Statistics for individual genomes.
        gc_per : int
            Percentile for identifying GC outliers.
        td_per : int
            Percentile for identifying TD outliers.
        cov_corr : int
            Correlation for identifying divergent coverage profiles.
        cov_perc : int
            Mean absolute percent error for identifying divergent coverage profiles.
        report_type : str
            Report scaffolds that are outliers in 'all' or 'any' distribution.
        output_file : str
            Name of output file.
        """

        # read reference distributions from file
        self.logger.info('  Reading reference distributions.')
        self.gc_dist = self._read_distribution('gc_dist')
        self.td_dist = self._read_distribution('td_dist')

        # identify outliers in each genome; the context manager guarantees
        # the report is flushed and closed even if a lookup below raises
        with open(output_file, 'w') as fout:
            fout.write('Scaffold id\tGenome id\tScaffold length (bp)\tOutlying distributions')
            fout.write('\tScaffold GC\tMean genome GC\tLower GC bound (%s%%)\tUpper GC bound (%s%%)' % (gc_per, gc_per))
            fout.write('\tScaffold TD\tMean genome TD\tUpper TD bound (%s%%)' % td_per)
            fout.write('\tMean scaffold coverage\tMean genome coverage\tCoverage correlation\tMean coverage error\n')

            genomic_signature = GenomicSignature(0)

            processed_genomes = 0
            for genome_id, scaffold_ids in scaffold_stats.scaffolds_in_genome.items():
                processed_genomes += 1

                # the format string has three fields: count, total, percentage
                num_genomes = scaffold_stats.num_genomes()
                sys.stdout.write('    Finding outliers in %d of %d (%.1f%%) genomes.\r' % (processed_genomes,
                                                                                         num_genomes,
                                                                                         processed_genomes * 100.0 / num_genomes))
                sys.stdout.flush()

                # find keys into GC and TD distributions
                # gc -> [mean GC][scaffold length][percentile]
                # td -> [scaffold length][percentile]
                gs = genome_stats[genome_id]
                closest_gc = find_nearest(list(self.gc_dist.keys()), gs.mean_gc / 100.0)
                sample_seq_len = list(self.gc_dist[closest_gc].keys())[0]
                d = self.gc_dist[closest_gc][sample_seq_len]
                gc_lower_bound_key = find_nearest(list(d.keys()), (100 - gc_per) / 2.0)
                gc_upper_bound_key = find_nearest(list(d.keys()), (100 + gc_per) / 2.0)

                td_bound_key = find_nearest(list(self.td_dist[list(self.td_dist.keys())[0]].keys()), td_per)

                for scaffold_id in scaffold_ids:
                    stats = scaffold_stats.stats[scaffold_id]

                    # find GC and TD bounds
                    closest_seq_len = find_nearest(list(self.gc_dist[closest_gc].keys()), stats.length)
                    gc_lower_bound = self.gc_dist[closest_gc][closest_seq_len][gc_lower_bound_key]
                    gc_upper_bound = self.gc_dist[closest_gc][closest_seq_len][gc_upper_bound_key]

                    closest_seq_len = find_nearest(list(self.td_dist.keys()), stats.length)
                    td_bound = self.td_dist[closest_seq_len][td_bound_key]

                    # find changes from mean
                    delta_gc = (stats.gc - gs.mean_gc) / 100.0
                    delta_td = genomic_signature.manhattan(stats.signature, gs.mean_signature)

                    # determine if scaffold is an outlier; each violated
                    # criterion contributes one label to the report column
                    outlying_dists = []
                    if delta_gc < gc_lower_bound or delta_gc > gc_upper_bound:
                        outlying_dists.append('GC')

                    if delta_td > td_bound:
                        outlying_dists.append('TD')

                    # correlation is undefined for a single-sample profile,
                    # so the check is skipped and 1.0 is reported
                    corr_r = 1.0
                    if len(gs.mean_coverage) > 1:
                        corr_r, _corr_p = pearsonr(gs.mean_coverage, stats.coverage)
                        if corr_r < cov_corr:
                            outlying_dists.append('COV_CORR')

                    # only samples where the genome has sufficient coverage
                    # contribute to the mean absolute percent error
                    perc_errors = [abs(cov_scaffold - cov_genome) * 100.0 / cov_genome
                                   for cov_genome, cov_scaffold in zip(gs.mean_coverage, stats.coverage)
                                   if cov_genome >= self.min_required_coverage]

                    if not perc_errors:
                        # genome has no sample with sufficient coverage, which
                        # in general indicates something is wrong; report the
                        # sentinel -1 as the error.
                        # NOTE(review): the scaffold is not flagged in this
                        # case -- confirm whether it should also count as a
                        # COV_PERC outlier.
                        mean_cp = -1
                    else:
                        mean_cp = np_mean(perc_errors)
                        if mean_cp > cov_perc:
                            outlying_dists.append('COV_PERC')

                    # report outliers; 'all' requires three criteria since
                    # the correlation check may be skipped
                    if (report_type == 'any' and len(outlying_dists) >= 1) or (report_type == 'all' and len(outlying_dists) >= 3):
                        fout.write('%s\t%s\t%s\t%s' % (scaffold_id, genome_id, stats.length, ','.join(outlying_dists)))
                        fout.write('\t%.2f\t%.2f\t%.2f\t%.2f' % (stats.gc, gs.mean_gc, gs.mean_gc + gc_lower_bound * 100, gs.mean_gc + gc_upper_bound * 100))
                        fout.write('\t%.3f\t%.3f\t%.3f' % (delta_td, gs.mean_td, td_bound))
                        # terminate the row so each scaffold is on its own line
                        fout.write('\t%.2f\t%.2f\t%.2f\t%.2f\n' % (np_mean(stats.coverage), np_mean(gs.mean_coverage), corr_r, mean_cp))

        # move past the carriage-return progress line
        sys.stdout.write('\n')

    def plot_on_axes(self, figure, genome_scaffold_stats,
                     highlight_scaffold_ids, link_scaffold_ids, mean_gc,
                     gc_dist, percentiles_to_plot, axes_hist, axes_scatter,
                     tooltip_plugin):
        """Create histogram and scatterplot.

        figure : matplotlib.figure
            Figure on which to render axes.
        genome_scaffold_stats : d[scaffold_id] -> namedtuple of scaffold stats
            Statistics for scaffolds in genome.
        highlight_scaffold_ids : d[scaffold_id] -> color
            Scaffolds in genome to highlight.
        link_scaffold_ids : list of scaffold pairs
            Pairs of scaffolds to link together.
        mean_gc : float
            Mean GC of genome.
        gc_dist : d[gc][length][percentile] -> critical value
            GC distribution.
        percentiles_to_plot : iterable
            Percentile values to mark on plot.
        axes_hist : axes or None
            Axes for the histogram; the histogram is skipped when falsy.
        axes_scatter : axes
            Axes for the scatterplot.
        tooltip_plugin : bool
            If true, connect an mpld3 tooltip to the scatterplot.

        Returns the scatter object, the x and y points reported by
        self.scatter(), and self.plot_order() applied to the plot labels.
        """

        # histogram plot
        if axes_hist:
            scaffold_gc = [
                stats.gc for stats in genome_scaffold_stats.values()
            ]
            ylabel = '# scaffolds (out of %d)' % len(scaffold_gc)
            self.histogram(axes_hist, scaffold_gc, 20, 80, 2, '% GC', ylabel)

        # scatterplot
        xlabel = 'delta GC (mean = %.1f%%)' % mean_gc
        ylabel = 'Scaffold length (kbp)'

        pts = self.data_pts(genome_scaffold_stats, mean_gc)

        scatter, x_pts, y_pts, plot_labels = self.scatter(
            axes_scatter, pts, highlight_scaffold_ids, link_scaffold_ids,
            xlabel, ylabel)

        # remember the limits chosen for the data so the reference
        # curves drawn below cannot rescale the axes
        _, ymax = axes_scatter.get_ylim()
        xmin, xmax = axes_scatter.get_xlim()

        # draw vertical line at x=0
        axes_scatter.plot([0, 0], [0, ymax],
                          linestyle='dashed',
                          color=self.axes_colour,
                          lw=1.0,
                          zorder=0)

        # plot reference distributions; dict views are materialised as lists
        # so that indexing and array conversion also work on Python 3
        closest_gc = find_nearest(np.array(list(gc_dist.keys())), mean_gc / 100.0)
        length_dists = gc_dist[closest_gc]
        for percentile in percentiles_to_plot:
            # find closest distribution values
            temp_scaffold_len = list(length_dists.keys())[0]
            percentile_keys = list(length_dists[temp_scaffold_len].keys())
            gc_lower_bound_key = find_nearest(percentile_keys,
                                              (100 - percentile) / 2.0)
            gc_upper_bound_key = find_nearest(percentile_keys,
                                              (100 + percentile) / 2.0)

            xL = []
            xU = []
            y = []
            for window_size in length_dists:
                # critical values are fractions; the x-axis is in percent
                xL.append(
                    length_dists[window_size][gc_lower_bound_key] * 100)
                xU.append(
                    length_dists[window_size][gc_upper_bound_key] * 100)
                y.append(window_size / 1000.0)

            # sort by y-values
            sort_indexY = np.argsort(y)
            xL = np.array(xL)[sort_indexY]
            xU = np.array(xU)[sort_indexY]
            y = np.array(y)[sort_indexY]
            axes_scatter.plot(xL, y, 'r--', lw=1.0, zorder=0)
            axes_scatter.plot(xU, y, 'r--', lw=1.0, zorder=0)

        # ensure y-axis include zero and covers all sequences
        axes_scatter.set_ylim([0, ymax])

        # ensure x-axis is set appropriately for sequences
        axes_scatter.set_xlim([xmin, xmax])

        # tooltips plugin
        if tooltip_plugin:
            tooltip = Tooltip(scatter,
                              labels=plot_labels,
                              hoffset=5,
                              voffset=-15)
            mpld3.plugins.connect(figure, tooltip)

        return scatter, x_pts, y_pts, self.plot_order(plot_labels)
    def plot_on_axes(self, figure,
                     genome_scaffold_stats,
                     highlight_scaffold_ids, link_scaffold_ids,
                     mean_signature, td_dist, percentiles_to_plot,
                     axes_hist, axes_scatter, tooltip_plugin):
        """Create histogram and scatterplot.

        figure : matplotlib.figure
            Figure on which to render axes.
        genome_scaffold_stats : d[scaffold_id] -> namedtuple of scaffold stats
            Statistics for scaffolds in genome.
        highlight_scaffold_ids : d[scaffold_id] -> color
            Scaffolds in genome to highlight.
        link_scaffold_ids : list of scaffold pairs
            Pairs of scaffolds to link together.
        mean_signature : float
            Mean tetranucleotide signature of genome.
        td_dist : d[length][percentile] -> critical value
            TD distribution.
        percentiles_to_plot : iterable
            Percentile values to mark on plot.
        axes_hist : axes or None
            Axes for the histogram; the histogram is skipped when falsy.
        axes_scatter : axes
            Axes for the scatterplot.
        tooltip_plugin : bool
            If true, connect an mpld3 tooltip to the scatterplot.

        Returns the scatter object, the x and y points reported by
        self.scatter(), and self.plot_order() applied to the plot labels.
        """

        # histogram plot
        genomic_signature = GenomicSignature(0)

        delta_tds = [genomic_signature.manhattan(stats.signature, mean_signature)
                     for stats in genome_scaffold_stats.values()]

        if axes_hist:
            axes_hist.hist(delta_tds, bins=20, color=(0.5, 0.5, 0.5))
            axes_hist.set_xlabel('tetranucleotide distance')
            axes_hist.set_ylabel('# scaffolds (out of %d)' % len(delta_tds))

        # scatterplot
        xlabel = 'tetranucleotide distance'
        ylabel = 'Scaffold length (kbp)'

        pts = self.data_pts(genome_scaffold_stats, mean_signature)
        scatter, x_pts, y_pts, plot_labels = self.scatter(axes_scatter,
                                                             pts,
                                                             highlight_scaffold_ids,
                                                             link_scaffold_ids,
                                                             xlabel, ylabel)

        # remember the limits chosen for the data so the reference
        # curves drawn below cannot rescale the axes
        _, ymax = axes_scatter.get_ylim()
        xmin, xmax = axes_scatter.get_xlim()

        # plot reference distributions
        for percentile in percentiles_to_plot:
            # find closest distribution values
            sample_len = list(td_dist.keys())[0]
            td_bound_key = find_nearest(list(td_dist[sample_len].keys()), percentile)

            x = []
            y = []
            for window_size in td_dist:
                x.append(td_dist[window_size][td_bound_key])
                y.append(window_size / 1000.0)

            # sort by y-values
            sort_indexY = np.argsort(y)
            x = np.array(x)[sort_indexY]
            y = np.array(y)[sort_indexY]

            # make sure x-values never increase as y increases
            # as this is conservative and visually satisfying
            last = len(x) - 1
            for i in range(last):
                for j in range(i + 1, len(x)):
                    if x[j] > x[i]:
                        if j == last:
                            # no right-hand neighbour to interpolate with
                            x[j] = x[i]
                        else:
                            x[j] = (x[j - 1] + x[j + 1]) / 2.0  # interpolate values from neighbours

                        if x[j] > x[i]:
                            x[j] = x[i]

            axes_scatter.plot(x, y, 'r--', lw=1.0, zorder=0)

        # ensure y-axis include zero and covers all sequences
        axes_scatter.set_ylim([0, ymax])

        # ensure x-axis is set appropriately for sequences
        axes_scatter.set_xlim([xmin, xmax])

        # NOTE(review): the original had a "prettify scatterplot" comment here
        # with no code after it -- confirm whether a styling call is missing.

        # tooltips plugin
        if tooltip_plugin:
            tooltip = Tooltip(scatter, labels=plot_labels, hoffset=5, voffset=-15)
            mpld3.plugins.connect(figure, tooltip)

        return scatter, x_pts, y_pts, self.plot_order(plot_labels)