Beispiel #1
0
    def __init__(self, cpus=1):
        """Store the cpu count and create the logger and signature helper.

        Parameters
        ----------
        cpus : int
            Number of cpus to use.
        """
        self.cpus = cpus

        # GenomicSignature is constructed with a fixed size of 4
        self.signatures = GenomicSignature(4)

        self.logger = logging.getLogger('timestamp')
Beispiel #2
0
    def __init__(self, k, cpus=1):
        """Store settings, announce the kmer size and build the signature helper.

        Parameters
        ----------
        k : int
            Kmer size; written to the log and passed to GenomicSignature.
        cpus : int
            Number of cpus to use.
        """
        self.k = k
        self.cpus = cpus
        self.logger = logging.getLogger('timestamp')

        message = 'Calculating unique kmers of size k = %d.' % self.k
        self.logger.info(message)

        self.signatures = GenomicSignature(self.k)
Beispiel #3
0
    def data_pts(self, genome_scaffold_stats, mean_signature):
        """Get data points to plot.

        Parameters
        ----------
        genome_scaffold_stats : d[scaffold_id] -> namedtuple of scaffold stats
          Statistics for scaffolds in genome. Each entry must provide
          `signature` and `length` attributes.
        mean_signature : sequence of float
          Signature every scaffold signature is compared against
          (presumably the mean signature of the genome; confirm with caller).

        Returns
        -------
        dict : d[scaffold_id] -> (x, y)
          x is the value returned by GenomicSignature.manhattan() for the
          scaffold signature and `mean_signature`; y is the scaffold length
          divided by 1000.
        """

        genomic_signature = GenomicSignature(0)

        # items() instead of the Python 2-only iteritems() so the method
        # runs unchanged on both Python 2 and Python 3
        return {
            scaffold_id: (genomic_signature.manhattan(stats.signature,
                                                      mean_signature),
                          stats.length / 1000.0)
            for scaffold_id, stats in genome_scaffold_stats.items()
        }
Beispiel #4
0
    def __init__(self, cpus=1):
        """Store the cpu count and create the logger and signature helper.

        Parameters
        ----------
        cpus : int
            Number of cpus to use.
        """
        self.cpus = cpus

        # GenomicSignature is constructed with a fixed size of 4
        self.signatures = GenomicSignature(4)

        # root logger
        self.logger = logging.getLogger()
Beispiel #5
0
    def __init__(self, k, cpus=1):
        """Store settings, announce the kmer size and build the signature helper.

        Parameters
        ----------
        k : int
            Kmer size; written to the log and passed to GenomicSignature.
        cpus : int
            Number of cpus to use.
        """
        self.k = k
        self.cpus = cpus

        # root logger
        self.logger = logging.getLogger()

        message = '  Calculating unique kmers of size k = %d.' % self.k
        self.logger.info(message)

        self.signatures = GenomicSignature(self.k)
Beispiel #6
0
    def run(self, scaffold_stats):
        """Summarise scaffold statistics on a per-genome basis.

        Scaffolds whose genome id equals `scaffold_stats.unbinned` are
        ignored. Means and medians are weighted by scaffold length.

        Parameters
        ----------
        scaffold_stats : ScaffoldStats
            Statistics for individual scaffolds.

        Returns
        -------
        dict : d[genome_id] -> GenomeStats
            Per-genome statistics; also stored in `self.genome_stats`.
        """

        num_genomes = scaffold_stats.num_genomes()
        num_scaffolds = scaffold_stats.num_scaffolds()
        self.logger.info(
            "Calculating statistics for {:,} genomes over {:,} scaffolds.".
            format(num_genomes, num_scaffolds))

        self.coverage_headers = scaffold_stats.coverage_headers
        self.signature_headers = scaffold_stats.signature_headers

        # gather the raw per-scaffold values of every binned genome
        total_bp = defaultdict(int)
        lengths = defaultdict(list)
        gc_values = defaultdict(list)
        cov_profiles = defaultdict(list)
        signatures = defaultdict(list)
        for entry in scaffold_stats.stats.values():
            gid = entry.genome_id
            if gid == scaffold_stats.unbinned:
                continue

            total_bp[gid] += entry.length
            lengths[gid].append(entry.length)
            gc_values[gid].append(entry.gc)
            cov_profiles[gid].append(entry.coverage)
            signatures[gid].append(entry.signature)

        # record statistics for each genome
        signature_calc = GenomicSignature(0)

        self.genome_stats = {}
        for gid in total_bp:
            # scaffold lengths act as the weights for all statistics
            weights = np_array(lengths[gid])

            len_array = np_array(lengths[gid])
            mean_len = ws.numpy_weighted_mean(len_array, weights)
            median_len = ws.numpy_weighted_median(len_array, weights)

            gc_array = np_array(gc_values[gid])
            mean_gc = ws.numpy_weighted_mean(gc_array, weights)
            median_gc = ws.numpy_weighted_median(gc_array, weights)

            # transposed so each row holds one coverage column
            cov_array = np_array(cov_profiles[gid]).T
            mean_cov = ws.numpy_weighted_mean(cov_array, weights)
            median_cov = [
                ws.numpy_weighted_median(cov_array[row, :], weights)
                for row in range(cov_array.shape[0])
            ]

            signature_array = np_array(signatures[gid]).T
            mean_signature = ws.numpy_weighted_mean(signature_array, weights)

            # distance of every scaffold signature to the genome mean
            td = [
                signature_calc.manhattan(
                    scaffold_stats.stats[sid].signature, mean_signature)
                for sid in scaffold_stats.scaffolds_in_genome[gid]
            ]

            self.genome_stats[gid] = self.GenomeStats(
                total_bp[gid], mean_len, median_len, mean_gc, median_gc,
                mean_cov, median_cov, mean_signature, np_mean(td),
                np_median(td))

        return self.genome_stats
Beispiel #7
0
class Tetranucleotide(object):
    """Calculate tetranucleotide signature of sequences."""

    def __init__(self, cpus=1):
        """Initialization.

        Parameters
        ----------
        cpus : int
            Number of cpus to use.
        """
        self.logger = logging.getLogger('timestamp')

        self.cpus = cpus

        self.signatures = GenomicSignature(4)

    def canonical_order(self):
        """Canonical order of tetranucleotides."""
        return self.signatures.canonical_order()

    def _producer(self, seq_info):
        """Calculate tetranucleotide signature of a sequence.

        Parameters
        ----------
        seq_info : (str, str)
            Unique id of the sequence and the sequence in nucleotide space.

        Returns
        -------
        str
            Unique id of sequence.
        list
            Relative frequency of each kmer in the canonical order. All
            frequencies are 0.0 when no kmer was counted for the sequence.
        """

        seq_id, seq = seq_info

        sig = self.signatures.seq_signature(seq)

        total_kmers = sum(sig)
        if total_kmers == 0:
            # nothing was counted; avoid a ZeroDivisionError
            return (seq_id, [0.0] * len(sig))

        return (seq_id, [float(count) / total_kmers for count in sig])

    def _consumer(self, produced_data, consumer_data):
        """Consume results from producer processes.

        Parameters
        ----------
        produced_data : (str, list)
            Sequence id and its tetranucleotide signature, as returned
            by _producer.
        consumer_data : d[seq_id] -> tetranucleotide signature
            Signatures collected so far, or None on the first call.

        Returns
        -------
        consumer_data: dict
            The consumer data structure with the new signature added.
        """

        if consumer_data is None:
            consumer_data = {}

        seq_id, sig = produced_data
        consumer_data[seq_id] = sig

        return consumer_data

    def _progress(self, processed_items, total_items):
        """Report progress of consumer processes.

        Parameters
        ----------
        processed_items : int
            Number of sequences processed.
        total_items : int
            Total number of sequences to process.

        Returns
        -------
        str
            String indicating progress of data processing, or None when
            the logger reports that it is silent.
        """

        if self.logger.is_silent:
            return None

        return '  Finished processing %d of %d (%.2f%%) sequences.' % (
            processed_items, total_items,
            float(processed_items) * 100 / total_items)

    def run(self, seq_file):
        """Calculate tetranucleotide signatures of sequences.

        Parameters
        ----------
        seq_file : str
            Name of fasta/q file to read.

        Returns
        -------
        dict : d[seq_id] -> tetranucleotide signature in canonical order
            Relative frequency of each kmer.
        """

        self.logger.info(
            'Calculating tetranucleotide signature for each sequence:')

        parallel = Parallel(self.cpus)
        seq_signatures = parallel.run_seqs_file(self._producer, self._consumer,
                                                seq_file, self._progress)

        return seq_signatures

    def read(self, signature_file):
        """Read tetranucleotide signatures.

        The columns of the file may be in any order; values are returned
        in sorted kmer order, which must equal the canonical order. Blank
        lines are ignored. The process is terminated with exit status 1
        if the file cannot be opened or parsed.

        Parameters
        ----------
        signature_file : str
            Name of file to read.

        Returns
        -------
        dict : d[seq_id] -> tetranucleotide signature in canonical order
            Value of each kmer column as a float.
        """

        try:
            sig = {}
            with open(signature_file) as f:
                header = f.readline().split('\t')
                kmer_order = [x.strip().upper() for x in header[1:]]

                expected_order = self.canonical_order()
                if len(kmer_order) != len(expected_order):
                    raise ParsingError(
                        "[Error] Tetranucleotide file must contain exactly %d tetranucleotide columns."
                        % len(expected_order))

                # column indices that put the file's kmers in sorted order
                canonical_order_index = np.argsort(kmer_order)
                canonical_order = [
                    kmer_order[i] for i in canonical_order_index
                ]

                if canonical_order != expected_order:
                    raise ParsingError(
                        "[Error] Failed to process tetranucleotide signature file: "
                        + signature_file)

                for line in f:
                    if not line.strip():
                        continue

                    line_split = line.split('\t')
                    sig[line_split[0]] = [
                        float(line_split[i + 1]) for i in canonical_order_index
                    ]

            return sig
        except IOError:
            # single-argument print() behaves identically on Python 2 and 3
            print('[Error] Failed to open signature file: %s' % signature_file)
            sys.exit(1)
        except ParsingError as e:
            # report why parsing failed instead of exiting silently
            print(e)
            sys.exit(1)

    def write(self, signatures, output_file):
        """Write tetranucleotide signatures.

        Parameters
        ----------
        signatures : d[seq_id] -> tetranucleotide signature in canonical order
            Value of each kmer for every sequence.
        output_file : str
            Name of output file.
        """

        # the context manager closes the file even if a write fails
        with open(output_file, 'w') as fout:
            header = ['Scaffold id'] + list(self.canonical_order())
            fout.write('\t'.join(header) + '\n')

            for seq_id, tetra_signature in signatures.items():
                fout.write(seq_id + '\t')
                fout.write('\t'.join(map(str, tetra_signature)))
                fout.write('\n')
Beispiel #8
0
class Tetranucleotide(object):
    """Calculate tetranucleotide signature of sequences."""

    def __init__(self, cpus=1):
        """Initialization.

        Parameters
        ----------
        cpus : int
            Number of cpus to use.
        """
        self.logger = logging.getLogger()

        self.cpus = cpus

        self.signatures = GenomicSignature(4)

    def canonical_order(self):
        """Canonical order of tetranucleotides."""
        return self.signatures.canonical_order()

    def _producer(self, seq_info):
        """Calculate tetranucleotide signature of a sequence.

        Parameters
        ----------
        seq_info : (str, str)
            Unique id of the sequence and the sequence in nucleotide space.

        Returns
        -------
        str
            Unique id of sequence.
        list
            Relative frequency of each kmer in the canonical order. All
            frequencies are 0.0 when no kmer was counted for the sequence.
        """

        seq_id, seq = seq_info

        sig = self.signatures.seq_signature(seq)

        total_kmers = sum(sig)
        if total_kmers == 0:
            # nothing was counted; avoid a ZeroDivisionError
            return (seq_id, [0.0] * len(sig))

        return (seq_id, [float(count) / total_kmers for count in sig])

    def _consumer(self, produced_data, consumer_data):
        """Consume results from producer processes.

        Parameters
        ----------
        produced_data : (str, list)
            Sequence id and its tetranucleotide signature, as returned
            by _producer.
        consumer_data : d[seq_id] -> tetranucleotide signature
            Signatures collected so far, or None on the first call.

        Returns
        -------
        consumer_data: dict
            The consumer data structure with the new signature added.
        """

        if consumer_data is None:
            consumer_data = {}

        seq_id, sig = produced_data
        consumer_data[seq_id] = sig

        return consumer_data

    def _progress(self, processed_items, total_items):
        """Report progress of consumer processes.

        Parameters
        ----------
        processed_items : int
            Number of sequences processed.
        total_items : int
            Total number of sequences to process.

        Returns
        -------
        str
            String indicating progress of data processing.
        """

        percent_done = float(processed_items) * 100 / total_items
        return '    Finished processing %d of %d (%.2f%%) sequences.' % (
            processed_items, total_items, percent_done)

    def run(self, seq_file):
        """Calculate tetranucleotide signatures of sequences.

        Parameters
        ----------
        seq_file : str
            Name of fasta/q file to read.

        Returns
        -------
        dict : d[seq_id] -> tetranucleotide signature in canonical order
            Relative frequency of each kmer.
        """

        self.logger.info('  Calculating tetranucleotide signature for each sequence:')

        parallel = Parallel(self.cpus)
        seq_signatures = parallel.run_seqs_file(self._producer, self._consumer,
                                                seq_file, self._progress)

        return seq_signatures

    def read(self, signature_file):
        """Read tetranucleotide signatures.

        The columns of the file may be in any order; values are returned
        in sorted kmer order, which must equal the canonical order. Blank
        lines are ignored. The process is terminated with exit status 1
        if the file cannot be opened or parsed.

        Parameters
        ----------
        signature_file : str
            Name of file to read.

        Returns
        -------
        dict : d[seq_id] -> tetranucleotide signature in canonical order
            Value of each kmer column as a float.
        """

        try:
            sig = {}
            with open(signature_file) as f:
                header = f.readline().split('\t')
                kmer_order = [x.strip().upper() for x in header[1:]]

                expected_order = self.canonical_order()
                if len(kmer_order) != len(expected_order):
                    raise ParsingError("[Error] Tetranucleotide file must contain exactly %d tetranucleotide columns." % len(expected_order))

                # column indices that put the file's kmers in sorted order
                canonical_order_index = np.argsort(kmer_order)
                canonical_order = [kmer_order[i] for i in canonical_order_index]

                if canonical_order != expected_order:
                    raise ParsingError("[Error] Failed to process tetranucleotide signature file: " + signature_file)

                for line in f:
                    if not line.strip():
                        continue

                    line_split = line.split('\t')
                    sig[line_split[0]] = [float(line_split[i + 1]) for i in canonical_order_index]

            return sig
        except IOError:
            # single-argument print() behaves identically on Python 2 and 3
            print('[Error] Failed to open signature file: %s' % signature_file)
            sys.exit(1)
        except ParsingError as e:
            # report why parsing failed instead of exiting silently
            print(e)
            sys.exit(1)

    def write(self, signatures, output_file):
        """Write tetranucleotide signatures.

        Parameters
        ----------
        signatures : d[seq_id] -> tetranucleotide signature in canonical order
            Value of each kmer for every sequence.
        output_file : str
            Name of output file.
        """

        # the context manager closes the file even if a write fails
        with open(output_file, 'w') as fout:
            header = ['Scaffold id'] + list(self.canonical_order())
            fout.write('\t'.join(header) + '\n')

            for seq_id, tetra_signature in signatures.items():
                fout.write(seq_id + '\t')
                fout.write('\t'.join(map(str, tetra_signature)))
                fout.write('\n')
Beispiel #9
0
    def outlier_info(self,
                        genome_id,
                        scaffold_ids,
                        scaffold_stats,
                        genome_stats,
                        gc_per,
                        td_per,
                        cov_corr,
                        cov_perc):
        """Determine which scaffolds are outliers relative to a genome.

        Each scaffold is compared to the genome on GC content,
        tetranucleotide distance (TD), coverage profile correlation, and
        mean absolute percent error of the coverage profile. The coverage
        correlation check is skipped when the genome has a single coverage
        value, and both coverage checks are skipped when the genome has no
        coverage information.

        Parameters
        ----------
        genome_id : str
            Genome the scaffolds are compared against.
        scaffold_ids : iterable of str
            Scaffolds to evaluate. An id containing '-#' is looked up in
            `scaffold_stats` by the text before the last '-#', but is
            reported under its full id.
        scaffold_stats : ScaffoldStats
            Statistics for individual scaffolds.
        genome_stats : d[genome_id] -> GenomeStats
            Statistics for individual genomes.
        gc_per : int
            Percentile for identifying GC outliers.
        td_per : int
            Percentile for identifying TD outliers.
        cov_corr : float
            Scaffolds with a coverage correlation below this are outliers.
        cov_perc : float
            Scaffolds with a mean absolute percent coverage error above
            this are outliers.

        Returns
        -------
        dict : d[scaffold_id] -> OutlierInfo
            Statistics and bounds used to evaluate each scaffold.
        defaultdict(list) : d[scaffold_id] -> list of str
            Distributions ('GC', 'TD', 'COV_CORR', 'COV_PERC') in which
            the scaffold is an outlier; scaffolds that are not outliers
            have no entry.
        """

        genomic_signature = GenomicSignature(0)

        # make sure distributions have been loaded
        self.read_distributions()

        # find keys into GC and TD distributions
        # gc -> [mean GC][scaffold length][percentile]
        # td -> [scaffold length][percentile]
        gs = genome_stats[genome_id]
        closest_gc = find_nearest(list(self.gc_dist.keys()), gs.median_gc / 100.0)
        gc_len_dist = self.gc_dist[closest_gc]

        # the key lists do not depend on the scaffold, so build them once
        gc_seq_lens = list(gc_len_dist.keys())
        gc_percentiles = list(gc_len_dist[gc_seq_lens[0]].keys())
        gc_lower_bound_key = find_nearest(gc_percentiles, (100 - gc_per) / 2.0)
        gc_upper_bound_key = find_nearest(gc_percentiles, (100 + gc_per) / 2.0)

        td_seq_lens = list(self.td_dist.keys())
        td_bound_key = find_nearest(list(self.td_dist[td_seq_lens[0]].keys()), td_per)

        outlying_stats = {}
        outlying_dists = defaultdict(list)
        for scaffold_id in scaffold_ids:
            # statistics are stored under the id without any '-#' suffix
            base_scaffold_id = scaffold_id
            if '-#' in scaffold_id:
                base_scaffold_id = base_scaffold_id[0:base_scaffold_id.rfind('-#')]
            stats = scaffold_stats.stats[base_scaffold_id]

            # find GC and TD bounds
            closest_seq_len = find_nearest(gc_seq_lens, stats.length)
            gc_lower_bound = gc_len_dist[closest_seq_len][gc_lower_bound_key]
            gc_upper_bound = gc_len_dist[closest_seq_len][gc_upper_bound_key]

            closest_seq_len = find_nearest(td_seq_lens, stats.length)
            td_bound = self.td_dist[closest_seq_len][td_bound_key]

            # find changes from median
            delta_gc = (stats.gc - gs.median_gc) / 100.0
            delta_td = genomic_signature.manhattan(stats.signature, gs.mean_signature)

            # determine if scaffold is an outlier
            if delta_gc < gc_lower_bound or delta_gc > gc_upper_bound:
                outlying_dists[scaffold_id].append('GC')

            if delta_td > td_bound:
                outlying_dists[scaffold_id].append('TD')

            # care is required for coverage, since this information
            # is not always provided
            if len(gs.median_coverage) >= 1:
                # there is coverage information
                mean_genome_cov = np_mean(gs.median_coverage)

                if len(stats.coverage) == 0:
                    # however, this scaffold has no reported
                    # coverage so flag it as a likely outlier
                    mean_scaffold_cov = 0
                    corr_r = -1000
                    mean_cp_err = -1000
                    outlying_dists[scaffold_id].append('COV_CORR')
                    outlying_dists[scaffold_id].append('COV_PERC')
                else:
                    mean_scaffold_cov = np_mean(stats.coverage)

                    corr_r = 1.0
                    if len(gs.median_coverage) > 1:
                        try:
                            corr_r, _corr_p = pearsonr(gs.median_coverage, stats.coverage)
                        except Exception:
                            # best effort: a failed correlation leaves corr_r
                            # at 1.0 and is only logged; unlike a bare except,
                            # KeyboardInterrupt and SystemExit still propagate
                            self.logger.warning('Failed to calculate Pearson correlation for %s.' % scaffold_id)
                            if sum(gs.median_coverage) == 0:
                                self.logger.warning('Median coverage of %s is zero across all samples.' % genome_id)
                            if sum(stats.coverage) == 0:
                                self.logger.warning('Contig %s has zero coverage across all samples.' % scaffold_id)
                        else:
                            if corr_r < cov_corr:
                                outlying_dists[scaffold_id].append('COV_CORR')

                    # absolute percent error per sample; the denominator is
                    # floored at min_required_coverage
                    mean_cp_err = np_mean([
                        abs(cov_scaffold - cov_genome) * 100.0 / max(cov_genome, self.min_required_coverage)
                        for cov_genome, cov_scaffold in zip(gs.median_coverage, stats.coverage)
                    ])
                    if mean_cp_err > cov_perc:
                        outlying_dists[scaffold_id].append('COV_PERC')
            else:
                # no coverage information was provided
                mean_genome_cov = 0
                mean_scaffold_cov = 0
                corr_r = 1.0
                mean_cp_err = 0.0

            outlying_stats[scaffold_id] = self.OutlierInfo(stats.length,
                                                            stats.gc,
                                                            gs.median_gc,
                                                            gs.median_gc + gc_lower_bound * 100,
                                                            gs.median_gc + gc_upper_bound * 100,
                                                            delta_td,
                                                            gs.median_td,
                                                            td_bound,
                                                            mean_scaffold_cov,
                                                            mean_genome_cov,
                                                            corr_r,
                                                            mean_cp_err)

        return outlying_stats, outlying_dists
Beispiel #10
0
    def compatible(self, scaffolds_of_interest,
                        scaffold_stats,
                        genome_stats,
                        gc_per, td_per,
                        cov_corr, cov_perc,
                        report_type, output_file):
        """Identify scaffolds with compatible genomic characteristics.

        Compatible scaffolds are identified based on GC content,
        tetranucleotide signatures, coverage profile correlation, and
        mean absolute percent error of coverage profile. The coverage correlation
        check is ignored if the coverage profile consists of a single value.
        Every scaffold of interest is compared against every genome and
        the matches are written to a tab-separated file.

        Parameters
        ----------
        scaffolds_of_interest : d[scaffold_id] -> [no. genes, perc. genes with homology]
            Scaffolds to consider for compatibility.
        scaffold_stats : ScaffoldStats
            Statistics for individual scaffolds to check.
        genome_stats : GenomeStats
            Statistics for individual genomes.
        gc_per : int
            Percentile for identifying GC outliers.
        td_per : int
            Percentile for identifying TD outliers.
        cov_corr : int
            Correlation for identifying divergent coverage profiles.
        cov_perc : int
            Mean absolute percent error for identifying divergent coverage profiles.
        report_type : str
            Report scaffolds that are compatible in 'any' distribution
            (at least one) or 'all' (at least three).
        output_file : str
            Name of output file.
        """

        # read reference distributions from file
        self.logger.info('')
        self.logger.info('  Reading reference distributions.')
        self.gc_dist = self._read_distribution('gc_dist')
        self.td_dist = self._read_distribution('td_dist')

        # identify compatible scaffolds in each genome; the context manager
        # closes the report even if processing fails part way through
        with open(output_file, 'w') as fout:
            fout.write('Scaffold id\tGenome id\tScaffold length (bp)\tCompatible distributions')
            fout.write('\tScaffold GC\tMean genome GC\tLower GC bound (%s%%)\tUpper GC bound (%s%%)' % (gc_per, gc_per))
            fout.write('\tScaffold TD\tMean genome TD\tUpper TD bound (%s%%)' % td_per)
            fout.write('\tMean scaffold coverage\tMean genome coverage\tCoverage correlation\tMean coverage error')
            fout.write('\t# genes\t% genes with homology\n')

            genomic_signature = GenomicSignature(0)

            # key lists are materialised with list() so the lookups behave
            # the same on Python 2 and Python 3 (where keys() is a view)
            gc_means = list(self.gc_dist.keys())
            td_seq_lens = list(self.td_dist.keys())

            self.logger.info('  Identifying scaffolds compatible with bins.')
            num_scaffolds = len(scaffold_stats.stats)
            processed_scaffolds = 0
            for scaffold_id, ss in scaffold_stats.stats.items():
                processed_scaffolds += 1
                sys.stdout.write('    Processed %d of %d (%.1f%%) scaffolds.\r' % (processed_scaffolds,
                                                                             num_scaffolds,
                                                                             processed_scaffolds * 100.0 / num_scaffolds))
                sys.stdout.flush()

                if scaffold_id not in scaffolds_of_interest:
                    continue

                # the TD percentile key depends only on td_per
                td_bound_key = find_nearest(list(self.td_dist[td_seq_lens[0]].keys()), td_per)

                for genome_id, gs in genome_stats.items():
                    # find keys into GC and TD distributions
                    # gc -> [mean GC][scaffold length][percentile]
                    # td -> [scaffold length][percentile]
                    closest_gc = find_nearest(gc_means, gs.mean_gc / 100.0)
                    gc_len_dist = self.gc_dist[closest_gc]
                    gc_seq_lens = list(gc_len_dist.keys())
                    gc_percentiles = list(gc_len_dist[gc_seq_lens[0]].keys())
                    gc_lower_bound_key = find_nearest(gc_percentiles, (100 - gc_per) / 2.0)
                    gc_upper_bound_key = find_nearest(gc_percentiles, (100 + gc_per) / 2.0)

                    # find GC and TD bounds
                    closest_seq_len = find_nearest(gc_seq_lens, ss.length)
                    gc_lower_bound = gc_len_dist[closest_seq_len][gc_lower_bound_key]
                    gc_upper_bound = gc_len_dist[closest_seq_len][gc_upper_bound_key]

                    closest_seq_len = find_nearest(td_seq_lens, ss.length)
                    td_bound = self.td_dist[closest_seq_len][td_bound_key]

                    # find changes from mean
                    delta_gc = (ss.gc - gs.mean_gc) / 100.0
                    delta_td = genomic_signature.manhattan(ss.signature, gs.mean_signature)

                    # determine if scaffold compatible
                    compatible_dists = []
                    if gc_lower_bound <= delta_gc <= gc_upper_bound:
                        compatible_dists.append('GC')

                    if delta_td <= td_bound:
                        compatible_dists.append('TD')

                    corr_r = 1.0
                    if len(gs.mean_coverage) > 1:
                        corr_r, _corr_p = pearsonr(gs.mean_coverage, ss.coverage)
                        if corr_r >= cov_corr:
                            compatible_dists.append('COV_CORR')

                    # only samples where the genome has sufficient coverage
                    # contribute to the percent error
                    cp_errors = [abs(cov_genome - cov_scaffold) * 100.0 / cov_genome
                                 for cov_genome, cov_scaffold in zip(gs.mean_coverage, ss.coverage)
                                 if cov_genome >= self.min_required_coverage]

                    if cp_errors:
                        mean_cp = np_mean(cp_errors)
                        if mean_cp <= cov_perc:
                            compatible_dists.append('COV_PERC')
                    else:
                        # no sample has enough coverage to judge; report the
                        # -1 sentinel instead of taking the mean of an empty
                        # list (nan plus a runtime warning)
                        mean_cp = -1

                    # report compatible scaffolds
                    if (report_type == 'any' and len(compatible_dists) >= 1) or (report_type == 'all' and len(compatible_dists) >= 3):
                        fout.write('%s\t%s\t%s\t%s' % (scaffold_id, genome_id, ss.length, ','.join(compatible_dists)))
                        fout.write('\t%.2f\t%.2f\t%.2f\t%.2f' % (ss.gc, gs.mean_gc, gs.mean_gc + gc_lower_bound * 100, gs.mean_gc + gc_upper_bound * 100))
                        fout.write('\t%.3f\t%.3f\t%.3f' % (delta_td, gs.mean_td, td_bound))
                        fout.write('\t%.2f\t%.2f\t%.2f\t%.2f' % (np_mean(ss.coverage), np_mean(gs.mean_coverage), corr_r, mean_cp))
                        fout.write('\t%d\t%.1f' % (scaffolds_of_interest[scaffold_id][0], scaffolds_of_interest[scaffold_id][1]))
                        fout.write('\n')

            sys.stdout.write('\n')
Beispiel #11
0
    def identify(self, scaffold_stats, genome_stats,
                        gc_per, td_per,
                        cov_corr, cov_perc,
                        report_type, output_file):
        """Identify scaffolds with divergent genomic characteristics.

        Outliers are identified independently based on GC content,
        tetranucleotide signatures, coverage profile correlation, and
        mean absolute percent error of coverage profile. The coverage correlation
        check is ignored if the coverage profile consists of a single value.

        Parameters
        ----------
        scaffold_stats : ScaffoldStats
            Statistics for individual scaffolds.
        genome_stats : GenomeStats
            Statistics for individual genomes.
        gc_per : int.
            Percentile for identifying GC outliers
        td_per : int
            Percentile for identifying TD outliers.
        cov_corr : int
            Correlation for identifying divergent coverage profiles.
        cov_perc : int
            Mean absolute percent error for identifying divergent coverage profiles.
        report_type : str
            Report scaffolds that are outliers in 'all' or 'any' distribution.
        output_file : str
            Name of output file.
        """

        # read reference distributions from file
        self.logger.info('  Reading reference distributions.')
        self.gc_dist = self._read_distribution('gc_dist')
        self.td_dist = self._read_distribution('td_dist')

        # identify outliers in each genome
        fout = open(output_file, 'w')
        fout.write('Scaffold id\tGenome id\tScaffold length (bp)\tOutlying distributions')
        fout.write('\tScaffold GC\tMean genome GC\tLower GC bound (%s%%)\tUpper GC bound (%s%%)' % (gc_per, gc_per))
        fout.write('\tScaffold TD\tMean genome TD\tUpper TD bound (%s%%)' % td_per)
        fout.write('\tMean scaffold coverage\tMean genome coverage\tCoverage correlation\tMean coverage error\n')

        genomic_signature = GenomicSignature(0)

        processed_genomes = 0
        for genome_id, scaffold_ids in scaffold_stats.scaffolds_in_genome.iteritems():
            processed_genomes += 1

            sys.stdout.write('    Finding outliers in %d of %d (%.1f%%) genomes.\r' % (processed_genomes,
                                                                                     scaffold_stats.num_genomes(),
                                                                                     processed_genomes * 100.0 / scaffold_stats.num_genomes()))
            sys.stdout.flush()

            # find keys into GC and TD distributions
            # gc -> [mean GC][scaffold length][percentile]
            # td -> [scaffold length][percentile]
            gs = genome_stats[genome_id]
            closest_gc = find_nearest(self.gc_dist.keys(), gs.mean_gc / 100.0)
            sample_seq_len = self.gc_dist[closest_gc].keys()[0]
            d = self.gc_dist[closest_gc][sample_seq_len]
            gc_lower_bound_key = find_nearest(d.keys(), (100 - gc_per) / 2.0)
            gc_upper_bound_key = find_nearest(d.keys(), (100 + gc_per) / 2.0)

            td_bound_key = find_nearest(self.td_dist[self.td_dist.keys()[0]].keys(), td_per)

            for scaffold_id in scaffold_ids:
                stats = scaffold_stats.stats[scaffold_id]

                # find GC and TD bounds
                closest_seq_len = find_nearest(self.gc_dist[closest_gc].keys(), stats.length)
                gc_lower_bound = self.gc_dist[closest_gc][closest_seq_len][gc_lower_bound_key]
                gc_upper_bound = self.gc_dist[closest_gc][closest_seq_len][gc_upper_bound_key]

                closest_seq_len = find_nearest(self.td_dist.keys(), stats.length)
                td_bound = self.td_dist[closest_seq_len][td_bound_key]

                # find changes from mean
                delta_gc = (stats.gc - gs.mean_gc) / 100.0
                delta_td = genomic_signature.manhattan(stats.signature, gs.mean_signature)

                # determine if scaffold is an outlier
                outlying_dists = []
                if delta_gc < gc_lower_bound or delta_gc > gc_upper_bound:
                    outlying_dists.append('GC')

                if delta_td > td_bound:
                    outlying_dists.append('TD')

                corr_r = 1.0
                if len(gs.mean_coverage) > 1:
                    corr_r, _corr_p = pearsonr(gs.mean_coverage, stats.coverage)
                    if  corr_r < cov_corr:
                        outlying_dists.append('COV_CORR')

                mean_cp = []
                for cov_genome, cov_scaffold in itertools.izip(gs.mean_coverage, stats.coverage):
                    if cov_genome >= self.min_required_coverage:
                        mean_cp.append(abs(cov_scaffold - cov_genome) * 100.0 / cov_genome)

                if len(mean_cp) == 0:
                    # genome has zero coverage which is general
                    # will indicate something is wrong
                    mean_cp = -1
                    outlying_dists.append('COV_PERC')
                else:
                    mean_cp = np_mean(mean_cp)
                    if mean_cp > cov_perc:
                        outlying_dists.append('COV_PERC')

                # report outliers
                if (report_type == 'any' and len(outlying_dists) >= 1) or (report_type == 'all' and len(outlying_dists) >= 3):
                    fout.write('%s\t%s\t%s\t%s' % (scaffold_id, genome_id, stats.length, ','.join(outlying_dists)))
                    fout.write('\t%.2f\t%.2f\t%.2f\t%.2f' % (stats.gc, gs.mean_gc, gs.mean_gc + gc_lower_bound * 100, gs.mean_gc + gc_upper_bound * 100))
                    fout.write('\t%.3f\t%.3f\t%.3f' % (delta_td, gs.mean_td, td_bound))
                    fout.write('\t%.2f\t%.2f\t%.2f\t%.2f' % (np_mean(stats.coverage), np_mean(gs.mean_coverage), corr_r, mean_cp))
                    fout.write('\n')

        sys.stdout.write('\n')
        fout.close()
Beispiel #12
0
    def kmeans(self, scaffold_stats, num_clusters, num_components, K,
               no_coverage, no_pca, iterations, genome_file, output_dir):
        """Cluster the scaffolds of a genome with k-means.

        Each scaffold is described by its mean coverage (unless
        `no_coverage` is set) followed by its genomic signature (unless
        `K` is 0), the latter optionally reduced to its leading principal
        components. One FASTA file, <genome_id>_c<N>.fna, is written to
        `output_dir` for each cluster.

        Parameters
        ----------
        scaffold_stats : ScaffoldStats
            Statistics for individual scaffolds.
        num_clusters : int
            Number of clusters to form.
        num_components : int
            Number of PCA components to consider.
        K : int
            K-mer size to use for calculating genomic signature. A value
            of 0 excludes the signature from clustering; a value of 4
            reuses the signature already stored in `scaffold_stats`.
        no_coverage : boolean
            If True, coverage information is left out of the clustering.
        no_pca : boolean
            If True, the complete genomic signature is used instead of
            its principal components.
        iterations : int
            Iterations to perform during clustering.
        genome_file : str
            Sequences being clustered.
        output_dir : str
            Directory to write results.
        """

        # build one feature vector per scaffold
        self.logger.info('Determining mean coverage and genomic signatures.')
        signatures = GenomicSignature(K)
        genome_stats = []
        signature_matrix = []
        seqs = seq_io.read(genome_file)

        # Freeze the scaffold order once: cluster labels are positional, so
        # they must be mapped back through this exact list. Indexing
        # dict.keys() directly is not possible on Python 3.
        seq_ids = list(seqs.keys())
        for seq_id in seq_ids:
            stats = scaffold_stats.stats[seq_id]

            if not no_coverage:
                genome_stats.append(np_mean(stats.coverage))
            else:
                genome_stats.append(())

            if K == 0:
                # genomic signature is not part of the feature vector
                pass
            elif K == 4:
                # signature for K = 4 is taken from the scaffold statistics
                signature_matrix.append(stats.signature)
            else:
                # normalize k-mer counts to frequencies
                sig = signatures.seq_signature(seqs[seq_id])
                total_kmers = sum(sig)
                signature_matrix.append([float(count) / total_kmers
                                         for count in sig])

        # calculate PCA of signatures
        if K != 0:
            if not no_pca:
                self.logger.info('Calculating PCA of genomic signatures.')
                pc, variance = self.pca(signature_matrix)
                self.logger.info(
                    'First {:,} PCs capture {:.1f}% of the variance.'.format(
                        num_components,
                        sum(variance[0:num_components]) * 100))

                for i, stats in enumerate(genome_stats):
                    genome_stats[i] = np_append(stats, pc[i][0:num_components])
            else:
                self.logger.info('Using complete genomic signature.')
                for i, stats in enumerate(genome_stats):
                    genome_stats[i] = np_append(stats, signature_matrix[i])

        # whiten data if feature matrix contains coverage and genomic signature data
        if not no_coverage and K != 0:
            self.logger.info('Whitening data.')
            genome_stats = whiten(genome_stats)
        else:
            genome_stats = np_array(genome_stats)

        # cluster
        self.logger.info(
            'Partitioning genome into {:,} clusters.'.format(num_clusters))

        # Retry until kmeans2 succeeds; a ClusterError simply triggers
        # another attempt.
        # NOTE(review): there is no retry limit, so this never terminates
        # if every attempt fails (e.g. more clusters than distinct points).
        while True:
            try:
                _centroids, labels = kmeans2(genome_stats,
                                             num_clusters,
                                             iterations,
                                             minit='points',
                                             missing='raise')
                break
            except ClusterError:
                continue

        for k in range(num_clusters):
            self.logger.info('Placed {:,} sequences in cluster {:,}.'.format(
                sum(labels == k), (k + 1)))

        # write out clusters
        genome_id = remove_extension(genome_file)
        for k in range(num_clusters):
            cluster_file = os.path.join(output_dir,
                                        genome_id + '_c%d' % (k + 1) + '.fna')
            with open(cluster_file, 'w') as fout:
                for i in np_where(labels == k)[0]:
                    seq_id = seq_ids[i]
                    fout.write('>' + seq_id + '\n')
                    fout.write(seqs[seq_id] + '\n')
Beispiel #13
0
    def split(self, scaffold_stats, criteria1, criteria2, genome_file,
              output_dir):
        """Split genome into two based on genomic features.

        Scaffolds satisfying every criterion are written to
        <genome_id>_c1.fna and all remaining scaffolds to
        <genome_id>_c2.fna, both in `output_dir`.

        Each criterion is a Python expression mentioning one of the
        features 'gc', 'coverage', 'pc1', 'pc2' or 'pc3' (e.g. 'gc > 50').
        The feature name is replaced textually by the scaffold's value and
        the resulting expression is evaluated.

        Parameters
        ----------
        scaffold_stats : ScaffoldStats
            Statistics for individual scaffolds.
        criteria1 : str
            First criteria used for splitting genome.
        criteria2 : str
            Second criteria used for splitting genome. An empty string
            leaves the split to `criteria1` alone.
        genome_file : str
            Sequences being clustered.
        output_dir : str
            Directory to write results.

        Raises
        ------
        ValueError
            If no criteria are given, or a criteria mentions none of the
            supported features.
        """

        # Resolve, once and before any output is created, which scaffold
        # statistic each criteria refers to. The order matters: it is the
        # precedence used when a criteria mentions several feature names.
        features = ('gc', 'coverage', 'pc1', 'pc2', 'pc3')
        resolved_criteria = []
        for criteria in (criteria1, criteria2):
            if not criteria:
                continue

            for feature in features:
                if feature in criteria:
                    resolved_criteria.append((criteria, feature))
                    break
            else:
                raise ValueError('Unrecognized splitting criteria: %s' % criteria)

        if not resolved_criteria:
            raise ValueError('No splitting criteria specified.')

        seqs = seq_io.read(genome_file)

        # calculate PCA if necessary
        if 'pc' in criteria1 or 'pc' in criteria2:
            self.logger.info('Performing PCA.')
            signature_matrix = [scaffold_stats.stats[seq_id].signature
                                for seq_id in seqs]

            pc, _variance = self.pca(signature_matrix)
            for i, seq_id in enumerate(seqs):
                stats = scaffold_stats.stats[seq_id]
                stats.pc1 = pc[i][0]
                stats.pc2 = pc[i][1]
                stats.pc3 = pc[i][2]

        # split bin
        genome_id = remove_extension(genome_file)
        c1_file = os.path.join(output_dir, genome_id + '_c1.fna')
        c2_file = os.path.join(output_dir, genome_id + '_c2.fna')

        with open(c1_file, 'w') as fout1, open(c2_file, 'w') as fout2:
            for seq_id, seq in seqs.items():
                stats = scaffold_stats.stats[seq_id]

                meet_criteria = True
                for criteria, feature in resolved_criteria:
                    # NOTE(review): eval() of a caller-supplied expression.
                    # Builtins are disabled, but this is not a sandbox;
                    # only pass criteria from a trusted source.
                    expression = criteria.replace(feature,
                                                  str(getattr(stats, feature)))
                    v = eval(expression, {"__builtins__": {}})
                    meet_criteria = meet_criteria and v

                fout = fout1 if meet_criteria else fout2
                fout.write('>' + seq_id + '\n')
                fout.write(seq + '\n')
Beispiel #14
0
class KmerUsage(object):
    """Calculate kmer usage over a set of genomes.

    The implementation for calculating genomic signatures
    is not optimized for speed. As such, this class is
    useful for k <= 8.
    """

    def __init__(self, k, cpus=1):
        """Initialization.

        Parameters
        ----------
        k : int
            Size of the kmers to count.
        cpus : int
            Number of cpus to use.
        """
        self.logger = logging.getLogger()

        self.k = k
        self.cpus = cpus

        self.logger.info('  Calculating unique kmers of size k = %d.' % self.k)
        self.signatures = GenomicSignature(self.k)

    def _producer(self, genome_file):
        """Calculates kmer usage of a genome.

        Parameters
        ----------
        genome_file : str
            Fasta file containing genomic sequences.

        Returns
        -------
        str
           Unique identifier of genome: the file name without its
           directory and final extension.
        dict : d[kmer] -> count
            Occurrence of each kmer.
        """

        genome_id = os.path.splitext(ntpath.basename(genome_file))[0]

        seqs = seq_io.read_fasta(genome_file)
        kmer_usage = self.signatures.calculate(seqs)

        return (genome_id, kmer_usage)

    def _consumer(self, produced_data, consumer_data):
        """Consume results from producer processes.

        Parameters
        ----------
        produced_data : list -> [genome_id, kmer_usage]
            Unique id of a genome followed by a dictionary
            indicating its kmer usage.
        consumer_data : namedtuple or None
            Set of kmers observed across all genomes (kmer_set),
            along with the kmer usage of each genome (genome_kmer_usage).
            None on the first call, in which case an empty structure
            is created.

        Returns
        -------
        consumer_data
            The updated consumer data structure.
        """

        if consumer_data is None:
            # first result: set up the data to be returned by the consumer
            ConsumerData = namedtuple('ConsumerData', 'kmer_set genome_kmer_usage')
            consumer_data = ConsumerData(set(), dict())

        genome_id, kmer_usage = produced_data

        consumer_data.kmer_set.update(kmer_usage.keys())
        consumer_data.genome_kmer_usage[genome_id] = kmer_usage

        return consumer_data

    def _progress(self, processed_items, total_items):
        """Report progress of consumer processes.

        Parameters
        ----------
        processed_items : int
            Number of genomes processed.
        total_items : int
            Total number of genomes to process.

        Returns
        -------
        str
            String indicating progress of data processing.
        """

        percent_done = float(processed_items) * 100 / total_items
        return '    Finished processing %d of %d (%.2f%%) genomes.' % (processed_items, total_items, percent_done)

    def run(self, genome_files):
        """Calculate kmer usage over a set of genomes.

        Parameters
        ----------
        genome_files : list
            Fasta files containing genomic sequences in nucleotide space.

        Returns
        -------
        dict of dict : d[genome_id][kmer] -> count
           Kmer usage of each genome.
        set
           Set with all identified kmers.
        """

        self.logger.info('  Calculating kmer usage for each genome.')

        parallel = Parallel(self.cpus)
        consumer_data = parallel.run(self._producer, self._consumer, genome_files, self._progress)

        return consumer_data.genome_kmer_usage, consumer_data.kmer_set
Beispiel #15
0
    def plot_on_axes(self, figure,
                     genome_scaffold_stats,
                     highlight_scaffold_ids, link_scaffold_ids,
                     mean_signature, td_dist, percentiles_to_plot,
                     axes_hist, axes_scatter, tooltip_plugin):
        """Create histogram and scatterplot.

        The histogram shows the distance of each scaffold's signature from
        the genome's mean signature; the scatterplot shows that distance
        against scaffold length, overlaid with the requested percentiles
        of the reference TD distribution.

        Parameters
        ----------
        figure : matplotlib.figure
          Figure on which to render axes.
        genome_scaffold_stats: d[scaffold_id] -> namedtuple of scaffold stats
          Statistics for scaffolds in genome.
        highlight_scaffold_ids : d[scaffold_id] -> color
            Scaffolds in genome to highlight.
        link_scaffold_ids : list of scaffold pairs
            Pairs of scaffolds to link together.
        mean_signature : genomic signature
          Mean tetranucleotide signature of genome; compared against each
          scaffold's `signature` with the Manhattan distance.
        td_dist : d[length][percentile] -> critical value
          TD distribution.
        percentiles_to_plot : iterable
          Percentile values to mark on plot.
        axes_hist : matplotlib axes or None
          Axes on which to draw the histogram; skipped when falsy.
        axes_scatter : matplotlib axes
          Axes on which to draw the scatterplot.
        tooltip_plugin : boolean
          Flag indicating if the mpld3 tooltip plugin should be connected.

        Returns
        -------
        The first value returned by `self.scatter` (the scatterplot).
        """

        genomic_signature = GenomicSignature(0)

        # signature distance of every scaffold, computed once and shared by
        # the histogram (delta_tds) and the scatterplot (scaffold_stats)
        delta_tds = []
        scaffold_stats = {}
        for scaffold_id, stats in genome_scaffold_stats.items():
            delta_td = genomic_signature.manhattan(stats.signature, mean_signature)
            delta_tds.append(delta_td)
            scaffold_stats[scaffold_id] = (delta_td, stats.length / 1000.0)

        # histogram plot
        if axes_hist:
            axes_hist.hist(delta_tds, bins=20, color=(0.5, 0.5, 0.5))
            axes_hist.set_xlabel('tetranucleotide distance')
            axes_hist.set_ylabel('# scaffolds (out of %d)' % len(delta_tds))
            self.prettify(axes_hist)

        # scatterplot
        xlabel = 'tetranucleotide distance'
        ylabel = 'Scaffold length (kbp)'

        scatter, labels = self.scatter(axes_scatter,
                                         scaffold_stats,
                                         highlight_scaffold_ids,
                                         link_scaffold_ids,
                                         xlabel, ylabel)

        # remember the limits chosen for the scaffolds so the reference
        # curves drawn below cannot change them
        _, ymax = axes_scatter.get_ylim()
        xmin, xmax = axes_scatter.get_xlim()

        # plot reference distributions
        for percentile in percentiles_to_plot:
            # find closest distribution values; any window size can be used
            # to look up the available percentile keys
            first_window_size = list(td_dist.keys())[0]
            td_bound_key = find_nearest(list(td_dist[first_window_size].keys()), percentile)

            x = []
            y = []
            for window_size in td_dist:
                x.append(td_dist[window_size][td_bound_key])
                y.append(window_size / 1000.0)

            # sort by y-values
            sort_indexY = np.argsort(y)
            x = np.array(x)[sort_indexY]
            y = np.array(y)[sort_indexY]

            # make sure x-values never increase as y increases
            # as this is conservative and visually satisfying
            for i in range(0, len(x) - 1):
                for j in range(i + 1, len(x)):
                    if x[j] > x[i]:
                        if j == len(x) - 1:
                            x[j] = x[i]
                        else:
                            x[j] = (x[j - 1] + x[j + 1]) / 2  # interpolate values from neighbours

                        # interpolation may still overshoot; clamp to x[i]
                        if x[j] > x[i]:
                            x[j] = x[i]

            axes_scatter.plot(x, y, 'r--', lw=1.0, zorder=0)

        # ensure y-axis include zero and covers all sequences
        axes_scatter.set_ylim([0, ymax])

        # ensure x-axis is set appropriately for sequences
        axes_scatter.set_xlim([xmin, xmax])

        # prettify scatterplot
        self.prettify(axes_scatter)

        # tooltips plugin
        if tooltip_plugin:
            tooltip = Tooltip(scatter, labels=labels, hoffset=5, voffset=-15)
            mpld3.plugins.connect(figure, tooltip)

        return scatter
Beispiel #16
0
    def run(self, scaffold_stats, num_clusters, num_components, K, no_coverage, no_pca, iterations, genome_file, output_dir):
        """Cluster the scaffolds of a genome with k-means.

        Each scaffold is described by its mean coverage (unless
        `no_coverage` is set) followed by its genomic signature (unless
        `K` is 0), the latter optionally reduced to its leading principal
        components. One FASTA file, <genome_id>_c<N>.fna, is written to
        `output_dir` for each cluster.

        Parameters
        ----------
        scaffold_stats : ScaffoldStats
            Statistics for individual scaffolds.
        num_clusters : int
            Number of clusters to form.
        num_components : int
            Number of PCA components to consider.
        K : int
            K-mer size to use for calculating genomic signature. A value
            of 0 excludes the signature from clustering; a value of 4
            reuses the signature already stored in `scaffold_stats`.
        no_coverage : boolean
            If True, coverage information is left out of the clustering.
        no_pca : boolean
            If True, the complete genomic signature is used instead of
            its principal components.
        iterations : int
            Iterations of clustering to perform.
        genome_file : str
            Sequences being clustered.
        output_dir : str
            Directory to write results.
        """

        # build one feature vector per scaffold
        self.logger.info('')
        self.logger.info('  Determining mean coverage and genomic signatures.')
        signatures = GenomicSignature(K)
        genome_stats = []
        signature_matrix = []
        seqs = seq_io.read(genome_file)

        # Freeze the scaffold order once: cluster labels are positional, so
        # they must be mapped back through this exact list. Indexing
        # dict.keys() directly is not possible on Python 3.
        seq_ids = list(seqs.keys())
        for seq_id in seq_ids:
            stats = scaffold_stats.stats[seq_id]

            if not no_coverage:
                genome_stats.append(np_mean(stats.coverage))
            else:
                genome_stats.append(())

            if K == 0:
                # genomic signature is not part of the feature vector
                pass
            elif K == 4:
                # signature for K = 4 is taken from the scaffold statistics
                signature_matrix.append(stats.signature)
            else:
                # normalize k-mer counts to frequencies
                sig = signatures.seq_signature(seqs[seq_id])
                total_kmers = sum(sig)
                signature_matrix.append([float(count) / total_kmers for count in sig])

        # calculate PCA of genomic signatures
        if K != 0:
            if not no_pca:
                self.logger.info('  Calculating PCA of genomic signatures.')
                pc, variance = self.pca(signature_matrix)
                self.logger.info('    First %d PCs capture %.1f%% of the variance.' % (num_components, sum(variance[0:num_components]) * 100))

                for i, stats in enumerate(genome_stats):
                    genome_stats[i] = np_append(stats, pc[i][0:num_components])
            else:
                self.logger.info('  Using complete genomic signature.')
                for i, stats in enumerate(genome_stats):
                    genome_stats[i] = np_append(stats, signature_matrix[i])

        # whiten data if feature matrix contains coverage and genomic signature data
        if not no_coverage and K != 0:
            # reported through the logger like every other progress message
            self.logger.info('  Whitening data.')
            genome_stats = whiten(genome_stats)
        else:
            genome_stats = np_array(genome_stats)

        # cluster
        self.logger.info('  Partitioning genome into %d clusters.' % num_clusters)

        # Retry until kmeans2 succeeds; a ClusterError simply triggers
        # another attempt.
        # NOTE(review): there is no retry limit, so this never terminates
        # if every attempt fails (e.g. more clusters than distinct points).
        while True:
            try:
                _centroids, labels = kmeans2(genome_stats, num_clusters, iterations, minit='points', missing='raise')
                break
            except ClusterError:
                continue

        for k in range(num_clusters):
            self.logger.info('    Placed %d sequences in cluster %d.' % (sum(labels == k), (k + 1)))

        # write out clusters
        genome_id = remove_extension(genome_file)
        for k in range(num_clusters):
            cluster_file = os.path.join(output_dir, genome_id + '_c%d' % (k + 1) + '.fna')
            with open(cluster_file, 'w') as fout:
                for i in np_where(labels == k)[0]:
                    seq_id = seq_ids[i]
                    fout.write('>' + seq_id + '\n')
                    fout.write(seqs[seq_id] + '\n')
Beispiel #17
0
class KmerUsage(object):
    """Calculate kmer usage over a set of genomes.

    The implementation for calculating genomic signatures
    is not optimized for speed. As such, this class is
    useful for k <= 8.
    """

    def __init__(self, k, cpus=1):
        """Initialization.

        Parameters
        ----------
        k : int
            Size of the kmers to count.
        cpus : int
            Number of cpus to use.
        """
        self.logger = logging.getLogger('timestamp')

        self.k = k
        self.cpus = cpus

        self.logger.info('Calculating unique kmers of size k = %d.' % self.k)
        self.signatures = GenomicSignature(self.k)

    def _producer(self, genome_file):
        """Calculates kmer usage of a genome.

        Parameters
        ----------
        genome_file : str
            Fasta file containing genomic sequences.

        Returns
        -------
        str
           Unique identifier of genome: the file name without its
           directory and final extension.
        sequence of counts
            Kmer counts as returned by the signature's `counts` method;
            `_consumer` pairs them by position with `canonical_order()`.
        """

        genome_id = os.path.splitext(ntpath.basename(genome_file))[0]

        seqs = seq_io.read_fasta(genome_file)
        kmer_usage = self.signatures.counts(seqs)

        return (genome_id, kmer_usage)

    def _consumer(self, produced_data, consumer_data):
        """Consume results from producer processes.

        Parameters
        ----------
        produced_data : list -> [genome_id, kmer_usage]
            Unique id of a genome followed by its kmer counts, indexed
            in the order given by `canonical_order()`.
        consumer_data : d[genome_id][kmer] -> count, or None
            Results accumulated so far; None on the first call, in
            which case an empty structure is created.

        Returns
        -------
        consumer_data
            dictionary indicating the frequency of kmers in each genome
        """

        if consumer_data is None:
            consumer_data = defaultdict(dict)

        genome_id, kmer_usage = produced_data

        # label each positional count with its kmer
        for idx, kmer in enumerate(self.signatures.canonical_order()):
            consumer_data[genome_id][kmer] = kmer_usage[idx]

        return consumer_data

    def _progress(self, processed_items, total_items):
        """Report progress of consumer processes.

        Parameters
        ----------
        processed_items : int
            Number of genomes processed.
        total_items : int
            Total number of genomes to process.

        Returns
        -------
        str
            String indicating progress of data processing.
        """

        percent_done = float(processed_items) * 100 / total_items
        return '  Finished processing %d of %d (%.2f%%) genomes.' % (processed_items, total_items, percent_done)

    def run(self, genome_files):
        """Calculate kmer usage over a set of genomes.

        Parameters
        ----------
        genome_files : list
            Fasta files containing genomic sequences in nucleotide space.

        Returns
        -------
        dict of dict : d[genome_id][kmer] -> count
           Kmer usage of each genome.
        kmers
           Kmers in the order reported by the signature's
           `canonical_order()` method.
        """

        self.logger.info('Calculating kmer usage for each genome.')

        # no progress reporting when the logger has been silenced
        progress_func = None if self.logger.is_silent else self._progress

        parallel = Parallel(self.cpus)
        kmer_counts = parallel.run(self._producer, self._consumer, genome_files, progress_func)

        return kmer_counts, self.signatures.canonical_order()
Beispiel #18
0
    def compatible(self, scaffolds_of_interest,
                        scaffold_stats,
                        genome_stats,
                        gc_per, td_per,
                        cov_corr, cov_perc,
                        report_type, output_file):
        """Identify scaffolds with compatible genomic characteristics.

        Compatible scaffolds are identified based on GC content,
        tetranucleotide signatures, coverage profile correlation, and
        mean absolute percent error of coverage profile. The coverage correlation
        check is ignored if the coverage profile consists of a single value.

        Every scaffold of interest is compared against every genome and a
        row is written to `output_file` for each reported pair.

        Parameters
        ----------
        scaffolds_of_interest : d[scaffold_id] -> [no. genes, perc. genes with homology]
            Scaffolds to consider for compatibility.
        scaffold_stats : ScaffoldStats
            Statistics for individual scaffolds to check.
        genome_stats : GenomeStats
            Statistics for individual genomes.
        gc_per : int
            Percentile for identifying GC outliers.
        td_per : int
            Percentile for identifying TD outliers.
        cov_corr : number
            Minimum Pearson correlation between the coverage profiles of
            a scaffold and a genome for them to be considered compatible.
        cov_perc : number
            Maximum mean absolute percent error between the coverage
            profiles for them to be considered compatible.
        report_type : str
            Report scaffolds that are compatible in 'any' distribution
            (at least one) or 'all' (at least three of GC, TD, COV_CORR
            and COV_PERC).
        output_file : str
            Name of output file.
        """

        # read reference distributions from file
        self.logger.info('Reading reference distributions.')
        self.gc_dist = self._read_distribution('gc_dist')
        self.td_dist = self._read_distribution('td_dist')

        # Keys into the GC and TD distributions do not depend on the
        # scaffold being examined, so resolve them once up front.
        # gc -> [mean GC][scaffold length][percentile]
        # td -> [scaffold length][percentile]
        sample_td_len = list(self.td_dist.keys())[0]
        td_bound_key = find_nearest(list(self.td_dist[sample_td_len].keys()), td_per)

        genome_gc_keys = {}
        for genome_id, gs in genome_stats.items():
            closest_gc = find_nearest(list(self.gc_dist.keys()), gs.median_gc / 100.0)
            sample_seq_len = list(self.gc_dist[closest_gc].keys())[0]
            d = self.gc_dist[closest_gc][sample_seq_len]
            gc_lower_bound_key = find_nearest(list(d.keys()), (100 - gc_per) / 2.0)
            gc_upper_bound_key = find_nearest(list(d.keys()), (100 + gc_per) / 2.0)
            genome_gc_keys[genome_id] = (closest_gc, gc_lower_bound_key, gc_upper_bound_key)

        silent = self.logger.is_silent
        num_scaffolds = len(scaffold_stats.stats)

        # identify compatible scaffolds in each genome
        with open(output_file, 'w') as fout:
            fout.write('Scaffold id\tGenome id\tScaffold length (bp)\tCompatible distributions')
            fout.write('\tScaffold GC\tMedian genome GC\tLower GC bound (%s%%)\tUpper GC bound (%s%%)' % (gc_per, gc_per))
            fout.write('\tScaffold TD\tMedian genome TD\tUpper TD bound (%s%%)' % td_per)
            fout.write('\tScaffold coverage\tMedian genome coverage\tCoverage correlation\tCoverage error')
            fout.write('\t# genes\t% genes with homology\n')

            genomic_signature = GenomicSignature(0)

            self.logger.info('Identifying scaffolds compatible with bins.')
            processed_scaffolds = 0
            for scaffold_id, ss in scaffold_stats.stats.items():
                processed_scaffolds += 1
                if not silent:
                    sys.stdout.write('  Processed {:,} of {:,} ({:.1f}%) scaffolds.\r'.format(
                                        processed_scaffolds,
                                        num_scaffolds,
                                        processed_scaffolds * 100.0 / num_scaffolds))
                    sys.stdout.flush()

                if scaffold_id not in scaffolds_of_interest:
                    continue

                # the TD bound depends only on the scaffold length
                closest_td_len = find_nearest(list(self.td_dist.keys()), ss.length)
                td_bound = self.td_dist[closest_td_len][td_bound_key]

                for genome_id, gs in genome_stats.items():
                    closest_gc, gc_lower_bound_key, gc_upper_bound_key = genome_gc_keys[genome_id]

                    # find GC bounds
                    closest_seq_len = find_nearest(list(self.gc_dist[closest_gc].keys()), ss.length)
                    gc_lower_bound = self.gc_dist[closest_gc][closest_seq_len][gc_lower_bound_key]
                    gc_upper_bound = self.gc_dist[closest_gc][closest_seq_len][gc_upper_bound_key]

                    # find changes from genome-wide values
                    delta_gc = (ss.gc - gs.median_gc) / 100.0
                    delta_td = genomic_signature.manhattan(ss.signature, gs.mean_signature)

                    # determine if scaffold compatible
                    compatible_dists = []
                    if gc_lower_bound <= delta_gc <= gc_upper_bound:
                        compatible_dists.append('GC')

                    if delta_td <= td_bound:
                        compatible_dists.append('TD')

                    # correlation is only checked for multi-sample profiles
                    corr_r = 1.0
                    if len(gs.median_coverage) > 1:
                        corr_r, _corr_p = pearsonr(gs.median_coverage, ss.coverage)
                        if corr_r >= cov_corr:
                            compatible_dists.append('COV_CORR')

                    # absolute percent error over samples in which the
                    # genome has at least the minimum required coverage
                    perc_errors = [abs(cov_genome - cov_scaffold) * 100.0 / cov_genome
                                   for cov_genome, cov_scaffold in zip(gs.median_coverage, ss.coverage)
                                   if cov_genome >= self.min_required_coverage]

                    # With no usable sample the error is undefined. Use NaN
                    # explicitly (it never satisfies the test below) rather
                    # than taking the mean of an empty list.
                    mean_cp = np_mean(perc_errors) if perc_errors else float('nan')
                    if mean_cp <= cov_perc:
                        compatible_dists.append('COV_PERC')

                    # report compatible scaffolds
                    if (report_type == 'any' and len(compatible_dists) >= 1) or (report_type == 'all' and len(compatible_dists) >= 3):
                        fout.write('%s\t%s\t%s\t%s' % (scaffold_id, genome_id, ss.length, ','.join(compatible_dists)))
                        fout.write('\t%.2f\t%.2f\t%.2f\t%.2f' % (ss.gc, gs.median_gc, gs.median_gc + gc_lower_bound * 100, gs.median_gc + gc_upper_bound * 100))
                        fout.write('\t%.3f\t%.3f\t%.3f' % (delta_td, gs.median_td, td_bound))
                        fout.write('\t%.2f\t%.2f\t%.2f\t%.2f' % (np_mean(ss.coverage), np_mean(gs.median_coverage), corr_r, mean_cp))
                        fout.write('\t%d\t%.1f' % (scaffolds_of_interest[scaffold_id][0], scaffolds_of_interest[scaffold_id][1]))
                        fout.write('\n')

            if not silent:
                # terminate the carriage-return progress line
                sys.stdout.write('\n')
Beispiel #19
0
    def plot_on_axes(self, figure,
                     genome_scaffold_stats,
                     highlight_scaffold_ids, link_scaffold_ids,
                     mean_signature, td_dist, percentiles_to_plot,
                     axes_hist, axes_scatter, tooltip_plugin):
        """Create histogram and scatterplot.

        The histogram shows the distance of each scaffold's signature from
        the genome's mean signature; the scatterplot shows the points from
        `self.data_pts`, overlaid with the requested percentiles of the
        reference TD distribution.

        Parameters
        ----------
        figure : matplotlib.figure
          Figure on which to render axes.
        genome_scaffold_stats: d[scaffold_id] -> namedtuple of scaffold stats
          Statistics for scaffolds in genome.
        highlight_scaffold_ids : d[scaffold_id] -> color
            Scaffolds in genome to highlight.
        link_scaffold_ids : list of scaffold pairs
            Pairs of scaffolds to link together.
        mean_signature : genomic signature
          Mean tetranucleotide signature of genome; compared against each
          scaffold's `signature` with the Manhattan distance.
        td_dist : d[length][percentile] -> critical value
          TD distribution.
        percentiles_to_plot : iterable
          Percentile values to mark on plot.
        axes_hist : matplotlib axes or None
          Axes on which to draw the histogram; skipped when falsy.
        axes_scatter : matplotlib axes
          Axes on which to draw the scatterplot.
        tooltip_plugin : boolean
          Flag indicating if the mpld3 tooltip plugin should be connected.

        Returns
        -------
        tuple
          The scatterplot, x-values and y-values returned by
          `self.scatter`, followed by `self.plot_order` applied to the
          labels it returned.
        """

        # histogram plot
        genomic_signature = GenomicSignature(0)

        delta_tds = [genomic_signature.manhattan(stats.signature, mean_signature)
                     for stats in genome_scaffold_stats.values()]

        if axes_hist:
            axes_hist.hist(delta_tds, bins=20, color=(0.5, 0.5, 0.5))
            axes_hist.set_xlabel('tetranucleotide distance')
            axes_hist.set_ylabel('# scaffolds (out of %d)' % len(delta_tds))
            self.prettify(axes_hist)

        # scatterplot
        xlabel = 'tetranucleotide distance'
        ylabel = 'Scaffold length (kbp)'

        pts = self.data_pts(genome_scaffold_stats, mean_signature)

        scatter, x_pts, y_pts, plot_labels = self.scatter(axes_scatter,
                                                             pts,
                                                             highlight_scaffold_ids,
                                                             link_scaffold_ids,
                                                             xlabel, ylabel)

        # remember the limits chosen for the scaffolds so the reference
        # curves drawn below cannot change them
        _, ymax = axes_scatter.get_ylim()
        xmin, xmax = axes_scatter.get_xlim()

        # plot reference distributions
        for percentile in percentiles_to_plot:
            # find closest distribution values; any window size can be used
            # to look up the available percentile keys
            first_window_size = list(td_dist.keys())[0]
            td_bound_key = find_nearest(list(td_dist[first_window_size].keys()), percentile)

            x = []
            y = []
            for window_size in td_dist:
                x.append(td_dist[window_size][td_bound_key])
                y.append(window_size / 1000.0)

            # sort by y-values
            sort_indexY = np.argsort(y)
            x = np.array(x)[sort_indexY]
            y = np.array(y)[sort_indexY]

            # make sure x-values never increase as y increases
            # as this is conservative and visually satisfying
            for i in range(0, len(x) - 1):
                for j in range(i + 1, len(x)):
                    if x[j] > x[i]:
                        if j == len(x) - 1:
                            x[j] = x[i]
                        else:
                            x[j] = (x[j - 1] + x[j + 1]) / 2  # interpolate values from neighbours

                        # interpolation may still overshoot; clamp to x[i]
                        if x[j] > x[i]:
                            x[j] = x[i]

            axes_scatter.plot(x, y, 'r--', lw=1.0, zorder=0)

        # ensure y-axis include zero and covers all sequences
        axes_scatter.set_ylim([0, ymax])

        # ensure x-axis is set appropriately for sequences
        axes_scatter.set_xlim([xmin, xmax])

        # prettify scatterplot
        self.prettify(axes_scatter)

        # tooltips plugin
        if tooltip_plugin:
            tooltip = Tooltip(scatter, labels=plot_labels, hoffset=5, voffset=-15)
            mpld3.plugins.connect(figure, tooltip)

        return scatter, x_pts, y_pts, self.plot_order(plot_labels)