Ejemplo n.º 1
0
    def run(self, scaffold_stats, num_clusters, num_components, K, no_coverage, no_pca, iterations, genome_file, output_dir):
        """Calculate statistics for genomes.

        Parameters
        ----------
        scaffold_stats : ScaffoldStats
            Statistics for individual scaffolds.
        num_clusters : int
            Number of cluster to form.
        num_components : int
            Number of PCA components to consider.
        K : int
            K-mer size to use for calculating genomic signature.
        no_coverage : boolean
            Flag indicating if coverage information should be used during clustering.
        no_pca : boolean
            Flag indicating if PCA of genomic signature should be calculated.
        iterations : int
            Iterations of clustering to perform.
        genome_file : str
            Sequences being clustered.
        output_dir : str
            Directory to write results.
        """

        # get GC and mean coverage for each scaffold in genome
        self.logger.info('')
        self.logger.info('  Determining mean coverage and genomic signatures.')
        signatures = GenomicSignature(K)
        genome_stats = []
        signature_matrix = []
        seqs = seq_io.read(genome_file)
        for seq_id, seq in seqs.iteritems():
            stats = scaffold_stats.stats[seq_id]

            if not no_coverage:
                genome_stats.append((np_mean(stats.coverage)))
            else:
                genome_stats.append(())

            if K == 0:
                pass
            elif K == 4:
                signature_matrix.append(stats.signature)
            else:
                sig = signatures.seq_signature(seq)
                total_kmers = sum(sig)
                for i in xrange(0, len(sig)):
                    sig[i] = float(sig[i]) / total_kmers
                signature_matrix.append(sig)

        # calculate PCA of tetranucleotide signatures
        if K != 0:
            if not no_pca:
                self.logger.info('  Calculating PCA of genomic signatures.')
                pc, variance = self.pca(signature_matrix)
                self.logger.info('    First %d PCs capture %.1f%% of the variance.' % (num_components, sum(variance[0:num_components]) * 100))
    
                for i, stats in enumerate(genome_stats):
                    genome_stats[i] = np_append(stats, pc[i][0:num_components])
            else:
                self.logger.info('  Using complete genomic signature.')
                for i, stats in enumerate(genome_stats):
                    genome_stats[i] = np_append(stats, signature_matrix[i])

        # whiten data if feature matrix contains coverage and genomic signature data
        if not no_coverage and K != 0:
            print '  Whitening data.'
            genome_stats = whiten(genome_stats)
        else:
            genome_stats = np_array(genome_stats)

        # cluster
        self.logger.info('  Partitioning genome into %d clusters.' % num_clusters)

        bError = True
        while bError:
            try:
                bError = False
                _centroids, labels = kmeans2(genome_stats, num_clusters, iterations, minit='points', missing='raise')
            except ClusterError:
                bError = True

        for k in range(num_clusters):
            self.logger.info('    Placed %d sequences in cluster %d.' % (sum(labels == k), (k + 1)))

        # write out clusters
        genome_id = remove_extension(genome_file)
        for k in range(num_clusters):
            fout = open(os.path.join(output_dir, genome_id + '_c%d' % (k + 1) + '.fna'), 'w')
            for i in np_where(labels == k)[0]:
                seq_id = seqs.keys()[i]
                fout.write('>' + seq_id + '\n')
                fout.write(seqs[seq_id] + '\n')
            fout.close()
Ejemplo n.º 2
0
class Tetranucleotide(object):
    """Calculate tetranucleotide signature of sequences."""

    def __init__(self, cpus=1):
        """Initialization.

        Parameters
        ----------
        cpus : int
            Number of cpus to use.
        """
        self.logger = logging.getLogger()

        self.cpus = cpus

        self.signatures = GenomicSignature(4)
        

    def canonical_order(self):
        """Canonical order of tetranucleotides."""
        return self.signatures.canonical_order()


    def _producer(self, seq_info):
        """Calculate tetranucleotide signature of a sequence.
        Parameters
        ----------
        seq_id : str
            Unique id of sequence.
        seq : str
            Sequence in nuceltoide space.
        Returns
        -------
        str
            Unique id of sequence.
        list
            Count of each kmer in the canonical order.
        """

        seq_id, seq = seq_info

        sig = self.signatures.seq_signature(seq)

        total_kmers = sum(sig)
        for i in xrange(0, len(sig)):
            sig[i] = float(sig[i]) / total_kmers

        return (seq_id, sig)

    def _consumer(self, produced_data, consumer_data):
        """Consume results from producer processes.
         Parameters
        ----------
        produced_data : list -> kmers in the canonical order
            Tetranucleotide signature in canconical order.
        consumer_data : d[seq_id] -> tetranucleotide signature
            Set of kmers observed across all genomes (kmer_set),
            along with the kmer usage of each genome (genome_kmer_usage).
        Returns
        -------
        consumer_data: dict
            The consumer data structure or None must be returned
        """

        if consumer_data == None:
            consumer_data = {}

        seq_id, sig = produced_data
        consumer_data[seq_id] = sig

        return consumer_data

    def _progress(self, processed_items, total_items):
        """Report progress of consumer processes.

        Parameters
        ----------
        processed_items : int
            Number of sequences processed.
        total_items : int
            Total number of sequences to process.

        Returns
        -------
        str
            String indicating progress of data processing.
        """

        return '    Finished processing %d of %d (%.2f%%) sequences.' % (processed_items, total_items, float(processed_items) * 100 / total_items)

    def run(self, seq_file):
        """Calculate tetranucleotide signatures of sequences.

        Parameters
        ----------
        seq_file : str
            Name of fasta/q file to read.

        Returns
        -------
        dict : d[seq_id] -> tetranucleotide signature in canonical order
            Count of each kmer.
        """

        self.logger.info('  Calculating tetranucleotide signature for each sequence:')

        parallel = Parallel(self.cpus)
        seq_signatures = parallel.run_seqs_file(self._producer, self._consumer, seq_file, self._progress)

        return seq_signatures

    def read(self, signature_file):
        """Read tetranucleotide signatures.

        Parameters
        ----------
        signature_file : str
            Name of file to read.

        Returns
        -------
        dict : d[seq_id] -> tetranucleotide signature in canonical order
            Count of each kmer.
        """

        try:
            sig = {}
            with open(signature_file) as f:
                header = f.readline().split('\t')
                kmer_order = [x.strip().upper() for x in header[1:]]
                if len(kmer_order) != len(self.canonical_order()):
                    raise ParsingError("[Error] Tetranucleotide file must contain exactly %d tetranucleotide columns." % len(self.canonical_order()))

                canonical_order_index = np.argsort(kmer_order)
                canonical_order = [kmer_order[i] for i in canonical_order_index]

                if canonical_order != self.canonical_order():
                    raise ParsingError("[Error] Failed to process tetranucleotide signature file: " + signature_file)

                for line in f:
                    line_split = line.split('\t')
                    sig[line_split[0]] = [float(line_split[i + 1]) for i in canonical_order_index]

            return sig
        except IOError:
            print '[Error] Failed to open signature file: %s' % signature_file
            sys.exit()
        except ParsingError:
            sys.exit()

    def write(self, signatures, output_file):
        """Write tetranucleotide signatures.

        Parameters
        ----------
        signature_file : d[seq_id] -> tetranucleotide signature in canonical order
            Count of each kmer.
        output_file : str
            Name of output file.
        """
        #singlegenome.write_links(output_file)
        fout = open(output_file, 'w')

        fout.write('Scaffold id')
        for kmer in self.canonical_order():
            fout.write('\t' + kmer)
        fout.write('\n')

        for seq_id, tetra_signature in signatures.iteritems():
            fout.write(seq_id + '\t')
            fout.write('\t'.join(map(str, tetra_signature)))
            fout.write('\n')

        fout.close()
Ejemplo n.º 3
0
class Tetranucleotide(object):
    """Calculate tetranucleotide signature of sequences."""
    def __init__(self, cpus=1):
        """Initialization.

        Parameters
        ----------
        cpus : int
            Number of cpus to use.
        """
        self.logger = logging.getLogger('timestamp')

        self.cpus = cpus

        self.signatures = GenomicSignature(4)

    def canonical_order(self):
        """Canonical order of tetranucleotides."""
        return self.signatures.canonical_order()

    def _producer(self, seq_info):
        """Calculate tetranucleotide signature of a sequence.

        Parameters
        ----------
        seq_id : str
            Unique id of sequence.
        seq : str
            Sequence in nuceltoide space.

        Returns
        -------
        str
            Unique id of sequence.
        list
            Count of each kmer in the canonical order.
        """

        seq_id, seq = seq_info

        sig = self.signatures.seq_signature(seq)

        total_kmers = sum(sig)
        for i in xrange(0, len(sig)):
            sig[i] = float(sig[i]) / total_kmers

        return (seq_id, sig)

    def _consumer(self, produced_data, consumer_data):
        """Consume results from producer processes.

         Parameters
        ----------
        produced_data : list -> kmers in the canonical order
            Tetranucleotide signature in canconical order.
        consumer_data : d[seq_id] -> tetranucleotide signature
            Set of kmers observed across all genomes (kmer_set),
            along with the kmer usage of each genome (genome_kmer_usage).

        Returns
        -------
        consumer_data: dict
            The consumer data structure or None must be returned
        """

        if consumer_data == None:
            consumer_data = {}

        seq_id, sig = produced_data
        consumer_data[seq_id] = sig

        return consumer_data

    def _progress(self, processed_items, total_items):
        """Report progress of consumer processes.

        Parameters
        ----------
        processed_items : int
            Number of sequences processed.
        total_items : int
            Total number of sequences to process.

        Returns
        -------
        str
            String indicating progress of data processing.
        """

        if self.logger.is_silent:
            return None
        else:
            return '  Finished processing %d of %d (%.2f%%) sequences.' % (
                processed_items, total_items,
                float(processed_items) * 100 / total_items)

    def run(self, seq_file):
        """Calculate tetranucleotide signatures of sequences.

        Parameters
        ----------
        seq_file : str
            Name of fasta/q file to read.

        Returns
        -------
        dict : d[seq_id] -> tetranucleotide signature in canonical order
            Count of each kmer.
        """

        self.logger.info(
            'Calculating tetranucleotide signature for each sequence:')

        parallel = Parallel(self.cpus)
        seq_signatures = parallel.run_seqs_file(self._producer, self._consumer,
                                                seq_file, self._progress)

        return seq_signatures

    def read(self, signature_file):
        """Read tetranucleotide signatures.

        Parameters
        ----------
        signature_file : str
            Name of file to read.

        Returns
        -------
        dict : d[seq_id] -> tetranucleotide signature in canonical order
            Count of each kmer.
        """

        try:
            sig = {}
            with open(signature_file) as f:
                header = f.readline().split('\t')
                kmer_order = [x.strip().upper() for x in header[1:]]
                if len(kmer_order) != len(self.canonical_order()):
                    raise ParsingError(
                        "[Error] Tetranucleotide file must contain exactly %d tetranucleotide columns."
                        % len(self.canonical_order()))

                canonical_order_index = np.argsort(kmer_order)
                canonical_order = [
                    kmer_order[i] for i in canonical_order_index
                ]

                if canonical_order != self.canonical_order():
                    raise ParsingError(
                        "[Error] Failed to process tetranucleotide signature file: "
                        + signature_file)

                for line in f:
                    line_split = line.split('\t')
                    sig[line_split[0]] = [
                        float(line_split[i + 1]) for i in canonical_order_index
                    ]

            return sig
        except IOError:
            print '[Error] Failed to open signature file: %s' % signature_file
            sys.exit()
        except ParsingError:
            sys.exit()

    def write(self, signatures, output_file):
        """Write tetranucleotide signatures.

        Parameters
        ----------
        signature_file : d[seq_id] -> tetranucleotide signature in canonical order
            Count of each kmer.
        output_file : str
            Name of output file.
        """

        fout = open(output_file, 'w')

        fout.write('Scaffold id')
        for kmer in self.canonical_order():
            fout.write('\t' + kmer)
        fout.write('\n')

        for seq_id, tetra_signature in signatures.iteritems():
            fout.write(seq_id + '\t')
            fout.write('\t'.join(map(str, tetra_signature)))
            fout.write('\n')

        fout.close()
Ejemplo n.º 4
0
    def kmeans(self, scaffold_stats, num_clusters, num_components, K,
               no_coverage, no_pca, iterations, genome_file, output_dir):
        """Cluster genome with k-means.

        Parameters
        ----------
        scaffold_stats : ScaffoldStats
            Statistics for individual scaffolds.
        num_clusters : int
            Number of cluster to form.
        num_components : int
            Number of PCA components to consider.
        K : int
            K-mer size to use for calculating genomic signature
        no_coverage : boolean
            Flag indicating if coverage information should be used during clustering.
        no_pca : boolean
            Flag indicating if PCA of genomic signature should be calculated.
        iterations: int
            iterations to perform during clustering
        genome_file : str
            Sequences being clustered.
        output_dir : str
            Directory to write results.
        """

        # get GC and mean coverage for each scaffold in genome
        self.logger.info('Determining mean coverage and genomic signatures.')
        signatures = GenomicSignature(K)
        genome_stats = []
        signature_matrix = []
        seqs = seq_io.read(genome_file)
        for seq_id, seq in seqs.items():
            stats = scaffold_stats.stats[seq_id]

            if not no_coverage:
                genome_stats.append((np_mean(stats.coverage)))
            else:
                genome_stats.append(())

            if K == 0:
                pass
            elif K == 4:
                signature_matrix.append(stats.signature)
            else:
                sig = signatures.seq_signature(seq)
                total_kmers = sum(sig)
                for i in range(0, len(sig)):
                    sig[i] = float(sig[i]) / total_kmers
                signature_matrix.append(sig)

        # calculate PCA of signatures
        if K != 0:
            if not no_pca:
                self.logger.info('Calculating PCA of genomic signatures.')
                pc, variance = self.pca(signature_matrix)
                self.logger.info(
                    'First {:,} PCs capture {:.1f}% of the variance.'.format(
                        num_components,
                        sum(variance[0:num_components]) * 100))

                for i, stats in enumerate(genome_stats):
                    genome_stats[i] = np_append(stats, pc[i][0:num_components])
            else:
                self.logger.info('Using complete genomic signature.')
                for i, stats in enumerate(genome_stats):
                    genome_stats[i] = np_append(stats, signature_matrix[i])

        # whiten data if feature matrix contains coverage and genomic signature data
        if not no_coverage and K != 0:
            self.logger.info('Whitening data.')
            genome_stats = whiten(genome_stats)
        else:
            genome_stats = np_array(genome_stats)

        # cluster
        self.logger.info(
            'Partitioning genome into {:,} clusters.'.format(num_clusters))

        bError = True
        while bError:
            try:
                bError = False
                _centroids, labels = kmeans2(genome_stats,
                                             num_clusters,
                                             iterations,
                                             minit='points',
                                             missing='raise')
            except ClusterError:
                bError = True

        for k in range(num_clusters):
            self.logger.info('Placed {:,} sequences in cluster {:,}.'.format(
                sum(labels == k), (k + 1)))

        # write out clusters
        genome_id = remove_extension(genome_file)
        for k in range(num_clusters):
            fout = open(
                os.path.join(output_dir,
                             genome_id + '_c%d' % (k + 1) + '.fna'), 'w')
            for i in np_where(labels == k)[0]:
                seq_id = seqs.keys()[i]
                fout.write('>' + seq_id + '\n')
                fout.write(seqs[seq_id] + '\n')
            fout.close()