class Tetranucleotide(object):
    """Calculate tetranucleotide signature of sequences."""

    def __init__(self, cpus=1):
        """Initialization.

        Parameters
        ----------
        cpus : int
            Number of cpus to use.
        """

        self.logger = logging.getLogger()

        self.cpus = cpus
        self.signatures = GenomicSignature(4)

    def canonical_order(self):
        """Canonical order of tetranucleotides."""
        return self.signatures.canonical_order()

    def _producer(self, seq_info):
        """Calculate tetranucleotide signature of a sequence.

        Parameters
        ----------
        seq_info : tuple -> (seq_id, seq)
            Unique id of the sequence followed by the sequence itself.

        Returns
        -------
        str
            Unique id of sequence.
        list
            Relative frequency of each kmer, in the order produced by
            the signature calculator. All zeros when no kmer was counted.
        """

        seq_id, seq = seq_info

        counts = self.signatures.seq_signature(seq)
        total_kmers = sum(counts)

        # A sequence without any countable tetranucleotide would otherwise
        # trigger a division by zero; report an all-zero signature instead.
        if not total_kmers:
            return (seq_id, [0.0] * len(counts))

        return (seq_id, [float(count) / total_kmers for count in counts])

    def _consumer(self, produced_data, consumer_data):
        """Consume results from producer processes.

        Parameters
        ----------
        produced_data : tuple -> (seq_id, signature)
            Result of a single call to the producer.
        consumer_data : d[seq_id] -> tetranucleotide signature
            Signatures collected so far, or None on the first call.

        Returns
        -------
        consumer_data : dict
            The consumer data structure with the new signature added.
        """

        if consumer_data is None:
            consumer_data = {}

        seq_id, sig = produced_data
        consumer_data[seq_id] = sig

        return consumer_data

    def _progress(self, processed_items, total_items):
        """Report progress of consumer processes.

        Parameters
        ----------
        processed_items : int
            Number of sequences processed.
        total_items : int
            Total number of sequences to process.

        Returns
        -------
        str
            String indicating progress of data processing.
        """

        percent_done = float(processed_items) * 100 / total_items
        return ' Finished processing %d of %d (%.2f%%) sequences.' % (processed_items,
                                                                      total_items,
                                                                      percent_done)

    def run(self, seq_file):
        """Calculate tetranucleotide signatures of sequences.

        Parameters
        ----------
        seq_file : str
            Name of fasta/q file to read.

        Returns
        -------
        dict : d[seq_id] -> tetranucleotide signature in canonical order
            Relative frequency of each kmer.
        """

        self.logger.info(' Calculating tetranucleotide signature for each sequence:')

        parallel = Parallel(self.cpus)
        seq_signatures = parallel.run_seqs_file(self._producer,
                                                self._consumer,
                                                seq_file,
                                                self._progress)

        return seq_signatures

    def read(self, signature_file):
        """Read tetranucleotide signatures.

        The header row names the kmer held by each column. Columns may be
        in any order; they are rearranged into sorted kmer order, which
        must equal the canonical order.

        Parameters
        ----------
        signature_file : str
            Name of file to read.

        Returns
        -------
        dict : d[seq_id] -> tetranucleotide signature in canonical order
            Value of each kmer column as a float.

        Raises
        ------
        SystemExit
            If the file cannot be opened or its header is invalid. The
            reason is printed before exiting.
        """

        try:
            sig = {}
            with open(signature_file) as f:
                header = f.readline().split('\t')
                kmer_order = [x.strip().upper() for x in header[1:]]
                expected_order = self.canonical_order()
                if len(kmer_order) != len(expected_order):
                    raise ParsingError("[Error] Tetranucleotide file must contain exactly %d tetranucleotide columns." % len(expected_order))

                # Column positions that put the file's kmers in sorted order.
                canonical_order_index = np.argsort(kmer_order)
                canonical_order = [kmer_order[i] for i in canonical_order_index]

                if canonical_order != expected_order:
                    raise ParsingError("[Error] Failed to process tetranucleotide signature file: " + signature_file)

                for line in f:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    if not line.strip():
                        continue

                    line_split = line.split('\t')
                    # +1 skips the leading sequence id column.
                    sig[line_split[0]] = [float(line_split[i + 1]) for i in canonical_order_index]

            return sig
        except IOError:
            print('[Error] Failed to open signature file: %s' % signature_file)
            sys.exit()
        except ParsingError as e:
            # Report why parsing failed instead of exiting silently.
            print(e)
            sys.exit()

    def write(self, signatures, output_file):
        """Write tetranucleotide signatures.

        Parameters
        ----------
        signatures : d[seq_id] -> tetranucleotide signature in canonical order
            Signature of each sequence.
        output_file : str
            Name of output file.
        """

        # The context manager closes the file even if a write fails.
        with open(output_file, 'w') as fout:
            fout.write('Scaffold id')
            for kmer in self.canonical_order():
                fout.write('\t' + kmer)
            fout.write('\n')

            for seq_id, tetra_signature in signatures.items():
                fout.write(seq_id + '\t')
                fout.write('\t'.join(map(str, tetra_signature)))
                fout.write('\n')
class Tetranucleotide(object):
    """Calculate tetranucleotide signature of sequences."""

    def __init__(self, cpus=1):
        """Initialization.

        Parameters
        ----------
        cpus : int
            Number of cpus to use.
        """

        self.logger = logging.getLogger('timestamp')

        self.cpus = cpus
        self.signatures = GenomicSignature(4)

    def canonical_order(self):
        """Canonical order of tetranucleotides."""
        return self.signatures.canonical_order()

    def _producer(self, seq_info):
        """Calculate tetranucleotide signature of a sequence.

        Parameters
        ----------
        seq_info : tuple -> (seq_id, seq)
            Unique id of the sequence followed by the sequence itself.

        Returns
        -------
        str
            Unique id of sequence.
        list
            Relative frequency of each kmer, in the order produced by
            the signature calculator. All zeros when no kmer was counted.
        """

        seq_id, seq = seq_info

        counts = self.signatures.seq_signature(seq)
        total_kmers = sum(counts)

        # A sequence without any countable tetranucleotide would otherwise
        # trigger a division by zero; report an all-zero signature instead.
        if not total_kmers:
            return (seq_id, [0.0] * len(counts))

        return (seq_id, [float(count) / total_kmers for count in counts])

    def _consumer(self, produced_data, consumer_data):
        """Consume results from producer processes.

        Parameters
        ----------
        produced_data : tuple -> (seq_id, signature)
            Result of a single call to the producer.
        consumer_data : d[seq_id] -> tetranucleotide signature
            Signatures collected so far, or None on the first call.

        Returns
        -------
        consumer_data : dict
            The consumer data structure with the new signature added.
        """

        if consumer_data is None:
            consumer_data = {}

        seq_id, sig = produced_data
        consumer_data[seq_id] = sig

        return consumer_data

    def _progress(self, processed_items, total_items):
        """Report progress of consumer processes.

        Parameters
        ----------
        processed_items : int
            Number of sequences processed.
        total_items : int
            Total number of sequences to process.

        Returns
        -------
        str or None
            String indicating progress of data processing, or None when
            the logger is flagged as silent.
        """

        # NOTE(review): is_silent is not an attribute of the standard
        # logging.Logger; presumably set by the project's logger setup.
        if self.logger.is_silent:
            return None

        percent_done = float(processed_items) * 100 / total_items
        return ' Finished processing %d of %d (%.2f%%) sequences.' % (processed_items,
                                                                      total_items,
                                                                      percent_done)

    def run(self, seq_file):
        """Calculate tetranucleotide signatures of sequences.

        Parameters
        ----------
        seq_file : str
            Name of fasta/q file to read.

        Returns
        -------
        dict : d[seq_id] -> tetranucleotide signature in canonical order
            Relative frequency of each kmer.
        """

        self.logger.info('Calculating tetranucleotide signature for each sequence:')

        parallel = Parallel(self.cpus)
        seq_signatures = parallel.run_seqs_file(self._producer,
                                                self._consumer,
                                                seq_file,
                                                self._progress)

        return seq_signatures

    def read(self, signature_file):
        """Read tetranucleotide signatures.

        The header row names the kmer held by each column. Columns may be
        in any order; they are rearranged into sorted kmer order, which
        must equal the canonical order.

        Parameters
        ----------
        signature_file : str
            Name of file to read.

        Returns
        -------
        dict : d[seq_id] -> tetranucleotide signature in canonical order
            Value of each kmer column as a float.

        Raises
        ------
        SystemExit
            If the file cannot be opened or its header is invalid. The
            reason is printed before exiting.
        """

        try:
            sig = {}
            with open(signature_file) as f:
                header = f.readline().split('\t')
                kmer_order = [x.strip().upper() for x in header[1:]]
                expected_order = self.canonical_order()
                if len(kmer_order) != len(expected_order):
                    raise ParsingError("[Error] Tetranucleotide file must contain exactly %d tetranucleotide columns." % len(expected_order))

                # Column positions that put the file's kmers in sorted order.
                canonical_order_index = np.argsort(kmer_order)
                canonical_order = [kmer_order[i] for i in canonical_order_index]

                if canonical_order != expected_order:
                    raise ParsingError("[Error] Failed to process tetranucleotide signature file: " + signature_file)

                for line in f:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    if not line.strip():
                        continue

                    line_split = line.split('\t')
                    # +1 skips the leading sequence id column.
                    sig[line_split[0]] = [float(line_split[i + 1]) for i in canonical_order_index]

            return sig
        except IOError:
            print('[Error] Failed to open signature file: %s' % signature_file)
            sys.exit()
        except ParsingError as e:
            # Report why parsing failed instead of exiting silently.
            print(e)
            sys.exit()

    def write(self, signatures, output_file):
        """Write tetranucleotide signatures.

        Parameters
        ----------
        signatures : d[seq_id] -> tetranucleotide signature in canonical order
            Signature of each sequence.
        output_file : str
            Name of output file.
        """

        # The context manager closes the file even if a write fails.
        with open(output_file, 'w') as fout:
            fout.write('Scaffold id')
            for kmer in self.canonical_order():
                fout.write('\t' + kmer)
            fout.write('\n')

            for seq_id, tetra_signature in signatures.items():
                fout.write(seq_id + '\t')
                fout.write('\t'.join(map(str, tetra_signature)))
                fout.write('\n')
class KmerUsage(object):
    """Calculate kmer usage over a set of genomes.

    The implementation for calculating genomic signatures
    is not optimized for speed. As such, this class is
    useful for k <= 8.
    """

    def __init__(self, k, cpus=1):
        """Initialization.

        Parameters
        ----------
        k : int
            Size of the kmers to count.
        cpus : int
            Number of cpus to use.
        """

        self.logger = logging.getLogger('timestamp')

        self.k = k
        self.cpus = cpus

        self.logger.info('Calculating unique kmers of size k = %d.' % self.k)
        self.signatures = GenomicSignature(self.k)

    def _producer(self, genome_file):
        """Calculates kmer usage of a genome.

        Parameters
        ----------
        genome_file : str
            Fasta file containing genomic sequences.

        Returns
        -------
        str
            Unique identifier of genome (file name without its
            directory and final extension).
        sequence
            Kmer usage as returned by the signature calculator; the
            consumer indexes it by position in the canonical order.
        """

        # ntpath.basename copes with both '/' and '\\' separators.
        genome_id = ntpath.basename(genome_file)
        genome_id = os.path.splitext(genome_id)[0]

        seqs = seq_io.read_fasta(genome_file)
        kmer_usage = self.signatures.counts(seqs)

        return (genome_id, kmer_usage)

    def _consumer(self, produced_data, consumer_data):
        """Consume results from producer processes.

        Parameters
        ----------
        produced_data : tuple -> (genome_id, kmer_usage)
            Unique id of a genome followed by its kmer usage, indexed
            by position in the canonical order.
        consumer_data : d[genome_id][kmer] -> usage
            Results collected so far, or None on the first call.

        Returns
        -------
        consumer_data : dict of dict
            Kmer usage of each genome, keyed by genome id and then kmer.
        """

        if consumer_data is None:
            consumer_data = defaultdict(dict)

        genome_id, kmer_usage = produced_data
        for idx, kmer in enumerate(self.signatures.canonical_order()):
            consumer_data[genome_id][kmer] = kmer_usage[idx]

        return consumer_data

    def _progress(self, processed_items, total_items):
        """Report progress of consumer processes.

        Parameters
        ----------
        processed_items : int
            Number of genomes processed.
        total_items : int
            Total number of genomes to process.

        Returns
        -------
        str
            String indicating progress of data processing.
        """

        percent_done = float(processed_items) * 100 / total_items
        return ' Finished processing %d of %d (%.2f%%) genomes.' % (processed_items,
                                                                    total_items,
                                                                    percent_done)

    def run(self, genome_files):
        """Calculate kmer usage over a set of genomes.

        Parameters
        ----------
        genome_files : list
            Fasta files containing genomic sequences in nucleotide space.

        Returns
        -------
        dict of dict : d[genome_id][kmer] -> count
            Kmer usage of each genome.
        sequence
            Kmers in the canonical order reported by the signature
            calculator.
        """

        self.logger.info('Calculating kmer usage for each genome.')

        # Suppress progress reporting when the logger is flagged silent.
        # NOTE(review): is_silent is not an attribute of the standard
        # logging.Logger; presumably set by the project's logger setup.
        progress_func = self._progress
        if self.logger.is_silent:
            progress_func = None

        parallel = Parallel(self.cpus)
        kmer_counts = parallel.run(self._producer,
                                   self._consumer,
                                   genome_files,
                                   progress_func)

        return kmer_counts, self.signatures.canonical_order()