def test_four_kmers_all(self):
    """Check the 4-mer frequency table reported for 'AAAAATTTTTTTT'.

    The table is requested with ``max_kmer_count=5`` and compared against
    the fixed expectation below.
    """
    kmer_source = Kmers('AAAAATTTTTTTT', 4)
    observed = kmer_source.get_all_kmers_freq(max_kmer_count=5)
    expected = {
        'AAAA': 2,
        'AAAT': 1,
        'AATT': 1,
        'ATTT': 1,
        'TTTT': 4,
    }
    self.assertEqual(observed, expected)
def sequence_kmers_vals(self):
    """Map each FASTA record id to the k-mer frequencies of its sequence.

    Parses ``self.filename`` as FASTA. When ``self.divisible_by_3`` is
    truthy, records whose sequence length is not a multiple of 3 are
    skipped and a warning naming the record id is logged.

    Returns:
        A dict keyed by ``record.id``. Each value is the result of
        ``Kmers.get_all_kmers_freq`` for that record's sequence, called
        with ``max_kmer_count=self.max_kmers``.
    """
    kmer_to_sequences = {}
    for record in SeqIO.parse(self.filename, "fasta"):
        if self.divisible_by_3 and len(record.seq) % 3 != 0:
            self.logger.warning(
                "Excluding gene as it is not divisible by 3:" + record.id)
            continue

        # Record ids are assumed to be unique within the FASTA file; a
        # repeated id would overwrite the entry stored for the earlier one.
        kmers = Kmers(str(record.seq), self.k)
        kmer_to_sequences[record.id] = kmers.get_all_kmers_freq(
            max_kmer_count=self.max_kmers)
    return kmer_to_sequences