def test_count_kmers_k_greater_than_text_length(self):
    """
    Check that count_kmers raises ValueError when k exceeds the text length.

    :return: None
    """
    text = "abc"
    # One more than the number of characters available in the text.
    oversized_k = len(text) + 1
    with pytest.raises(ValueError):
        kmers.count_kmers(text, oversized_k)
def test_count_kmers_k_not_greater_than_0(self):
    """
    Test if calling count_kmers with k <= 0 raises ValueError

    Both the boundary value 0 and a negative value are exercised so that
    the whole "k <= 0" range named by this test is covered, not only the
    negative side.

    :return: None
    """
    for non_positive_k in (0, -1):
        # A separate raises-context per value: each k must raise on its own.
        with pytest.raises(ValueError):
            kmers.count_kmers("abc", non_positive_k)
def test_count_kmers_empty_text(self):
    """
    Check that count_kmers raises ValueError when given an empty string.

    :return: None
    """
    empty_text = ""
    with pytest.raises(ValueError):
        kmers.count_kmers(empty_text, 1)
def test_count_kmers_null_text(self):
    """
    Check that count_kmers raises ValueError when the text is None.

    :return: None
    """
    missing_text = None
    with pytest.raises(ValueError):
        kmers.count_kmers(missing_text, 1)
def test_count_7mers():
    """
    Cross-check count_kmers for k=7 against a hand-computed base-4 index.

    The test asserts that:
    - the counts sum to twice the number of length-7 windows in the sequence;
    - counting on the reverse complement gives the same counts;
    - decrementing the count at the index of every window, once for the
      forward encoding and once for the complemented, reversed encoding,
      brings the total back to zero.

    The index of a window is its bases read as base-4 digits, with the
    window's last base as the most significant digit.
    """
    k = 7
    seq = 'ACGTCGACCGCGTTA'

    counts = count_kmers(k, seq)
    assert counts.sum() == (len(seq) - (k - 1)) * 2

    counts_rc = count_kmers(k, reverse_complement(seq))
    assert (counts == counts_rc).all()

    # Unknown characters map to -1; the sequence above contains only ACGT.
    forward_codes = defaultdict(lambda: -1, {'A': 0, 'C': 1, 'G': 2, 'T': 3})
    complement_codes = defaultdict(lambda: -1, {'A': 3, 'C': 2, 'G': 1, 'T': 0})
    seqi_fw = np.array([forward_codes[base] for base in seq])
    seqi_rc = np.array([complement_codes[base] for base in reversed(seq)])

    for encoded in (seqi_fw, seqi_rc):
        for end in range(k - 1, len(encoded)):
            # Horner evaluation, walking backwards from the window's last base.
            index = 0
            for offset in range(k):
                index = index * 4 + encoded[end - offset]
            counts[index] -= 1
    assert counts.sum() == 0
def test_count_kmers_valid(self):
    """
    Check that count_kmers returns the expected result for a valid k.

    The text "ATTTGGATT" with k=4 is expected to yield 6.

    :return: None
    """
    text = "ATTTGGATT"
    expected = 6
    result = kmers.count_kmers(text, 4)
    assert result == expected
def test_non_acgt():
    """
    Smoke test: call count_kmers with k=1 on a string of lowercase letters.

    Nothing is asserted about the result; the test passes as long as the
    call does not raise.
    """
    letters = 'abcdefghijklmnopqrstvwuxyz'
    count_kmers(1, letters)
def test_kmers_lookup():
    """
    Check that the lookup-based count over the whole sequence matches
    the direct count for k=2.
    """
    k = 2
    seq = 'ACGTCGACCGCGTTA' * 100

    expected = count_kmers(k, seq)
    table = create_kmers_lookup(k, seq)
    # Query the full range [0, len(seq)) so both counts cover the same input.
    observed = count_kmers_lookup(table, 0, len(seq))

    assert (expected == observed).all()
#!/usr/bin/env python
"""Print per-k-mer counts accumulated over a set of genome regions.

Usage: SCRIPT GENOME_FASTA K REGIONS_FILE

One line is written to standard output per k-mer produced by
``all_kmers(K)``, formatted as ``<kmer><TAB><count>``.
"""
from pyfasta import Fasta
from tsh.bio import regions_reader
import sys
from kmers import count_kmers, all_kmers


def main(argv):
    """Accumulate k-mer counts over all regions and print them.

    :param argv: command-line vector; ``argv[1]`` is the genome FASTA path,
        ``argv[2]`` is k, ``argv[3]`` is the regions file. Extra trailing
        arguments are ignored.
    :return: None
    """
    if len(argv) < 4:
        sys.exit('usage: %s GENOME_FASTA K REGIONS_FILE' % argv[0])

    genome = Fasta(argv[1])
    k = int(argv[2])

    # None (rather than 0) marks "nothing counted yet", so the output loop
    # below never tries to iterate over a plain integer.
    totals = None
    for region in regions_reader(argv[3]):
        seq = str(genome[region.chrom][region.start:region.stop]).upper()
        # Split on 'N' so each piece is counted separately and no k-mer
        # spans an 'N'.
        for segment in seq.split('N'):
            counts = count_kmers(k, segment)
            totals = counts if totals is None else totals + counts

    if totals is None:
        # No regions were read: every k-mer was seen zero times.
        for kmer in all_kmers(k):
            print('%s\t%d' % (kmer, 0))
        return

    # A single parenthesised argument prints identically on Python 2 and 3.
    for kmer, count in zip(all_kmers(k), totals):
        print('%s\t%d' % (kmer, count))


if __name__ == '__main__':
    main(sys.argv)