Example #1
0
 def test_count_kmers_k_greater_than_text_length(self):
     """
     count_kmers must raise ValueError when k exceeds the
     length of the text (here k = 4 for a 3-character text).
     :return: None
     """
     text, k = "abc", 4
     with pytest.raises(ValueError):
         kmers.count_kmers(text, k)
Example #2
0
 def test_count_kmers_k_not_greater_than_0(self):
     """
     count_kmers must raise ValueError when k is not
     positive (exercised here with k = -1).
     :return: None
     """
     text, k = "abc", -1
     with pytest.raises(ValueError):
         kmers.count_kmers(text, k)
Example #3
0
 def test_count_kmers_empty_text(self):
     """
     count_kmers must raise ValueError when the text is
     the empty string.
     :return: None
     """
     text, k = "", 1
     with pytest.raises(ValueError):
         kmers.count_kmers(text, k)
Example #4
0
 def test_count_kmers_null_text(self):
     """
     count_kmers must raise ValueError when the text is
     None rather than a string.
     :return: None
     """
     text, k = None, 1
     with pytest.raises(ValueError):
         kmers.count_kmers(text, k)
Example #5
0
def test_count_7mers():
    """Cross-check count_kmers(7, ...) against a hand-computed index walk.

    Three properties are asserted:

    * the counts sum to twice the number of 7-long windows in the sequence;
    * counting the reverse complement yields the same count array;
    * decrementing the count at the base-4 index of every window, once for
      the forward encoding and once for the reversed/complemented encoding,
      brings the total back to zero.
    """
    k = 7
    seq = 'ACGTCGACCGCGTTA'

    counts = count_kmers(k, seq)
    assert counts.sum() == (len(seq) - 6) * 2

    counts_rc = count_kmers(k, reverse_complement(seq))
    assert (counts == counts_rc).all()

    # Unknown characters would map to -1; the sequence here is pure ACGT.
    forward_code = defaultdict(lambda: -1, {'A': 0, 'C': 1, 'G': 2, 'T': 3})
    complement_code = defaultdict(lambda: -1, {'A': 3, 'C': 2, 'G': 1, 'T': 0})
    encodings = [
        np.array([forward_code[base] for base in seq]),
        np.array([complement_code[base] for base in reversed(seq)]),
    ]

    for codes in encodings:
        for end in range(k - 1, len(codes)):
            # Horner evaluation of the base-4 number whose most significant
            # digit is codes[end] and least significant is codes[end - 6].
            index = 0
            for back in range(k):
                index = index * 4 + codes[end - back]
            counts[index] -= 1

    assert counts.sum() == 0
Example #6
0
 def test_count_kmers_valid(self):
     """
     count_kmers called with a valid text and k must return
     the expected value (6 for "ATTTGGATT" with k = 4).
     :return: None
     """
     result = kmers.count_kmers("ATTTGGATT", 4)
     assert result == 6
Example #7
0
def test_non_acgt():
    """Smoke test: counting 1-mers over lowercase, mostly non-ACGT letters.

    There is no assertion; the test passes as long as the call does not raise.
    """
    letters = 'abcdefghijklmnopqrstvwuxyz'
    count_kmers(1, letters)
Example #8
0
def test_kmers_lookup():
    """Counts obtained through a k-mer lookup must equal the direct counts.

    Uses 2-mers over a 15-base motif repeated 100 times and queries the
    lookup over the full range [0, len(seq)).
    """
    k = 2
    seq = 'ACGTCGACCGCGTTA' * 100
    direct = count_kmers(k, seq)
    table = create_kmers_lookup(k, seq)
    via_lookup = count_kmers_lookup(table, 0, len(seq))
    assert (direct == via_lookup).all()
Example #9
0
#!/usr/bin/env python

from pyfasta import Fasta
from tsh.bio import regions_reader
import sys
from kmers import count_kmers, all_kmers

if __name__ == '__main__':
    # Command-line arguments:
    #   argv[1]  path handed to pyfasta.Fasta (the genome)
    #   argv[2]  k, the k-mer length (parsed as int)
    #   argv[3]  path handed to regions_reader (the regions to scan)
    # Output: one "<kmer>\t<count>" line per entry of all_kmers(k).
    genome = Fasta(sys.argv[1])
    k = int(sys.argv[2])

    # Running total of the per-piece counts. Named `totals` so it is not
    # confused with the `kmers` module the helpers are imported from.
    # It starts as the int 0; the first `+=` rebinds it to 0 + count_kmers(...).
    totals = 0
    for region in regions_reader(sys.argv[3]):
        seq = str(genome[region.chrom][region.start:region.stop]).upper()
        # Count each 'N'-free piece on its own so that no counted window
        # contains an 'N'.
        for piece in seq.split('N'):
            totals += count_kmers(k, piece)

    # NOTE(review): if no piece was processed, totals is still the int 0 and
    # zip() below raises TypeError -- confirm whether an empty regions file
    # is a supported input.
    for kmer, count in zip(all_kmers(k), totals):
        # Single-argument print(...) runs on Python 3 and behaves the same
        # under Python 2, where the original print statement only worked.
        print('%s\t%d' % (kmer, count))