Example #1
0
def test_save_load_gz():
    """Round-trip a counting table through a save file named '*.gz'.

    A table is filled from a FASTA file, saved, and loaded into a second
    table; both tables must report the same abundance distribution.
    """
    fasta_path = utils.get_test_data('random-20-a.fa')
    gz_path = utils.get_temp_filename('tempcountingsave2.ht.gz')

    table_sizes = list(PRIMES_1m) + [1000005]

    original = khmer._CountingHash(12, table_sizes)
    original.consume_fasta(fasta_path)
    original.save(gz_path)

    reloaded = khmer._CountingHash(12, table_sizes)
    try:
        reloaded.load(gz_path)
    except OSError as err:
        assert 0, 'Should not produce an OSError: ' + str(err)

    # Each abundance_distribution call gets its own fresh tracking table.
    dist_original = original.abundance_distribution(
        fasta_path, khmer._Hashbits(12, table_sizes))
    dist_reloaded = reloaded.abundance_distribution(
        fasta_path, khmer._Hashbits(12, table_sizes))

    total = sum(dist_original)
    assert total == 3966, total
    assert dist_original == dist_reloaded, (dist_original, dist_reloaded)
Example #2
0
def test_hashtable_n_entries():
    """n_entries() must raise TypeError when handed an argument."""
    table = khmer._CountingHash(4, [5])
    try:
        table.n_entries("nope")
    except TypeError as err:
        print(str(err))
    else:
        assert 0, "n_entries should accept no arguments"
Example #3
0
def test_3_tables():
    """Check the reported count of a k-mer as colliding k-mers are added.

    Three other 12-mers are consumed one at a time; the count reported for
    the all-G k-mer is expected to stay at 1 until the third one is added.
    """
    table_sizes = list(PRIMES_1m) + [1000005]
    hi = khmer._CountingHash(12, table_sizes)

    poly_g = 'G' * 12                   # forward_hash: 11184810
    assert khmer.forward_hash(poly_g, 12) == 11184810

    # (k-mer, expected forward hash), in the order they are consumed:
    #   hash(poly_g) % 1000003 == hash(first)
    #   hash(poly_g) % 1009837 == hash(second)
    #   hash(poly_g) % 1000005 == hash(third)
    colliders = [
        ('AAACGTATGACT', 184777),
        ('AAATACCGAGCG', 76603),
        ('AAACGTATCGAG', 184755),
    ]
    for kmer, expected_hash in colliders:
        assert khmer.forward_hash(kmer, 12) == expected_hash

    hi.consume(poly_g)
    assert hi.get(poly_g) == 1

    for (kmer, _), expected_count in zip(colliders, (1, 1, 2)):
        hi.consume(kmer)
        assert hi.get(poly_g) == expected_count
def test_hashtable_n_entries():
    """Calling n_entries() with an argument is expected to be a TypeError."""
    counting = khmer._CountingHash(4, [5])
    try:
        counting.n_entries("nope")
    except TypeError as err:
        print(str(err))
    else:
        assert 0, "n_entries should accept no arguments"
def test_no_collision():
    """Count 'AAAA', then its reverse complement 'TTTT'; expect 1, then 2."""
    table = khmer._CountingHash(4, [5])

    # 'TTTT' is the reverse complement of 'AAAA'.
    for kmer, expected in (('AAAA', 1), ('TTTT', 2)):
        table.count(kmer)
        assert table.get(kmer) == expected
def test_collision():
    """Count 'AAAA' and then 'TTTT'; the lookups must return 1 and then 2."""
    table = khmer._CountingHash(4, [5])

    for kmer, expected in (('AAAA', 1), ('TTTT', 2)):
        table.count(kmer)
        assert table.get(kmer) == expected
Example #7
0
def test_no_collision():
    """A k-mer and its reverse complement are expected to share one count."""
    table = khmer._CountingHash(4, [5])

    forward, revcomp = 'AAAA', 'TTTT'

    table.count(forward)
    assert table.get(forward) == 1

    table.count(revcomp)  # reverse complement of the first k-mer
    assert table.get(revcomp) == 2
Example #8
0
def test_collision():
    """After counting 'AAAA' and 'TTTT', get('TTTT') must return 2."""
    table = khmer._CountingHash(4, [5])

    first, second = 'AAAA', 'TTTT'

    table.count(first)
    assert table.get(first) == 1

    table.count(second)
    assert table.get(second) == 2
def test_maxcount_consume():
    """consume() on a 10000-base run of 'A' must leave 'AAAA' at MAX_COUNT."""
    # hashtable should saturate at some point so as not to overflow counter
    table = khmer._CountingHash(4, [5])

    table.consume("A" * 10000)

    count = table.get('AAAA')
    # the saturation value will depend on HashcountType...
    assert count == MAX_COUNT, count
def test_consume_uniqify_first():
    """After consuming 'TTTT', looking up 'AAAA' must return a count of 1."""
    table = khmer._CountingHash(4, [5])

    kmer, kmer_rc = "TTTT", "AAAA"

    table.consume(kmer)
    assert table.get(kmer_rc) == 1
Example #11
0
def test_load_notexist_should_fail():
    """load() on a temp path that was never written must raise OSError."""
    # Nothing is saved to this path before load() is called on it.
    missing_path = utils.get_temp_filename('temphashbitssave0.ht')

    table = khmer._CountingHash(12, [1])
    try:
        table.load(missing_path)
    except OSError:
        pass
    else:
        assert 0, "load should fail"
Example #12
0
def test_load_notexist_should_fail():
    """Loading from a path that this test never wrote must raise OSError."""
    unwritten_path = utils.get_temp_filename('temphashbitssave0.htable')

    counting = khmer._CountingHash(12, [1])
    try:
        counting.load(unwritten_path)
    except OSError:
        pass
    else:
        assert 0, "load should fail"
Example #13
0
def test_consume_uniqify_first():
    """Consume 'TTTT' and check that get('AAAA') reports exactly one hit."""
    counting = khmer._CountingHash(4, [5])

    consumed = "TTTT"
    queried = "AAAA"  # reverse complement of the consumed k-mer

    counting.consume(consumed)
    hits = counting.get(queried)
    assert hits == 1
Example #14
0
def test_maxcount_consume():
    """A 10000-base homopolymer fed to consume() must stop at MAX_COUNT."""
    # hashtable should saturate at some point so as not to overflow counter
    counting = khmer._CountingHash(4, [5])

    homopolymer = "A" * 10000
    counting.consume(homopolymer)

    observed = counting.get('AAAA')
    # this will depend on HashcountType...
    assert observed == MAX_COUNT, observed
Example #15
0
def test_maxcount_consume_with_bigcount():
    """With bigcount enabled, consume() must count every 'AAAA' window."""
    # use the bigcount hack to avoid saturating the hashtable.
    table = khmer._CountingHash(4, [5])
    table.set_use_bigcount(True)

    table.consume("A" * 10000)

    # A 10000-base sequence contains 10000 - 3 overlapping 4-mers.
    count = table.get('AAAA')
    assert count == 10000 - 3, count
def test_maxcount_consume_with_bigcount():
    """Bigcount mode: 'AAAA' must reach 9997 after one long consume()."""
    # use the bigcount hack to avoid saturating the hashtable.
    counting = khmer._CountingHash(4, [5])
    counting.set_use_bigcount(True)

    homopolymer = "A" * 10000
    counting.consume(homopolymer)

    observed = counting.get('AAAA')
    # 10000 bases give 10000 - 3 overlapping windows of length 4.
    assert observed == 10000 - 3, observed
def test_get_maxcount():
    """get_max_count() must report 2, then 4, as a sequence is consumed twice."""
    table = khmer._CountingHash(4, [7])

    seq = "AAAAACGT"

    # Each pass consumes the sequence once more before querying it.
    for expected in (2, 4):
        table.consume(seq)
        assert table.get_max_count(seq) == expected
def test_get_mincount():
    """get_min_count() must report 1, then 2, as a sequence is consumed twice."""
    table = khmer._CountingHash(4, [5])

    seq = "AAAAACGT"

    # Each pass consumes the sequence once more before querying it.
    for expected in (1, 2):
        table.consume(seq)
        lowest = table.get_min_count(seq)
        assert lowest == expected, lowest
Example #19
0
def test_badcount():
    """count() must reject a missing argument and a wrong-length k-mer."""
    table = khmer._CountingHash(4, [5])

    # No argument at all: expected to be a TypeError.
    try:
        table.count()
    except TypeError as err:
        print(str(err))
    else:
        assert 0, "count should require one argument"

    # Five characters against a table built with k=4: expected ValueError.
    try:
        table.count('ABCDE')
    except ValueError as err:
        print(str(err))
    else:
        assert 0, "count should require k-mer size to be equal"
Example #20
0
def test_get_maxcount():
    """Consume a sequence twice; the max count must be 2 and then 4."""
    counting = khmer._CountingHash(4, [7])

    sequence = "AAAAACGT"

    counting.consume(sequence)
    assert counting.get_max_count(sequence) == 2

    counting.consume(sequence)
    assert counting.get_max_count(sequence) == 4
Example #21
0
def test_hashbits_file_type_check():
    """A _Hashbits table must refuse to load a file saved by _CountingHash."""
    counting = khmer._CountingHash(12, [1])
    ct_path = utils.get_temp_filename('tempcountingsave0.ct')
    counting.save(ct_path)

    bits = khmer._Hashbits(12, [1])

    try:
        bits.load(ct_path)
    except OSError as e:
        print(str(e))
    else:
        assert 0, "this should fail"
Example #22
0
def test_hashbits_file_type_check():
    """Loading a counting-table save file into _Hashbits must raise OSError."""
    source_table = khmer._CountingHash(12, [1])
    saved_to = utils.get_temp_filename('tempcountingsave0.ct')
    source_table.save(saved_to)

    wrong_type_table = khmer._Hashbits(12, [1])

    try:
        wrong_type_table.load(saved_to)
    except OSError as e:
        print(str(e))
    else:
        assert 0, "this should fail"
Example #23
0
def test_get_mincount():
    """Consume a sequence twice; the min count must be 1 and then 2."""
    counting = khmer._CountingHash(4, [5])

    sequence = "AAAAACGT"

    counting.consume(sequence)
    first_min = counting.get_min_count(sequence)
    assert first_min == 1, first_min

    counting.consume(sequence)
    second_min = counting.get_min_count(sequence)
    assert second_min == 2, second_min
def test_badcount():
    """Exercise the two argument errors that count() is expected to raise."""
    counting = khmer._CountingHash(4, [5])

    try:
        counting.count()                # no k-mer supplied
    except TypeError as err:
        print(str(err))
    else:
        assert 0, "count should require one argument"

    try:
        counting.count('ABCDE')         # 5 characters, table uses k=4
    except ValueError as err:
        print(str(err))
    else:
        assert 0, "count should require k-mer size to be equal"
Example #25
0
def test_get_maxcount_rc():
    """Consuming the reverse complement must raise the forward max count."""
    table = khmer._CountingHash(4, [7])

    forward = "AAAAACGT"
    revcomp = "ACGTTTTT"  # reverse complement of `forward`

    # The max count is always queried on the forward sequence.
    for consumed, expected in ((forward, 2), (revcomp, 4)):
        table.consume(consumed)
        highest = table.get_max_count(forward)
        assert highest == expected, highest
def test_get_maxcount_rc():
    """get_max_count(s) must go from 2 to 4 once the rc of s is consumed."""
    counting = khmer._CountingHash(4, [7])

    seq = "AAAAACGT"
    seq_rc = "ACGTTTTT"

    counting.consume(seq)
    before = counting.get_max_count(seq)
    assert before == 2, before

    # Feed the reverse complement, then query the original sequence again.
    counting.consume(seq_rc)
    after = counting.get_max_count(seq)
    assert after == 4, after
Example #27
0
def test_load_gz():
    """Load a counting table from a file gzip-compressed by this test.

    The table is saved uncompressed, compressed with the gzip module, and
    loaded into a second table; both tables must then report the same
    abundance distribution.
    """
    inpath = utils.get_test_data('random-20-a.fa')

    savepath = utils.get_temp_filename('tempcountingsave1.ht')
    loadpath = utils.get_temp_filename('tempcountingsave1.ht.gz')

    sizes = list(PRIMES_1m)
    sizes.append(1000005)

    # save uncompressed hashtable.
    hi = khmer._CountingHash(12, sizes)
    hi.consume_fasta(inpath)
    hi.save(savepath)

    # compress; the context managers close both files even if the copy
    # raises part-way through.
    with open(savepath, 'rb') as in_file, \
            gzip.open(loadpath, 'wb') as out_file:
        out_file.writelines(in_file)

    # load compressed hashtable.
    ht = khmer._CountingHash(12, sizes)
    try:
        ht.load(loadpath)
    except OSError as err:
        assert 0, "Should not produce an OSError: " + str(err)

    tracking = khmer._Hashbits(12, sizes)
    x = hi.abundance_distribution(inpath, tracking)

    tracking = khmer._Hashbits(12, sizes)
    y = ht.abundance_distribution(inpath, tracking)

    assert sum(x) == 3966, sum(x)
    assert x == y, (x, y)
Example #28
0
    def do_test(ctfile):
        """Save a counting table to `ctfile` and reload it with khmer.

        The reloaded table must report the same n_occupied() value as the
        table it was saved from.
        """
        fasta_path = utils.get_test_data('random-20-a.fa')
        table_path = utils.get_temp_filename(ctfile)

        table_sizes = khmer.get_n_primes_near_x(1, 2 ** 31 + 1000)

        original = khmer._CountingHash(12, table_sizes)
        original.consume_fasta(fasta_path)
        original.save(table_path)

        reloaded = khmer.load_counting_hash(table_path)

        n_original = original.n_occupied()
        n_reloaded = reloaded.n_occupied()
        assert n_original == 3966, n_original
        assert n_reloaded == n_original, n_reloaded
Example #29
0
def test_maxcount():
    """Repeated count() calls must stop increasing at MAX_COUNT."""
    # hashtable should saturate at some point so as not to overflow counter
    table = khmer._CountingHash(4, [5])

    previous = None
    for _ in range(10000):
        table.count('AAAA')
        c = table.get('AAAA')

        print(previous, c)
        # Stop as soon as another count() no longer changes the value.
        if c == previous:
            break
        previous = c

    assert c != 10000, "should not be able to count to 10000"
    assert c == MAX_COUNT  # this will depend on HashcountType...
def test_maxcount():
    """Count 'AAAA' until the value stops changing; expect MAX_COUNT."""
    # hashtable should saturate at some point so as not to overflow counter
    counting = khmer._CountingHash(4, [5])

    seen_before = None
    for _ in range(10000):
        counting.count('AAAA')
        current = counting.get('AAAA')

        print(seen_before, current)
        if current == seen_before:
            # the counter did not move: it has saturated
            break
        seen_before = current

    assert current != 10000, "should not be able to count to 10000"
    # this will depend on HashcountType...
    assert current == MAX_COUNT
Example #31
0
def test_maxcount_with_bigcount():
    """With bigcount enabled, 10000 count() calls must reach exactly 10000."""
    # hashtable should not saturate, if use_bigcount is set.
    table = khmer._CountingHash(4, [5])
    table.set_use_bigcount(True)

    previous = None
    for _ in range(10000):
        table.count('AAAA')
        c = table.get('AAAA')

        print(previous, c)
        # A repeated value would mean the counter saturated; stop early.
        if c == previous:
            break
        previous = c

    assert c == 10000, "should be able to count to 10000"
    assert c != MAX_COUNT
def test_maxcount_with_bigcount():
    """Bigcount mode: the counter must keep rising through 10000 counts."""
    # hashtable should not saturate, if use_bigcount is set.
    counting = khmer._CountingHash(4, [5])
    counting.set_use_bigcount(True)

    seen_before = None
    for _ in range(10000):
        counting.count('AAAA')
        current = counting.get('AAAA')

        print(seen_before, current)
        if current == seen_before:
            # the counter did not move between two calls
            break
        seen_before = current

    assert current == 10000, "should be able to count to 10000"
    assert current != MAX_COUNT
def test_complete_2_collision():
    """Count one k-mer per table entry, then expect 128 hashes to be set."""
    table = khmer._CountingHash(4, [5])

    for index in range(table.n_entries()):
        table.count(khmer.reverse_hash(index, 4))

    # string hashing is rc aware, so lookups go through the k-mer string.
    n_rc_filled = sum(
        1 for index in range(128)
        if table.get(khmer.reverse_hash(index, 4)))

    assert n_rc_filled == 128, n_rc_filled
Example #34
0
def test_complete_2_collision():
    """Fill every entry of a 5-slot table and check 128 k-mer lookups hit."""
    counting = khmer._CountingHash(4, [5])

    for hashval in range(counting.n_entries()):
        kmer = khmer.reverse_hash(hashval, 4)
        counting.count(kmer)

    filled = 0
    for hashval in range(128):
        kmer = khmer.reverse_hash(hashval, 4)
        # string hashing is rc aware
        if counting.get(kmer):
            filled += 1

    assert filled == 128, filled
Example #35
0
def test_fakelump_load_stop_tags_trunc():
    """Loading a stop-tags file truncated to 10 bytes must raise IOError.

    Stop tags are generated by repartitioning the largest partition of
    fakelump.fa, saved, cut down to their first 10 bytes, and then loaded
    into a fresh table.
    """
    fakelump_fa = utils.get_test_data('fakelump.fa')
    fakelump_fa_foo = utils.get_temp_filename('fakelump.fa.stopfoo')

    ht = khmer.Hashbits(32, 1e5, 4)
    ht.consume_fasta_and_tag(fakelump_fa)

    subset = ht.do_subset_partition(0, 0)
    ht.merge_subset(subset)

    n_partitions, _ = ht.count_partitions()
    assert n_partitions == 1, n_partitions

    # now, break partitions on any k-mer that you see more than once
    # on big excursions, where big excursions are excursions 40 out
    # that encounter more than 82 k-mers.  This should specifically
    # identify our connected sequences in fakelump...

    EXCURSION_DISTANCE = 40
    EXCURSION_KMER_THRESHOLD = 82
    EXCURSION_KMER_COUNT_THRESHOLD = 1
    counting = khmer._CountingHash(32, [5, 7, 11, 13])

    ht.repartition_largest_partition(None, counting,
                                     EXCURSION_DISTANCE,
                                     EXCURSION_KMER_THRESHOLD,
                                     EXCURSION_KMER_COUNT_THRESHOLD)

    ht.save_stop_tags(fakelump_fa_foo)

    # Truncate the saved file in place; `with` guarantees both handles are
    # closed before the load below.
    with open(fakelump_fa_foo, 'rb') as fp:
        data = fp.read()

    with open(fakelump_fa_foo, 'wb') as fp:
        fp.write(data[:10])

    # ok, now try loading these stop tags; should fail.
    ht = khmer._Hashbits(32, [5, 7, 11, 13])
    ht.consume_fasta_and_tag(fakelump_fa)

    try:
        ht.load_stop_tags(fakelump_fa_foo)
    except IOError:
        pass
    else:
        assert 0, "this test should fail"
Example #36
0
def test_fakelump_load_stop_tags_trunc():
    """Loading a stop-tags file truncated to 10 bytes must raise OSError.

    Stop tags are generated by repartitioning the largest partition of
    fakelump.fa, saved, cut down to their first 10 bytes, and then loaded
    into a fresh table.
    """
    fakelump_fa = utils.get_test_data('fakelump.fa')
    fakelump_fa_foo = utils.get_temp_filename('fakelump.fa.stopfoo')

    ht = khmer.Hashbits(32, 1e5, 4)
    ht.consume_fasta_and_tag(fakelump_fa)

    subset = ht.do_subset_partition(0, 0)
    ht.merge_subset(subset)

    n_partitions, _ = ht.count_partitions()
    assert n_partitions == 1, n_partitions

    # now, break partitions on any k-mer that you see more than once
    # on big excursions, where big excursions are excursions 40 out
    # that encounter more than 82 k-mers.  This should specifically
    # identify our connected sequences in fakelump...

    EXCURSION_DISTANCE = 40
    EXCURSION_KMER_THRESHOLD = 82
    EXCURSION_KMER_COUNT_THRESHOLD = 1
    counting = khmer._CountingHash(32, [5, 7, 11, 13])

    ht.repartition_largest_partition(None, counting,
                                     EXCURSION_DISTANCE,
                                     EXCURSION_KMER_THRESHOLD,
                                     EXCURSION_KMER_COUNT_THRESHOLD)

    ht.save_stop_tags(fakelump_fa_foo)

    # Truncate the saved file in place; `with` guarantees both handles are
    # closed before the load below.
    with open(fakelump_fa_foo, 'rb') as fp:
        data = fp.read()

    with open(fakelump_fa_foo, 'wb') as fp:
        fp.write(data[:10])

    # ok, now try loading these stop tags; should fail.
    ht = khmer._Hashbits(32, [5, 7, 11, 13])
    ht.consume_fasta_and_tag(fakelump_fa)

    try:
        ht.load_stop_tags(fakelump_fa_foo)
    except OSError:
        pass
    else:
        assert 0, "this test should fail"
Example #37
0
def test_load_truncated():
    """Every proper prefix of a saved counting table must fail to load.

    The saved file is rewritten at each length from 0 up to (but not
    including) its full size, and load_counting_hash() is expected to raise
    OSError for each of those prefixes.
    """
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('save.ht')
    truncpath = utils.get_temp_filename('trunc.ht')

    sizes = khmer.get_n_primes_near_x(3, 200)

    hi = khmer._CountingHash(12, sizes)
    hi.consume_fasta(inpath)
    hi.save(savepath)

    with open(savepath, 'rb') as fp:
        data = fp.read()

    # `length` is a prefix length, not an index into `data`.
    for length in range(len(data)):
        with open(truncpath, 'wb') as fp:
            fp.write(data[:length])

        try:
            khmer.load_counting_hash(truncpath)
        except OSError as err:
            print(str(err))
        else:
            assert 0, "this should not be reached!"
Example #38
0
def test_load_gz_truncated_should_fail():
    """A '.gz'-named save file cut to its first 1000 bytes must not load."""
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('tempcountingsave0.ht.gz')

    hi = khmer.CountingHash(12, 1000, 2)
    hi.consume_fasta(inpath)
    hi.save(savepath)

    # Truncate the saved file in place; `with` closes each handle even if
    # the read or write raises.
    with open(savepath, 'rb') as fp:
        data = fp.read()

    with open(savepath, 'wb') as fp:
        fp.write(data[:1000])

    hi = khmer._CountingHash(12, [1])
    try:
        hi.load(savepath)
    except OSError as e:
        print(str(e))
    else:
        assert 0, "load should fail"
Example #39
0
def test_load_truncated_should_fail():
    """A counting-table file cut to its first 1000 bytes must not load."""
    inpath = utils.get_test_data("random-20-a.fa")
    savepath = utils.get_temp_filename("temphashbitssave0.ct")

    hi = khmer.CountingHash(12, 1000, 2)

    hi.consume_fasta(inpath)
    hi.save(savepath)

    # Truncate the saved file in place; `with` closes each handle even if
    # the read or write raises.
    with open(savepath, "rb") as fp:
        data = fp.read()

    with open(savepath, "wb") as fp:
        fp.write(data[:1000])

    hi = khmer._CountingHash(12, [1])
    try:
        hi.load(savepath)
    except OSError as e:
        print(str(e))
    else:
        assert 0, "load should fail"
Example #40
0
def test_complete_no_collision():
    """Fill a 4**4-entry table with every 4-mer and check the lookup tallies.

    String lookups are expected to hit for every hash value, exactly 16
    k-mers are expected to have a count of 1, and integer lookups are
    expected to hit for half the entries plus half the count-1 k-mers.
    """
    table = khmer._CountingHash(4, [4**4])

    for hashval in range(table.n_entries()):
        table.count(khmer.reverse_hash(hashval, 4))

    palindromes = 0
    rc_hits = 0
    fwd_hits = 0

    for hashval in range(table.n_entries()):
        kmer = khmer.reverse_hash(hashval, 4)
        if table.get(kmer):  # string hashing is rc aware
            rc_hits += 1
        if table.get(kmer) == 1:  # palindromes are singular
            palindromes += 1
        if table.get(hashval):  # int hashing is not rc aware
            fwd_hits += 1

    assert rc_hits == table.n_entries(), rc_hits
    assert palindromes == 16, palindromes
    assert fwd_hits == table.n_entries() // 2 + palindromes // 2, \
        fwd_hits
def test_complete_no_collision():
    """Count every 4-mer once in a 4 ** 4 table and tally the lookups."""
    counting = khmer._CountingHash(4, [4 ** 4])

    for index in range(counting.n_entries()):
        kmer = khmer.reverse_hash(index, 4)
        counting.count(kmer)

    singles = 0         # k-mers whose count is exactly 1
    by_string = 0       # hits when looking up the k-mer string
    by_int = 0          # hits when looking up the integer hash

    for index in range(counting.n_entries()):
        kmer = khmer.reverse_hash(index, 4)
        # string hashing is rc aware
        if counting.get(kmer):
            by_string += 1
        # palindromes are singular
        if counting.get(kmer) == 1:
            singles += 1
        # int hashing is not rc aware
        if counting.get(index):
            by_int += 1

    assert by_string == counting.n_entries(), by_string
    assert singles == 16, singles
    assert by_int == counting.n_entries() // 2 + singles // 2, \
        by_int
Example #42
0
 def setup(self):
     """Create the k=12 counting table, sized by PRIMES_1m, as self.hi."""
     table = khmer._CountingHash(12, PRIMES_1m)
     self.hi = table
 def setup(self):
     """Create a k=4 counting table with one 4 ** 4 entry table as self.kh."""
     table = khmer._CountingHash(4, [4 ** 4])
     self.kh = table
Example #44
0
 def setup(self):
     """Build self.kh (k=4, table size 5) and load 'all-A.fa' into it."""
     fasta_path = utils.get_test_data('all-A.fa')
     table = khmer._CountingHash(4, [5])
     self.kh = table
     table.consume_fasta(fasta_path)
 def setup(self):
     """Create self.kh and fill it from the 'all-A.fa' test data file."""
     self.kh = khmer._CountingHash(4, [5])
     self.kh.consume_fasta(utils.get_test_data('all-A.fa'))
Example #46
0
 def setup(self):
     """Create the k=4 counting table with 4**4 entries used as self.kh."""
     n_kmers = 4**4
     self.kh = khmer._CountingHash(4, [n_kmers])
Example #47
0
def test_counting_bad_primes_list():
    """Constructing _CountingHash with string table sizes must be a TypeError."""
    bad_sizes = ["a", "b", "c"]
    try:
        khmer._CountingHash(12, bad_sizes, 1)
    except TypeError as e:
        print(str(e))
    else:
        assert 0, "bad list of primes should fail"