def test_save_load_gz():
    """Round-trip a countgraph through a gzipped save file.

    The restored graph must produce the same abundance distribution
    as the original.
    """
    infile = utils.get_test_data('random-20-a.fa')
    graphfile = utils.get_temp_filename('tempcountingsave2.ht.gz')

    table_sizes = list(PRIMES_1m)
    table_sizes.append(1000005)

    original = khmer._Countgraph(12, table_sizes)
    original.consume_fasta(infile)
    original.save(graphfile)

    restored = khmer._Countgraph(12, table_sizes)
    try:
        restored.load(graphfile)
    except OSError as err:
        assert 0, 'Should not produce an OSError: ' + str(err)

    tracking = khmer._Nodegraph(12, table_sizes)
    dist_before = original.abundance_distribution(infile, tracking)
    tracking = khmer._Nodegraph(12, table_sizes)
    dist_after = restored.abundance_distribution(infile, tracking)

    assert sum(dist_before) == 3966, sum(dist_before)
    assert dist_before == dist_after, (dist_before, dist_after)
def test_3_tables():
    """Check counting behaviour when k-mers collide in 1, 2, or all 3 tables."""
    sizes = list(PRIMES_1m)
    sizes.append(1000005)

    hi = khmer._Countgraph(12, sizes)

    GG = 'G' * 12                   # forward_hash: 11184810
    assert khmer.forward_hash(GG, 12) == 11184810

    collision_1 = 'AAACGTATGACT'
    assert khmer.forward_hash(collision_1, 12) == 184777

    collision_2 = 'AAATACCGAGCG'
    assert khmer.forward_hash(collision_2, 12) == 76603

    collision_3 = 'AAACGTATCGAG'
    assert khmer.forward_hash(collision_3, 12) == 184755

    # hash(GG) % 1000003 == hash(collision_1)
    # hash(GG) % 1009837 == hash(collision_2)
    # hash(GG) % 1000005 == hash(collision_3)
    hi.consume(GG)
    assert hi.get(GG) == 1

    # colliding in only one table does not raise GG's reported count
    hi.consume(collision_1)
    assert hi.get(GG) == 1

    hi.consume(collision_2)
    assert hi.get(GG) == 1

    # colliding in every table makes GG appear counted twice
    hi.consume(collision_3)
    assert hi.get(GG) == 2
def test_load_gz():
    """A countgraph saved uncompressed must load from a gzipped copy."""
    infile = utils.get_test_data('random-20-a.fa')
    rawpath = utils.get_temp_filename('tempcountingsave1.ht')
    gzpath = utils.get_temp_filename('tempcountingsave1.ht.gz')

    sizes = list(PRIMES_1m)
    sizes.append(1000005)

    # save uncompressed hashtable.
    hi = khmer._Countgraph(12, sizes)
    hi.consume_seqfile(infile)
    hi.save(rawpath)

    # compress the saved file.
    with open(rawpath, 'rb') as in_file:
        with gzip.open(gzpath, 'wb') as out_file:
            out_file.writelines(in_file)

    # load compressed hashtable.
    try:
        ht = khmer.load_countgraph(gzpath)
    except OSError as err:
        assert 0, "Should not produce an OSError: " + str(err)

    tracking = khmer._Nodegraph(12, sizes)
    x = hi.abundance_distribution(infile, tracking)
    tracking = khmer._Nodegraph(12, sizes)
    y = ht.abundance_distribution(infile, tracking)

    assert sum(x) == 3966, sum(x)
    assert x == y, (x, y)
def test_load_gz():
    """Saving uncompressed then gzipping must yield a loadable countgraph."""
    source_fa = utils.get_test_data('random-20-a.fa')
    plain_path = utils.get_temp_filename('tempcountingsave1.ht')
    gz_path = utils.get_temp_filename('tempcountingsave1.ht.gz')

    table_sizes = list(PRIMES_1m)
    table_sizes.append(1000005)

    # save uncompressed hashtable.
    graph = khmer._Countgraph(12, table_sizes)
    graph.consume_fasta(source_fa)
    graph.save(plain_path)

    # compress.
    with open(plain_path, 'rb') as raw, gzip.open(gz_path, 'wb') as packed:
        packed.writelines(raw)

    # load compressed hashtable.
    try:
        loaded = khmer.load_countgraph(gz_path)
    except OSError as err:
        assert 0, "Should not produce an OSError: " + str(err)

    tracking = khmer._Nodegraph(12, table_sizes)
    dist_saved = graph.abundance_distribution(source_fa, tracking)
    tracking = khmer._Nodegraph(12, table_sizes)
    dist_loaded = loaded.abundance_distribution(source_fa, tracking)

    assert sum(dist_saved) == 3966, sum(dist_saved)
    assert dist_saved == dist_loaded, (dist_saved, dist_loaded)
def test_complete_no_collision():
    """Fill a 4**4-slot table with every 4-mer and check occupancy counts."""
    kh = khmer._Countgraph(4, [4 ** 4])
    n_entries = kh.hashsizes()[0]

    for idx in range(n_entries):
        kh.count(khmer.reverse_hash(idx, 4))

    n_palindromes = 0
    n_rc_filled = 0
    n_fwd_filled = 0

    for idx in range(n_entries):
        kmer = khmer.reverse_hash(idx, 4)
        if kh.get(kmer):                # string hashing is rc aware
            n_rc_filled += 1
        if kh.get(kmer) == 1:           # palindromes are singular
            n_palindromes += 1
        if kh.get(idx):                 # int hashing is not rc aware
            n_fwd_filled += 1

    assert n_rc_filled == n_entries, n_rc_filled
    assert n_palindromes == 16, n_palindromes
    expected_fwd = n_entries // 2 + n_palindromes // 2
    assert n_fwd_filled == expected_fwd, (n_fwd_filled, expected_fwd)
def test_complete_no_collision():
    """Every 4-mer counted once: verify rc-aware vs raw-hash occupancy."""
    graph = khmer._Countgraph(4, [4**4])
    total_slots = graph.hashsizes()[0]

    for slot in range(total_slots):
        graph.count(khmer.reverse_hash(slot, 4))

    palindromes = 0
    rc_filled = 0
    fwd_filled = 0

    for slot in range(total_slots):
        seq = khmer.reverse_hash(slot, 4)
        count = graph.get(seq)
        if count:                       # string hashing is rc aware
            rc_filled += 1
        if count == 1:                  # palindromes are singular
            palindromes += 1
        if graph.get(slot):             # int hashing is not rc aware
            fwd_filled += 1

    assert rc_filled == total_slots, rc_filled
    assert palindromes == 16, palindromes
    assert fwd_filled == total_slots // 2 + palindromes // 2, \
        (fwd_filled, total_slots // 2 + palindromes // 2)
def test_collision():
    """AAAA and its reverse complement TTTT share a count."""
    kh = khmer._Countgraph(4, [5])

    kh.count('AAAA')
    assert kh.get('AAAA') == 1

    kh.count('TTTT')
    assert kh.get('TTTT') == 2
def test_collision():
    """Counting a k-mer and its reverse complement increments one bin."""
    graph = khmer._Countgraph(4, [5])

    graph.count("AAAA")
    assert graph.get("AAAA") == 1

    graph.count("TTTT")
    assert graph.get("TTTT") == 2
def test_maxcount_consume():
    """Consuming a long homopolymer must saturate, not overflow, the counter."""
    # hashtable should saturate at some point so as not to overflow counter
    kh = khmer._Countgraph(4, [5])
    kh.consume("A" * 10000)

    count = kh.get('AAAA')
    assert count == MAX_COUNT, count  # this will depend on HashcountType...
def test_consume_uniqify_first():
    """A consumed k-mer is retrievable via its reverse complement."""
    kh = khmer._Countgraph(4, [5])

    forward = "TTTT"
    revcomp = "AAAA"

    kh.consume(forward)
    assert kh.get(revcomp) == 1
def test_load_notexist_should_fail():
    """Loading from a path that does not exist must raise OSError."""
    missing_path = utils.get_temp_filename('tempnodegraphsave0.htable')

    hi = khmer._Countgraph(12, [1])
    try:
        hi.load(missing_path)
        assert 0, "load should fail"
    except OSError:
        pass
def test_maxcount_consume_with_bigcount():
    """With bigcount enabled, counts are not clamped at the byte maximum."""
    # use the bigcount hack to avoid saturating the hashtable.
    kh = khmer._Countgraph(4, [5])
    kh.set_use_bigcount(True)

    kh.consume("A" * 10000)

    # a 10000-base read contains 10000 - 3 overlapping 4-mers
    count = kh.get('AAAA')
    assert count == 10000 - 3, count
def test_nodegraph_file_type_check():
    """load_nodegraph must reject a file saved by a countgraph."""
    kh = khmer._Countgraph(12, [1])
    savepath = utils.get_temp_filename('tempcountingsave0.ct')
    kh.save(savepath)

    try:
        khmer.load_nodegraph(savepath)
        assert 0, "this should fail"
    except OSError as e:
        print(str(e))
def test_get_mincount():
    """get_min_count reports the smallest count across a sequence's k-mers."""
    kh = khmer._Countgraph(4, [5])
    seq = "AAAAACGT"

    kh.consume(seq)
    lowest = kh.get_min_count(seq)
    assert lowest == 1, lowest

    kh.consume(seq)
    lowest = kh.get_min_count(seq)
    assert lowest == 2, lowest
def test_badcount():
    """count() must validate both its argument count and the k-mer length."""
    countgraph = khmer._Countgraph(4, [5])

    # missing argument
    try:
        countgraph.count()
        assert 0, "count should require one argument"
    except TypeError as err:
        print(str(err))

    # wrong k-mer length (5 != 4)
    try:
        countgraph.count('ABCDE')
        assert 0, "count should require k-mer size to be equal"
    except ValueError as err:
        print(str(err))
def test_get_maxcount():
    """get_max_count reports the largest count across a sequence's k-mers."""
    kh = khmer._Countgraph(4, [7])
    seq = "AAAAACGT"

    kh.consume(seq)
    assert kh.get_max_count(seq) == 2

    kh.consume(seq)
    assert kh.get_max_count(seq) == 4
def test_get_maxcount_rc():
    """Consuming the reverse complement raises the forward sequence's max."""
    kh = khmer._Countgraph(4, [7])
    fwd = "AAAAACGT"
    rc = "ACGTTTTT"

    kh.consume(fwd)
    highest = kh.get_max_count(fwd)
    assert highest == 2, highest

    kh.consume(rc)
    highest = kh.get_max_count(fwd)
    assert highest == 4, highest
def test_count_2():
    """Counting via an integer hash is equivalent to counting the string."""
    hi = khmer._Countgraph(12, PRIMES_1m)
    kmer = 'G' * 12
    hashval = hi.hash('G' * 12)

    assert hi.get(kmer) == 0
    assert hi.get(hashval) == 0

    hi.count(kmer)
    assert hi.get(kmer) == 1
    assert hi.get(hashval) == 1

    hi.count(hashval)  # count hashes same as strings
    assert hi.get(kmer) == 2
    assert hi.get(hashval) == 2
def test_maxcount():
    """Without bigcount, repeated counting saturates at MAX_COUNT."""
    # hashtable should saturate at some point so as not to overflow counter
    kh = khmer._Countgraph(4, [5])

    last_count = None
    for _ in range(10000):
        kh.count('AAAA')
        c = kh.get('AAAA')
        print(last_count, c)
        if c == last_count:
            break  # count has stopped growing: saturated
        last_count = c

    assert c != 10000, "should not be able to count to 10000"
    assert c == MAX_COUNT  # this will depend on HashcountType...
def test_save_load_large(ctfile):
    """Save/load round trip for a countgraph with very large tables."""
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename(ctfile)
    sizes = khmer.get_n_primes_near_x(1, 2 ** 31 + 1000)

    orig = khmer._Countgraph(12, sizes)
    orig.consume_seqfile(inpath)
    orig.save(savepath)

    loaded = khmer.load_countgraph(savepath)

    occupied_before = orig.n_occupied()
    occupied_after = loaded.n_occupied()

    assert occupied_before == 3966, occupied_before
    assert occupied_after == occupied_before, occupied_after
def do_test(ctfile):
    """Helper: save/load round trip into *ctfile*, checking occupancy."""
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename(ctfile)
    sizes = khmer.get_n_primes_near_x(1, 2 ** 31 + 1000)

    orig = khmer._Countgraph(12, sizes)
    orig.consume_fasta(inpath)
    orig.save(savepath)

    loaded = khmer.load_countgraph(savepath)

    count_saved = orig.n_occupied()
    count_loaded = loaded.n_occupied()

    assert count_saved == 3966, count_saved
    assert count_loaded == count_saved, count_loaded
def test_maxcount_with_bigcount():
    """With bigcount enabled, counts keep growing past MAX_COUNT."""
    # hashtable should not saturate, if use_bigcount is set.
    kh = khmer._Countgraph(4, [5])
    kh.set_use_bigcount(True)

    last_count = None
    for _ in range(10000):
        kh.count('AAAA')
        c = kh.get('AAAA')
        print(last_count, c)
        if c == last_count:
            break  # unexpected saturation — bail out early
        last_count = c

    assert c == 10000, "should be able to count to 10000"
    assert c != MAX_COUNT
def test_fakelump_load_stop_tags_trunc():
    """Loading a truncated stop-tags file must raise OSError.

    Builds a single-partition graph from fakelump.fa, repartitions it to
    produce stop tags, truncates the saved tags file, then verifies that
    loading the truncated file fails.
    """
    fakelump_fa = utils.get_test_data('fakelump.fa')
    fakelump_fa_foo = utils.get_temp_filename('fakelump.fa.stopfoo')

    ht = khmer.Nodegraph(32, 1e5, 4)
    ht.consume_fasta_and_tag(fakelump_fa)

    subset = ht.do_subset_partition(0, 0)
    ht.merge_subset(subset)

    (n_partitions, _) = ht.count_partitions()  # singleton count unused
    assert n_partitions == 1, n_partitions

    # now, break partitions on any k-mer that you see more than once
    # on big excursions, where big excursions are excursions 40 out
    # that encounter more than 82 k-mers.  This should specifically
    # identify our connected sequences in fakelump...
    EXCURSION_DISTANCE = 40
    EXCURSION_KMER_THRESHOLD = 82
    EXCURSION_KMER_COUNT_THRESHOLD = 1
    counting = khmer._Countgraph(32, [5, 7, 11, 13])

    ht.repartition_largest_partition(None, counting, EXCURSION_DISTANCE,
                                     EXCURSION_KMER_THRESHOLD,
                                     EXCURSION_KMER_COUNT_THRESHOLD)

    ht.save_stop_tags(fakelump_fa_foo)

    # truncate the saved stop tags to the first 10 bytes; context managers
    # close the handles (the original left them to the garbage collector).
    with open(fakelump_fa_foo, 'rb') as fp:
        data = fp.read()
    with open(fakelump_fa_foo, 'wb') as fp:
        fp.write(data[:10])

    # ok, now try loading these stop tags; should fail.
    ht = khmer._Nodegraph(32, [5, 7, 11, 13])
    ht.consume_fasta_and_tag(fakelump_fa)

    try:
        ht.load_stop_tags(fakelump_fa_foo)
        assert 0, "this test should fail"
    except OSError:
        pass
def test_fakelump_load_stop_tags_trunc():
    """A truncated stop-tags file must fail to load with OSError.

    Partitions fakelump.fa into a single partition, repartitions to
    generate stop tags, truncates the tags file on disk, and checks
    that load_stop_tags rejects it.
    """
    fakelump_fa = utils.get_test_data('fakelump.fa')
    fakelump_fa_foo = utils.get_temp_filename('fakelump.fa.stopfoo')

    ht = khmer.Nodegraph(32, 1e5, 4)
    ht.consume_fasta_and_tag(fakelump_fa)

    subset = ht.do_subset_partition(0, 0)
    ht.merge_subset(subset)

    (n_partitions, _) = ht.count_partitions()
    assert n_partitions == 1, n_partitions

    # now, break partitions on any k-mer that you see more than once
    # on big excursions, where big excursions are excursions 40 out
    # that encounter more than 82 k-mers.  This should specifically
    # identify our connected sequences in fakelump...
    EXCURSION_DISTANCE = 40
    EXCURSION_KMER_THRESHOLD = 82
    EXCURSION_KMER_COUNT_THRESHOLD = 1
    counting = khmer._Countgraph(32, [5, 7, 11, 13])

    ht.repartition_largest_partition(None, counting, EXCURSION_DISTANCE,
                                     EXCURSION_KMER_THRESHOLD,
                                     EXCURSION_KMER_COUNT_THRESHOLD)

    ht.save_stop_tags(fakelump_fa_foo)

    # truncate to the first 10 bytes, closing handles deterministically
    # (the original's read handle was never closed).
    with open(fakelump_fa_foo, 'rb') as fp:
        data = fp.read()
    with open(fakelump_fa_foo, 'wb') as fp:
        fp.write(data[:10])

    # ok, now try loading these stop tags; should fail.
    ht = khmer._Nodegraph(32, [5, 7, 11, 13])
    ht.consume_fasta_and_tag(fakelump_fa)

    try:
        ht.load_stop_tags(fakelump_fa_foo)
        assert 0, "this test should fail"
    except OSError:
        pass
def test_complete_2_collision():
    """After filling a tiny table, the first 128 4-mers all read as present."""
    kh = khmer._Countgraph(4, [5])
    n_entries = kh.hashsizes()[0]

    for idx in range(n_entries):
        kh.count(khmer.reverse_hash(idx, 4))

    n_rc_filled = 0
    for idx in range(128):
        kmer = khmer.reverse_hash(idx, 4)
        if kh.get(kmer):  # string hashing is rc aware
            n_rc_filled += 1

    assert n_rc_filled == 128, n_rc_filled
def test_count_1():
    """Counting a 12-mer updates both string and integer-hash lookups."""
    hi = khmer._Countgraph(12, PRIMES_1m)
    kmer = 'G' * 12
    hashval = hi.hash('G' * 12)

    assert hi.get(kmer) == 0
    assert hi.get(hashval) == 0

    hi.count(kmer)
    assert hi.get(kmer) == 1
    assert hi.get(hashval) == 1

    hi.count(kmer)
    assert hi.get(kmer) == 2
    assert hi.get(hashval) == 2

    # hashing a k-mer of the wrong length must fail
    with pytest.raises(ValueError):
        hi.hash('G' * 11)
def test_load_gz_truncated_should_fail():
    """Loading a truncated gzipped countgraph file must raise OSError."""
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('tempcountingsave0.ht.gz')

    hi = khmer.Countgraph(12, 1000, 2)
    hi.consume_fasta(inpath)
    hi.save(savepath)

    # truncate the saved file to its first 1000 bytes
    with open(savepath, 'rb') as fp:
        data = fp.read()
    with open(savepath, 'wb') as fp:
        fp.write(data[:1000])

    hi = khmer._Countgraph(12, [1])
    try:
        hi.load(savepath)
        assert 0, "load should fail"
    except OSError as e:
        print(str(e))
def test_load_truncated():
    """Every strict prefix of a saved countgraph must fail to load.

    Saves a small countgraph, then for each truncation length writes
    the prefix to a scratch file and verifies load_countgraph raises
    OSError.
    """
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('save.ht')
    truncpath = utils.get_temp_filename('trunc.ht')
    sizes = khmer.get_n_primes_near_x(3, 200)

    hi = khmer._Countgraph(12, sizes)
    hi.consume_fasta(inpath)
    hi.save(savepath)

    # read the saved bytes; close the handle (original leaked it)
    with open(savepath, 'rb') as fp:
        data = fp.read()

    for i in range(len(data)):
        with open(truncpath, 'wb') as fp:
            fp.write(data[:i])

        try:
            khmer.load_countgraph(truncpath)
            assert 0, "this should not be reached!"
        except OSError as err:
            print(str(err))
def test_load_truncated():
    """Truncated countgraph files of every length must fail to load.

    Saves a small countgraph, then writes each strict prefix to a
    scratch file and verifies load_countgraph raises OSError.
    """
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('save.ht')
    truncpath = utils.get_temp_filename('trunc.ht')
    sizes = khmer.get_n_primes_near_x(3, 200)

    hi = khmer._Countgraph(12, sizes)
    hi.consume_seqfile(inpath)
    hi.save(savepath)

    # read the saved bytes; close the handle (original leaked it)
    with open(savepath, 'rb') as fp:
        data = fp.read()

    for i in range(len(data)):
        with open(truncpath, 'wb') as fp:
            fp.write(data[:i])

        try:
            khmer.load_countgraph(truncpath)
            assert 0, "this should not be reached!"
        except OSError as err:
            print(str(err))
def test_count_1():
    """String counting and integer-hash lookups stay in sync."""
    hi = khmer._Countgraph(12, PRIMES_1m)
    kmer = 'G' * 12
    hashval = hi.hash('G' * 12)

    assert hi.get(kmer) == 0
    assert hi.get(hashval) == 0

    hi.count(kmer)
    assert hi.get(kmer) == 1
    assert hi.get(hashval) == 1

    hi.count(kmer)
    assert hi.get(kmer) == 2
    assert hi.get(hashval) == 2

    # hashing a k-mer of the wrong length must fail
    try:
        hi.hash('G' * 11)
        assert 0, "incorrect kmer size should fail"
    except RuntimeError:
        pass
def setup(self):
    """Create a 4-mer countgraph with one 4**4-slot table (one per 4-mer)."""
    self.kh = khmer._Countgraph(4, [4 ** 4])
def setup(self):
    """Create a 4-mer countgraph with one 4**4-slot table (one per 4-mer)."""
    self.kh = khmer._Countgraph(4, [4**4])
def test_counting_bad_primes_list():
    """Non-numeric table sizes must be rejected with TypeError."""
    try:
        khmer._Countgraph(12, ["a", "b", "c"], 1)
        assert 0, "bad list of primes should fail"
    except TypeError as e:
        print(str(e))
def test_bad_create():
    """An empty tablesizes list must be rejected with ValueError.

    The original test silently passed when no exception was raised;
    the assert 0 after the constructor makes that case a failure.
    """
    try:
        khmer._Countgraph(5, [])
        assert 0, "empty tablesizes should fail"
    except ValueError as err:
        assert 'tablesizes needs to be one or more numbers' in str(err)
def setup(self):
    """Create a fresh 12-mer countgraph using the ~1m-scale prime sizes."""
    self.hi = khmer._Countgraph(12, PRIMES_1m)
def test_revhash_1():
    """reverse_hash inverts hash for a known 12-mer."""
    hi = khmer._Countgraph(12, [1])
    kmer = 'C' * 12

    hashed = hi.hash('C' * 12)
    assert hi.reverse_hash(hashed) == kmer
def setup(self):
    """Build a 4-mer countgraph pre-loaded with the all-A test sequence."""
    self.kh = khmer._Countgraph(4, [5])
    A_filename = utils.get_test_data('all-A.fa')
    self.kh.consume_fasta(A_filename)