def test_simple_kadian_2():
    """Kadian count must react to single-base variants of a repeated read."""
    ht = khmer.CountingHash(6, 1e6, 2)
    ht.consume("ACTGCTATCTCTAGAGCTATG")
    assert ht.get_kadian_count("ACTGCTATCTCTAGAGCTATG") == 1

    ht = khmer.CountingHash(6, 1e6, 2)
    ht.consume("ACTGCTATCTCTAGAGCTATG")
    # variant read with one substitution: ACaGCTATCTCTAGAGCTATG
    ht.consume("ACAGCTATCTCTAGAGCTATG")
    observed = ht.get_kadian_count("ACTGCTATCTCTAGAGCTATG")
    assert observed == 2, observed

    ht = khmer.CountingHash(6, 1e6, 2)
    ht.consume("ACTGCTATCTCTAGAGCTATG")
    # variant read with two substitutions: ACaGCTATCTCTAGAcCTATG
    ht.consume("ACAGCTATCTCTAGACCTATG")
    observed = ht.get_kadian_count("ACTGCTATCTCTAGAGCTATG")
    assert observed == 1, observed

    ht = khmer.CountingHash(6, 1e6, 2)
    ht.consume("ACTGCTATCTCTAGAGCTATG")
    # variant read with one mid-read substitution: ACTGCTATCgCTAGAGCTATG
    ht.consume("ACTGCTATCGCTAGAGCTATG")
    observed = ht.get_kadian_count("ACTGCTATCTCTAGAGCTATG")
    assert observed == 2, observed
def test_save_load_gz():
    """Saving to a .gz path and reloading must reproduce the abundance distribution."""
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('tempcountingsave2.ht.gz')

    sizes = list(PRIMES_1m)
    sizes.append(1000005)

    original = khmer.CountingHash(12, sizes)
    original.consume_fasta(inpath)
    original.save(savepath)

    reloaded = khmer.CountingHash(12, sizes)
    try:
        reloaded.load(savepath)
    except IOError as err:
        assert 0, 'Should not produce an IOError: ' + str(err)

    tracking = khmer._Hashbits(12, sizes)
    dist_before = original.abundance_distribution(inpath, tracking)
    tracking = khmer._Hashbits(12, sizes)
    dist_after = reloaded.abundance_distribution(inpath, tracking)

    assert sum(dist_before) == 3966, sum(dist_before)
    assert dist_before == dist_after, (dist_before, dist_after)
def test_load_gz():
    """A table saved uncompressed and then gzipped externally must load identically.

    BUG FIX: the compression step opened both file handles manually and only
    closed them on the success path; use context managers so the handles are
    released even if writelines() raises.
    """
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('tempcountingsave1.ht')
    loadpath = utils.get_temp_filename('tempcountingsave1.ht.gz')

    sizes = list(PRIMES_1m)
    sizes.append(1000005)

    # save uncompressed hashtable.
    hi = khmer.CountingHash(12, sizes)
    hi.consume_fasta(inpath)
    hi.save(savepath)

    # compress.
    with open(savepath, 'rb') as in_file:
        with gzip.open(loadpath, 'wb') as out_file:
            out_file.writelines(in_file)

    # load compressed hashtable.
    ht = khmer.CountingHash(12, sizes)
    ht.load(loadpath)

    tracking = khmer._Hashbits(12, sizes)
    x = hi.abundance_distribution(inpath, tracking)
    tracking = khmer._Hashbits(12, sizes)
    y = ht.abundance_distribution(inpath, tracking)

    assert sum(x) == 3966, sum(x)
    assert x == y, (x, y)
def test_2_kadian():
    """get_kadian_count with nk=2 across reads carrying increasing substitutions."""
    ht = khmer.CountingHash(6, 1e6, 2)
    ht.consume("ACTGCTATCTCTAGAGCTATG")
    assert ht.get_kadian_count("ACTGCTATCTCTAGAGCTATG", 2) == 1

    ht = khmer.CountingHash(6, 1e6, 2)
    ht.consume("ACTGCTATCTCTAGAGCTATG")
    # variant read: ACTGCTATCTCTAGAcCTATG
    ht.consume("ACTGCTATCTCTAGACCTATG")
    observed = ht.get_kadian_count("ACTGCTATCTCTAGAGCTATG", 2)
    assert observed == 2, observed

    ht = khmer.CountingHash(6, 1e6, 2)
    ht.consume("ACTGCTATCTCTAGAGCTATG")
    # variant read: ACTGCTATCTCTAGAcCTAtG
    ht.consume("ACTGCTATCTCTAGACCTATG")
    assert ht.get_kadian_count("ACTGCTATCTCTAGAGCTATG", 2) == 2

    ht = khmer.CountingHash(6, 1e6, 2)
    ht.consume("ACTGCTATCTCTAGAGCTATG")
    # variant read: ACTGCTATCTCTACtcCTAtG
    ht.consume("ACTGCTATCTCTACTCCTATG")
    observed = ht.get_kadian_count("ACTGCTATCTCTAGAGCTATG", 2)
    assert observed == 2, observed

    ht = khmer.CountingHash(6, 1e6, 2)
    ht.consume("ACTGCTATCTCTAGAGCTATG")
    # variant read: ACTGCTgTCTCTACtcCTAtG
    ht.consume("ACTGCTGTCTCTACTCCTATG")
    observed = ht.get_kadian_count("ACTGCTATCTCTAGAGCTATG", 2)
    assert observed == 1, observed
def test_very_short_read():
    """A read shorter than k yields zero k-mers but still counts as one read."""
    short_filename = utils.get_test_data('test-short.fa')

    # k = 9: too large for the read, so no k-mers are extracted.
    counter = khmer.CountingHash(9, 4, 1)
    n_reads, n_kmers = counter.consume_fasta(short_filename)
    assert n_reads == 1, n_reads
    assert n_kmers == 0, n_kmers

    # k = 8: exactly one k-mer fits.
    counter = khmer.CountingHash(8, 4, 1)
    n_reads, n_kmers = counter.consume_fasta(short_filename)
    assert n_reads == 1, n_reads
    assert n_kmers == 1, n_kmers
def test_get_kmer_hashes():
    """get_kmer_hashes returns per-k-mer hashes whose counts track consume()."""
    ht = khmer.CountingHash(6, 1e6, 2)

    # "AAAAAA" is a single 6-mer; its count grows with each consume.
    for expected in (1, 2):
        ht.consume("AAAAAA")
        hashes = ht.get_kmer_hashes("AAAAAA")
        print(hashes)
        assert len(hashes) == 1
        assert ht.get(hashes[0]) == expected

    # "AAAAAAT" contains two 6-mers; only the second one gains counts here.
    for expected in (1, 2, 3):
        ht.consume("AAAAAT")
        hashes = ht.get_kmer_hashes("AAAAAAT")
        print(hashes)
        assert len(hashes) == 2
        assert ht.get(hashes[0]) == 2
        assert ht.get(hashes[1]) == expected
def test_consume_absentfasta():
    """consume_fasta on a nonexistent file must raise OSError."""
    table = khmer.CountingHash(4, 4 ** 4, 4)
    try:
        table.consume_fasta("absent_file.fa")
    except OSError as err:
        print(str(err))
    else:
        assert 0, "This should fail"
def test_badconsume_and_tag():
    """consume_and_tag without a sequence argument must raise TypeError."""
    table = khmer.CountingHash(4, 4 ** 4, 4)
    try:
        table.consume_and_tag()
    except TypeError as err:
        print(str(err))
    else:
        assert 0, "this should fail"
def test_abund(self):
    """End-to-end consume_fasta + output_fasta_kmer_pos_freq, plus bad-argument errors.

    BUG FIX: the output file was opened without a context manager, leaking the
    handle whenever one of the assertions below failed; use `with` instead.
    """
    ht = khmer.CountingHash(10, 4 ** 10, 1)

    filename = utils.get_test_data('test-abund-read.fa')
    outname = utils.get_temp_filename('test_abund.out')

    ht.consume_fasta(filename)

    # missing argument -> TypeError
    try:
        ht.consume_fasta()
        assert 0, "should fail"
    except TypeError as err:
        print(str(err))

    # nonexistent input -> OSError
    try:
        ht.consume_fasta("nonexistent")
        assert 0, "should fail"
    except OSError as err:
        print(str(err))

    ht.output_fasta_kmer_pos_freq(filename, outname)

    # missing arguments -> TypeError
    try:
        ht.output_fasta_kmer_pos_freq()
        assert 0, "should fail"
    except TypeError as err:
        print(str(err))

    with open(outname, "r") as fd:
        output = fd.readlines()
    assert len(output) == 1

    output = output[0]
    output = output.strip().split()

    # one frequency per k-mer position: 114 - 10 + 1 positions, each count 1
    # (assumes the test read is 114 bases with k=10 — matches the fixture)
    assert ['1'] * (114 - 10 + 1) == output
def test_3_tables():
    """A k-mer's reported count is the minimum across all three hash tables."""
    sizes = list(PRIMES_1m)
    sizes.append(1000005)
    counter = khmer.CountingHash(12, sizes)

    poly_g = 'G' * 12
    # forward_hash: 11184810
    assert khmer.forward_hash(poly_g, 12) == 11184810

    collision_1 = 'AAACGTATGACT'
    assert khmer.forward_hash(collision_1, 12) == 184777
    collision_2 = 'AAATACCGAGCG'
    assert khmer.forward_hash(collision_2, 12) == 76603
    collision_3 = 'AAACGTATCGAG'
    assert khmer.forward_hash(collision_3, 12) == 184755

    # hash(GG) % 1000003 == hash(collision_1)
    # hash(GG) % 1009837 == hash(collision_2)
    # hash(GG) % 1000005 == hash(collision_3)

    counter.consume(poly_g)
    assert counter.get(poly_g) == 1

    counter.consume(collision_1)
    assert counter.get(poly_g) == 1

    counter.consume(collision_2)
    assert counter.get(poly_g) == 1

    counter.consume(collision_3)
    assert counter.get(poly_g) == 2
def test_get_raw_tables():
    """get_raw_tables exposes one memoryview per table, sized per hashsizes()."""
    counter = khmer.CountingHash(20, 1e5, 4)
    raw_views = counter.get_raw_tables()
    for expected_size, view in zip(counter.hashsizes(), raw_views):
        assert isinstance(view, memoryview)
        assert expected_size == len(view)
def test_find_spectral_error_positions_4():
    """A uniformly-covered read reports no spectral error positions at cutoff 2."""
    counter = khmer.CountingHash(8, 1e6, 2)
    counter.consume(DNA)
    positions = counter.find_spectral_error_positions(DNA, 2)
    assert positions == [], positions
def test_badhashsizes():
    """hashsizes() accepts no arguments; passing one must raise TypeError."""
    table = khmer.CountingHash(4, 4 ** 4, 4)
    try:
        table.hashsizes(True)
    except TypeError as err:
        print(str(err))
    else:
        assert 0, "this should fail"
def test_get_kmer_counts():
    """get_kmer_counts returns the per-k-mer abundances for a sequence."""
    ht = khmer.CountingHash(6, 1e6, 2)

    # "AAAAAA" is a single 6-mer; its count grows with each consume.
    for expected in (1, 2):
        ht.consume("AAAAAA")
        counts = ht.get_kmer_counts("AAAAAA")
        print(counts)
        assert len(counts) == 1
        assert counts[0] == expected

    # "AAAAAAT" has two 6-mers; only the second gains counts here.
    for expected in (1, 2, 3):
        ht.consume("AAAAAT")
        counts = ht.get_kmer_counts("AAAAAAT")
        print(counts)
        assert len(counts) == 2
        assert counts[0] == 2
        assert counts[1] == expected
def test_partition_overlap_2():
    """Compare abundance-filtered subset partitions built from overlapping reads."""
    counter = khmer.CountingHash(20, 1e4, 4)
    for _ in range(10):
        counter.consume_and_tag(a)
    for _ in range(5):
        counter.consume_and_tag(b)

    # this will get paths only in 'a'
    subset_a = counter.do_subset_partition_with_abundance(10, 50)

    # this will get paths only in 'b'
    subset_b = counter.do_subset_partition_with_abundance(5, 10)

    result = subset_a.compare_partitions(3, subset_b, 3)
    assert result == (8, 6, 0), result
    result = subset_a.compare_partitions(3, subset_b, 5)
    assert result == (2, 0, 6), result

    result = subset_a.partition_sizes()
    assert result == ([(3, 8)], 0), result
    result = subset_b.partition_sizes()
    assert result == ([(3, 6), (5, 6)], 2), result

    result = subset_a.partition_average_coverages(counter)
    assert result == [(3, 11)]
    result = subset_b.partition_average_coverages(counter)
    assert result == [(3, 5), (5, 10)], result
def test_median_at_least_exception():
    """median_at_least on a string shorter than k (20) must raise ValueError.

    FIX: the exception was bound to an unused name (`as e`), flagged by
    linters (F841); the binding is dropped.
    """
    ht = khmer.CountingHash(20, 1e6, 2)
    try:
        ht.median_at_least('ATGGCTGATCGAT', 1)
        assert 0, "should have thrown ValueError"
    except ValueError:
        pass
def test_64bitshift():
    """A substring of a consumed read must have a nonzero minimum count (k=25)."""
    counter = khmer.CountingHash(25, 4, 1)
    fullstr = "GTATGCCAGCTCCAACTGGGCCGGTACGAGCAGGCCATTGCCTCTTGCCGCGATGCGTCGGCG"
    substr = "ATGCCAGCTCCAACTGGGCCGGTACGAGCAGGCCATTGCCTCTTGC"

    counter.consume(fullstr)
    minimum = counter.get_min_count(substr)
    assert 0 < minimum, minimum
def test_counting_load_bigcount():
    """With bigcount enabled, 500 increments yield an exact count of 500."""
    table = khmer.CountingHash(10, 1e5, 4)
    table.set_use_bigcount(True)

    for i in range(500):
        print(i, table.count('ATATATATAT'))

    assert table.get('ATATATATAT') == 500
def main():
    """Collect high-abundance k-mers from the input file and save them as stop tags.

    Parses cutoff options plus output/input filenames, optionally prints the
    hashtable parameters, builds a bigcount CountingHash, and writes the
    collected stop tags to the output file.

    BUG FIX: the "Estimated memory usage" message had mismatched quote
    characters, so literal '" "' junk was embedded in the printed string;
    rebuilt as adjacent string literals. Also renamed the locals
    `input`/`output` to avoid shadowing the `input()` builtin.
    """
    parser = build_construct_args()
    parser.add_argument('-l', '--lower-cutoff', type=int, dest='lower_cutoff',
                        default=DEFAULT_LOWER_CUTOFF)
    parser.add_argument('-u', '--upper-cutoff', type=int, dest='upper_cutoff',
                        default=DEFAULT_UPPER_CUTOFF)
    parser.add_argument('output_filename')
    parser.add_argument('input_filename')
    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MAX_TABLESIZE:
            print("** WARNING: hashsize is default! "
                  "You absodefly want to increase this!\n** "
                  "Please read the docs!", file=sys.stderr)

        print('\nPARAMETERS:', file=sys.stderr)
        print(' - kmer size = %d \t\t(-k)' % args.ksize, file=sys.stderr)
        print(' - n hashes = %d \t\t(-N)' % args.n_hashes, file=sys.stderr)
        print(' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize,
              file=sys.stderr)
        print('', file=sys.stderr)
        print('Estimated memory usage is %.2g bytes '
              '(n_hashes x min_hashsize)' %
              (args.n_hashes * args.min_hashsize), file=sys.stderr)
        print('-' * 8, file=sys.stderr)

    ksize = args.ksize
    ht_size = args.min_hashsize
    n_ht = args.n_hashes

    outfile = args.output_filename
    infile = args.input_filename

    print('lower cutoff:', args.lower_cutoff)
    print('upper cutoff:', args.upper_cutoff)
    print('Saving stoptags to %s' % outfile)
    print('Loading sequences in %s' % infile)

    print('making hashtable')
    ht = khmer.CountingHash(ksize, ht_size, n_ht)
    ht.set_use_bigcount(True)

    print('consuming input', infile)
    hb = ht.collect_high_abundance_kmers(infile, args.lower_cutoff,
                                         args.upper_cutoff)

    print('saving stoptags', outfile)
    hb.save_stop_tags(outfile)
def test_bigcount_overflow():
    """With bigcount enabled, counts saturate at MAX_BIGCOUNT instead of wrapping."""
    counter = khmer.CountingHash(18, 1e7, 4)
    counter.set_use_bigcount(True)

    kmer = 'GGTTGACGGGGCTCAGGG'
    for _ in range(0, 70000):
        counter.count(kmer)

    assert counter.get(kmer) == MAX_BIGCOUNT
def test_trim_full():
    """A sequence consumed twice is not trimmed at an abundance cutoff of 2."""
    counter = khmer.CountingHash(6, 1e6, 2)
    counter.consume(DNA)
    counter.consume(DNA)

    trimmed, _ = counter.trim_on_abundance(DNA, 2)
    assert DNA == trimmed, trimmed
def test_consume_fasta_and_tag():
    """consume_fasta_and_tag needs a filename; with one supplied it succeeds."""
    table = khmer.CountingHash(4, 4 ** 4, 4)
    try:
        table.consume_fasta_and_tag()
    except TypeError as err:
        print(str(err))
    else:
        assert 0, "this should fail"

    table.consume_fasta_and_tag(utils.get_test_data("test-graph2.fa"))
def test_get_raw_tables_view():
    """Raw table views are live: a consumed k-mer becomes visible through them."""
    counter = khmer.CountingHash(20, 1e5, 4)
    views = counter.get_raw_tables()

    for view in views:
        assert sum(view.tolist()) == 0

    counter.consume('AAAATTTTCCCCGGGGAAAA')

    for view in views:
        assert sum(view.tolist()) == 1
def test_get_kmers():
    """get_kmers splits a sequence into its overlapping 6-mers."""
    counter = khmer.CountingHash(6, 1e6, 2)

    assert counter.get_kmers("AAAAAA") == ["AAAAAA"]
    assert counter.get_kmers("AAAAAAT") == ["AAAAAA", "AAAAAT"]
def test_64bitshift_2():
    """Every 25-mer of a consumed read must be retrievable with a nonzero count."""
    counter = khmer.CountingHash(25, 4, 1)
    fullstr = "GTATGCCAGCTCCAACTGGGCCGGTACGAGCAGGCCATTGCCTCTTGCCGCGATGCGTCGGCG"
    counter.consume(fullstr)

    for start in range(len(fullstr) - 25 + 1):
        assert counter.get(fullstr[start:start + 25]) > 0
def test_load_gz_notexist_should_fail():
    """Loading a nonexistent .gz table file must raise OSError."""
    savepath = utils.get_temp_filename('tempcountingsave0.ht.gz')
    counter = khmer.CountingHash(12, 1000, 2)
    try:
        counter.load(savepath)
    except OSError as err:
        print(str(err))
    else:
        assert 0, "load should fail"
def test_median_too_short():
    """get_median_count on a string shorter than k must raise ValueError."""
    counter = khmer.CountingHash(6, 1e6, 2)
    counter.consume("AAAAAA")
    try:
        counter.get_median_count("A")
    except ValueError:
        pass
    else:
        assert 0, "this should fail"
def test_bad_use_bigcount():
    """get_use_bigcount takes no arguments; a spurious one must raise TypeError."""
    table = khmer.CountingHash(4, 4 ** 4, 4)
    table.set_use_bigcount(True)
    assert table.get_use_bigcount()

    try:
        table.get_use_bigcount(True)
    except TypeError as err:
        print(str(err))
    else:
        assert 0, "this should fail"
def test_find_spectral_error_positions_5():
    """Skipping base 10 during re-consumption marks position 10 as an error."""
    counter = khmer.CountingHash(8, 1e6, 2)
    counter.consume(DNA)
    # re-consume everything except the base at index 10
    counter.consume(DNA[:10])
    counter.consume(DNA[11:])

    positions = counter.find_spectral_error_positions(DNA, 1)
    assert positions == [10], positions
def create_countgraph(args, ksize=None, multiplier=1.0):
    """Create a khmer CountingHash sized from *args*.

    Falls back to args.ksize when no explicit ksize is given, and exits
    with an error message for k > 32, which khmer does not support.
    """
    if ksize is None:
        ksize = args.ksize

    if ksize > 32:
        print_error("\n** ERROR: khmer only supports k-mer sizes <= 32.\n")
        sys.exit(1)

    table_size = _calculate_tablesize(args, 'countgraph',
                                      multiplier=multiplier)
    return khmer.CountingHash(ksize, table_size, args.n_tables)