def test_random_20_a_succ_IV_save(self):
    ht = khmer.Hashbits(20, 4 ** 7 + 1, 2)
    filename = utils.get_test_data('random-20-a.fa')

    savefile_ht = utils.get_temp_filename('ht')
    savefile_tags = utils.get_temp_filename('tags')
    outfile = filename + utils.get_temp_filename('out')

    total_reads, _ = ht.consume_fasta_and_tag(filename)

    ht.save(savefile_ht)
    ht.save_tagset(savefile_tags)

    del ht
    ht = khmer.Hashbits(20, 4 ** 7 + 1, 2)

    ht.load(savefile_ht)
    ht.load_tagset(savefile_tags)

    divvy = ht.divide_tags_into_subsets(1)
    divvy.append(0)

    subsets = []
    for i in range(len(divvy) - 1):
        x = ht.do_subset_partition(divvy[i], divvy[i + 1])
        subsets.append(x)

    for x in reversed(subsets):
        ht.merge_subset(x)

    n_partitions = ht.output_partitions(filename, outfile)
    assert n_partitions == 1, n_partitions
def test_update_from():
    htableable = khmer.Hashbits(5, 1000, 4)
    other_htableable = khmer.Hashbits(5, 1000, 4)

    assert htableable.get('AAAAA') == 0
    assert htableable.get('GCGCG') == 0
    assert other_htableable.get('AAAAA') == 0
    assert other_htableable.get('GCGCG') == 0

    other_htableable.count('AAAAA')

    assert htableable.get('AAAAA') == 0
    assert htableable.get('GCGCG') == 0
    assert other_htableable.get('AAAAA') == 1
    assert other_htableable.get('GCGCG') == 0

    htableable.count('GCGCG')

    assert htableable.get('AAAAA') == 0
    assert htableable.get('GCGCG') == 1
    assert other_htableable.get('AAAAA') == 1
    assert other_htableable.get('GCGCG') == 0

    htableable.update(other_htableable)

    assert htableable.get('AAAAA') == 1
    assert htableable.get('GCGCG') == 1
    assert other_htableable.get('AAAAA') == 1
    assert other_htableable.get('GCGCG') == 0
def test_bloom_c_2():  # simple one
    K = 4
    HT_SIZE = 10  # use 11
    N_HT1 = 1  # hashtable size = 11
    N_HT2 = 2  # hashtable size = 11,13

    # use only 1 hashtable, no bloom filter
    ht1 = khmer.Hashbits(K, HT_SIZE, N_HT1)
    ht1.count('AAAA')  # 00 00 00 00 = 0
    ht1.count('ACTG')  # 00 10 01 11 = 39
    assert ht1.n_unique_kmers() == 2
    ht1.count('AACG')  # 00 00 10 11 = 11  # collision with 1st kmer
    assert ht1.n_unique_kmers() == 2
    ht1.count('AGAC')  # 00 11 00 10 = 50  # collision with 2nd kmer
    assert ht1.n_unique_kmers() == 2

    # use two hashtables with 11,13
    ht2 = khmer.Hashbits(K, HT_SIZE, N_HT2)
    ht2.count('AAAA')  # 00 00 00 00 = 0
    ht2.count('ACTG')  # 00 10 01 11 = 2*16 + 4 + 3 = 39
    assert ht2.n_unique_kmers() == 2
    ht2.count('AACG')  # 00 00 10 11 = 11  # collision with only 1st kmer
    assert ht2.n_unique_kmers() == 3
    ht2.count('AGAC')  # 00 11 00 10 = 3*16 + 2 = 50
    # collision with both 2nd and 3rd kmers
    assert ht2.n_unique_kmers() == 3
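# An editorial note on the arithmetic behind test_bloom_c_2, worked out from
# the inline comments above (the 2-bit encoding and the 11/13 table sizes are
# taken from those comments, not from khmer documentation). With one table of
# size 11, values collide mod 11:
#
#   AAAA = 0,  AACG = 11:  0 % 11 == 11 % 11 == 0   -> AACG not counted
#   ACTG = 39, AGAC = 50:  39 % 11 == 50 % 11 == 6  -> AGAC not counted
#
# so ht1.n_unique_kmers() stays at 2. A second table of size 13 resolves the
# first collision (0 % 13 == 0 but 11 % 13 == 11), so AACG is counted and the
# total becomes 3. AGAC, however, collides in *both* tables
# (50 % 11 == 39 % 11 == 6 and 50 % 13 == 11 % 13 == 11), so it stays a
# false positive and the count remains 3.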
def test_update_from_diff_tablesize():
    ht = khmer.Hashbits(5, 100, 4)
    ht2 = khmer.Hashbits(5, 1000, 4)

    try:
        ht.update(ht2)
        assert 0, "should not be reached"
    except ValueError as err:
        print(str(err))
def test_update_from_diff_num_tables():
    htableable = khmer.Hashbits(5, 1000, 3)
    other_htableable = khmer.Hashbits(5, 1000, 4)

    try:
        htableable.update(other_htableable)
        assert 0, "should not be reached"
    except ValueError as err:
        print(str(err))
def test_count_within_radius_big():
    inpfile = utils.get_test_data('random-20-a.fa')
    ht = khmer.Hashbits(20, 1e6, 4)

    ht.consume_fasta(inpfile)
    n = ht.count_kmers_within_radius('CGCAGGCTGGATTCTAGAGG', int(1e6))
    assert n == 3960

    ht = khmer.Hashbits(21, 1e6, 4)
    ht.consume_fasta(inpfile)
    n = ht.count_kmers_within_radius('CGCAGGCTGGATTCTAGAGGC', int(1e6))
    assert n == 39
def test_save_merge_from_disk(self):
    ht = khmer.Hashbits(20, 4 ** 4 + 1, 2)
    filename = utils.get_test_data('test-graph2.fa')

    (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
    assert total_reads == 3, total_reads

    divvy = ht.divide_tags_into_subsets(1)
    print(divvy)
    (a, b, c) = divvy

    outfile1 = utils.get_temp_filename('x.pmap')
    outfile2 = utils.get_temp_filename('y.pmap')

    x = ht.do_subset_partition(a, b)
    ht.save_subset_partitionmap(x, outfile1)
    del x

    y = ht.do_subset_partition(b, 0)
    ht.save_subset_partitionmap(y, outfile2)
    del y

    ht.merge_subset_from_disk(outfile1)
    ht.merge_subset_from_disk(outfile2)

    outfile = utils.get_temp_filename('out.part')
    n_partitions = ht.output_partitions(filename, outfile)
    assert n_partitions == 1, n_partitions  # combined.
def main():
    info('merge-partitions.py', ['graph'])
    args = get_parser().parse_args()

    output_file = args.graphbase + '.pmap.merged'
    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print('loading %d pmap files (first one: %s)' %
          (len(pmap_files), pmap_files[0]), file=sys.stderr)

    ksize = args.ksize
    htable = khmer.Hashbits(ksize, 1, 1)

    for _ in pmap_files:
        check_input_files(_, args.force)

    check_space(pmap_files, args.force)

    for pmap_file in pmap_files:
        print('merging', pmap_file, file=sys.stderr)
        htable.merge_subset_from_disk(pmap_file)

    print('saving merged to', output_file, file=sys.stderr)
    htable.save_partitionmap(output_file)

    if args.remove_subsets:
        print('removing pmap files', file=sys.stderr)
        for pmap_file in pmap_files:
            os.unlink(pmap_file)
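# Typical invocation, assuming khmer's standard command-line layout for this
# script (GRAPHBASE is the prefix shared by the .subset.*.pmap files; the
# merged map is written to GRAPHBASE.pmap.merged; --remove-subsets is implied
# by the args.remove_subsets check above):
#
#   merge-partitions.py [--remove-subsets] GRAPHBASE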
def test_bloom_python_1():
    # test python code to count unique kmers using bloom filter
    filename = utils.get_test_data('random-20-a.fa')

    ksize = 20  # size of kmer
    htable_size = 100000  # size of hashtable
    num_htableables = 3  # number of hashtables

    htableable = khmer.Hashbits(ksize, htable_size, num_htableables)

    n_unique = 0
    for _, record in enumerate(fasta_iter(open(filename))):
        sequence = record['sequence']
        seq_len = len(sequence)
        for n in range(0, seq_len + 1 - ksize):
            kmer = sequence[n:n + ksize]
            if not htableable.get(kmer):
                n_unique += 1
            htableable.count(kmer)

    assert n_unique == 3960
    assert htableable.n_occupied() == 3885, htableable.n_occupied()
    # this number equals n_unique
    assert htableable.n_unique_kmers() == 3960, htableable.n_unique_kmers()
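# Editorial gloss, inferred from the asserts above rather than from khmer's
# documentation: n_occupied() (3885) is smaller than the number of unique
# k-mers (3960) because distinct k-mers can hash to the same bin, so bin
# occupancy undercounts uniqueness; n_unique_kmers() matches the Python
# loop's "was it absent before counting?" tally exactly.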
def main():
    info('annotate-partitions.py', ['graph'])
    args = get_parser().parse_args()

    ksize = args.ksize
    filenames = args.input_filenames
    htable = khmer.Hashbits(ksize, 1, 1)

    partitionmap_file = args.graphbase + '.pmap.merged'

    check_input_files(partitionmap_file, args.force)
    for _ in filenames:
        check_input_files(_, args.force)

    check_space(filenames, args.force)

    print('loading partition map from:', partitionmap_file, file=sys.stderr)
    htable.load_partitionmap(partitionmap_file)

    for infile in filenames:
        print('outputting partitions for', infile, file=sys.stderr)
        outfile = os.path.basename(infile) + '.part'
        part_count = htable.output_partitions(infile, outfile)
        print('output %d partitions for %s' % (part_count, infile),
              file=sys.stderr)
        print('partitions are in', outfile, file=sys.stderr)
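# Typical invocation, assuming khmer's standard command-line layout for this
# script (reads GRAPHBASE.pmap.merged, as produced by merge-partitions.py
# above, and writes one .part file per input):
#
#   annotate-partitions.py GRAPHBASE input1.fa [input2.fa ...]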
def test_save_merge_from_disk_2(self):
    ht = khmer.Hashbits(20, 4 ** 7 + 1, 2)
    filename = utils.get_test_data('random-20-a.fa')

    (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)

    subset_size = total_reads // 2 + total_reads % 2
    divvy = ht.divide_tags_into_subsets(subset_size)

    outfile1 = utils.get_temp_filename('x.pmap')
    outfile2 = utils.get_temp_filename('y.pmap')

    x = ht.do_subset_partition(divvy[0], divvy[1])
    ht.save_subset_partitionmap(x, outfile1)
    del x

    y = ht.do_subset_partition(divvy[1], 0)
    ht.save_subset_partitionmap(y, outfile2)
    del y

    assert os.path.exists(outfile1)
    assert os.path.exists(outfile2)

    ht.merge_subset_from_disk(outfile1)
    ht.merge_subset_from_disk(outfile2)

    outfile = utils.get_temp_filename('out.part')
    n_partitions = ht.output_partitions(filename, outfile)
    assert n_partitions == 1, n_partitions  # combined.
def test_save_load_merge_on_graph():
    ht = khmer.Hashbits(20, 4 ** 4 + 1, 2)
    filename = utils.get_test_data('test-graph2.fa')

    (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
    assert total_reads == 3, total_reads

    divvy = ht.divide_tags_into_subsets(1)
    print(divvy)
    assert len(divvy) == 3
    (a, b, c) = divvy

    outfile1 = utils.get_temp_filename('x.pmap')
    outfile2 = utils.get_temp_filename('y.pmap')

    x = ht.do_subset_partition(a, b)
    ht.save_subset_partitionmap(x, outfile1)
    del x

    y = ht.do_subset_partition(b, 0)
    ht.save_subset_partitionmap(y, outfile2)
    del y

    a = ht.load_partitionmap(outfile1)  # <-- this is different
    b = ht.load_subset_partitionmap(outfile2)

    ht.merge_subset(b)

    outfile = utils.get_temp_filename('out.part')
    n_partitions = ht.output_partitions(filename, outfile)
    assert n_partitions == 1, n_partitions  # combined.
def main():
    ht = khmer.Hashbits(K, 1, 1)

    x = [0] * 255
    y = [0] * 255

    ht.load_stop_tags(sys.argv[1])
    for n, record in enumerate(screed.open(sys.argv[2])):
        if n % 10000 == 0:
            sys.stderr.write('... %d\n' % n)

        s, p = ht.trim_on_stoptags(record.sequence)

        if len(s) == len(record.sequence):
            continue

        if p == 0:
            p = 31
        else:
            p += 1

        x[p] += 1
        y[len(record.sequence)] += 1

    for i, (n, m) in enumerate(zip(x, y)):
        if m:
            print('%d,%d,%d' % (i, n, m))
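# Usage implied by the sys.argv accesses above (this script's actual name and
# CLI are not shown in this excerpt): argv[1] is a stop-tags file and argv[2]
# is a screed-readable sequence file. The output is CSV rows of
# (position, number of reads trimmed at that position, number of reads of
# that length), one row per index where any read of that length was seen.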
def test_tag_across_stoptraverse():
    filename = utils.get_test_data('random-20-a.fa')

    K = 20  # size of kmer
    HT_SIZE = 100000  # size of hashtable
    N_HT = 3  # number of hashtables

    ht = khmer.Hashbits(K, HT_SIZE, N_HT)

    # without tagging/joining across consume, this breaks into two
    # partitions; with, it is one partition.
    ht.add_stop_tag('CCGAATATATAACAGCGACG')

    ht.consume_fasta_and_tag_with_stoptags(filename)  # DO join reads across

    subset = ht.do_subset_partition(0, 0)
    n, _ = ht.count_partitions()
    assert n == 99  # reads only connected by traversal...

    n, _ = ht.subset_count_partitions(subset)
    assert n == 2  # but need main to cross stoptags.

    ht.merge_subset(subset)

    n, _ = ht.count_partitions()  # ta-da!
    assert n == 1, n
def test_extract_unique_paths_2():
    kh = khmer.Hashbits(10, 1e5, 4)

    kh.consume('ATGGAGAGAC')

    x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
    print(x)
    assert x == ['TGGAGAGACACAGATAGACAGGAGTGGCGATG']  # all but the 1st k-mer
def test__get_set_tag_density():
    ht = khmer.Hashbits(32, 1, 1)

    orig = ht._get_tag_density()
    assert orig != 2
    ht._set_tag_density(2)
    assert ht._get_tag_density() == 2
def test_tiny_real_partitions():
    filename = utils.get_test_data('real-partition-tiny.fa')

    ht = khmer.Hashbits(32, 8e2, 4)
    ht.consume_fasta_and_tag(filename)

    subset = ht.do_subset_partition(0, 0)
    ht.merge_subset(subset)

    outfile = utils.get_temp_filename('part')
    ht.output_partitions(filename, outfile)

    data = open(outfile).read()
    assert len(data)

    records = [r for r in screed.open(outfile)]
    names = [r.name for r in records]
    parts = [n.rsplit('\t', 1)[1] for n in names]

    assert len(parts) == 2, len(parts)
    assert len(set(parts)) == 1
    assert set(parts) != set(['0'])

test_tiny_real_partitions.runme = True
def test_consume_fasta_and_tag_with_badreads_parser():
    presencetable = khmer.Hashbits(6, 1e6, 2)

    readsparser = khmer.ReadParser(utils.get_test_data("test-empty.fa"))

    try:
        presencetable.consume_fasta_and_tag_with_reads_parser(readsparser)
        assert 0, "this should fail"
    except IOError as e:
        print(str(e))
def test_find_stoptags():
    ht = khmer.Hashbits(5, 1, 1)
    ht.add_stop_tag("AAAAA")

    assert ht.identify_stoptags_by_position("AAAAA") == [0]
    assert ht.identify_stoptags_by_position("AAAAAA") == [0, 1]
    assert ht.identify_stoptags_by_position("TTTTT") == [0]
    assert ht.identify_stoptags_by_position("TTTTTT") == [0, 1]
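# Editorial note, inferred from the asserts above: the TTTTT/TTTTTT cases
# pass because khmer hashes k-mers canonically (a k-mer and its reverse
# complement are treated as the same k-mer), and TTTTT is the reverse
# complement of the stop tag AAAAA.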
def test_find_radius_for_volume():
    inpfile = utils.get_test_data('all-A.fa')
    ht = khmer.Hashbits(4, 1e6, 2)

    ht.consume_fasta(inpfile)

    assert ht.find_radius_for_volume('AAAA', 0, 100) == 0
    assert ht.find_radius_for_volume('AAAA', 1, 100) == 0
    assert ht.find_radius_for_volume('AAAA', 2, 100) == 100
def main():
    filename1 = sys.argv[1]
    filename2 = sys.argv[2]
    uniq1 = open(os.path.basename(sys.argv[1]) + '.uniq', 'w')
    uniq2 = open(os.path.basename(sys.argv[2]) + '.uniq', 'w')
    paths = sys.argv[3]

    kh1 = khmer.Hashbits(K, HASHTABLE_SIZE, N_HT)
    kh1.consume_fasta(filename1)
    kh2 = khmer.Hashbits(K, HASHTABLE_SIZE, N_HT)
    kh2.consume_fasta(filename2)

    for record in screed.open(paths):
        path = record.sequence
        n = len(path) - K + 1

        n_present = 0
        for i in range(n):
            if kh1.get(path[i:i + K]):
                n_present += 1
        present1 = n_present / float(n) >= THRESHOLD

        n_present = 0
        for i in range(n):
            if kh2.get(path[i:i + K]):
                n_present += 1
        present2 = n_present / float(n) >= THRESHOLD

        if present1 and not present2:
            print('>%s\n%s' % (record.name, record.sequence), file=uniq1)
        elif present2 and not present1:
            print('>%s\n%s' % (record.name, record.sequence), file=uniq2)
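# Editorial summary of the classification rule above: a path counts as
# "present" in a sample when at least THRESHOLD (a module-level fraction
# defined elsewhere, not shown in this excerpt) of its K-mers are found in
# that sample's table. Paths present in exactly one of the two samples are
# written to that sample's .uniq file; paths present in both or in neither
# are dropped.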
def test_extract_unique_paths_0():
    kh = khmer.Hashbits(10, 1e5, 4)

    x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
    assert x == ['ATGGAGAGACACAGATAGACAGGAGTGGCGATG']

    kh.consume('ATGGAGAGACACAGATAGACAGGAGTGGCGATG')
    x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
    assert not x
def test_count_kmer_degree():
    inpfile = utils.get_test_data('all-A.fa')
    ht = khmer.Hashbits(4, 1e6, 2)
    ht.consume_fasta(inpfile)

    assert ht.kmer_degree('AAAA') == 2
    assert ht.kmer_degree('AAAT') == 1
    assert ht.kmer_degree('AATA') == 0
    assert ht.kmer_degree('TAAA') == 1
def test_connected_20_b(self):
    filename = utils.get_test_data('random-20-b.fa')

    ht = khmer.Hashbits(20, 1e4, 4)
    ht.consume_fasta_and_tag(filename)

    subset = ht.do_subset_partition(0, 0)
    x = ht.subset_count_partitions(subset)
    assert x == (1, 0)  # connected @ 20
def test_connected_31_c(self):
    filename = utils.get_test_data('random-31-c.fa')

    ht = khmer.Hashbits(31, 1e5, 4)
    ht.consume_fasta_and_tag(filename)

    subset = ht.do_subset_partition(0, 0)
    x = ht.subset_count_partitions(subset)
    assert x == (1, 0)  # connected @ K = 31
def main():
    parser = build_construct_args()
    parser.add_argument('input_filename')
    parser.add_argument('read_filename')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MAX_TABLESIZE:
            print("** WARNING: hashsize is default! "
                  "You absolutely want to increase this!\n** "
                  "Please read the docs!", file=sys.stderr)

        print('\nPARAMETERS:', file=sys.stderr)
        print(' - kmer size = %d \t\t(-k)' % args.ksize, file=sys.stderr)
        print(' - n hashes = %d \t\t(-N)' % args.n_hashes, file=sys.stderr)
        print(' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize,
              file=sys.stderr)
        print('', file=sys.stderr)
        print('Estimated memory usage is %.2g bytes '
              '(n_hashes x min_hashsize / 8)' %
              (args.n_hashes * args.min_hashsize / 8.), file=sys.stderr)
        print('-' * 8, file=sys.stderr)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    inp = args.input_filename
    readsfile = args.read_filename

    outfile = os.path.basename(readsfile) + '.sweep2'
    outfp = open(outfile, 'w')

    # create a hashbits data structure
    ht = khmer.Hashbits(K, HT_SIZE, N_HT)

    # load contigs, connect into N partitions
    print('loading input reads from', inp)
    ht.consume_fasta(inp)

    print('starting sweep.')

    n = 0
    m = 0
    for record in screed.open(readsfile):
        if len(record.sequence) < K:
            continue

        if n % 10000 == 0:
            print('...', n, m)

        count = ht.get_median_count(record.sequence)[0]
        if count:
            m += 1
            outfp.write('>%s\n%s\n' % (record.name, record.sequence))

        n += 1
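# Usage implied by the parser setup above (the script's actual name is not
# shown in this excerpt): it takes the hashing options provided by
# build_construct_args() -- the (-k), (-N), and (-x) flags echoed in the
# PARAMETERS block -- plus two positionals, e.g.
#
#   <script> -k 32 -x 1e8 contigs.fa reads.fa
#
# and writes every read whose median k-mer count against the contig table is
# nonzero to reads.fa.sweep2.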
def create_nodegraph(args, ksize=None, multiplier=1.0):
    if ksize is None:
        ksize = args.ksize
    if ksize > 32:
        print_error("\n** ERROR: khmer only supports k-mer sizes <= 32.\n")
        sys.exit(1)

    tablesize = _calculate_tablesize(args, 'nodegraph', multiplier=multiplier)
    return khmer.Hashbits(ksize, tablesize, args.n_tables)
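# Minimal usage sketch. The `args` namespace below is hypothetical: from the
# code above, only `ksize` and `n_tables` are known requirements; whatever
# additional attributes _calculate_tablesize() reads (e.g. a table-size
# setting) are not shown in this excerpt and are an assumption.
#
#   import argparse
#   args = argparse.Namespace(ksize=21, n_tables=4)
#   # ... plus the table-size attribute(s) _calculate_tablesize() expects
#   nodegraph = create_nodegraph(args)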
def test_merge_from_disk_file_version(self):
    ht = khmer.Hashbits(20, 4 ** 4 + 1, 2)

    infile = utils.get_test_data('badversion-k12.ht')
    try:
        ht.merge_subset_from_disk(infile)
        assert 0, "this should fail"
    except OSError as e:
        print(str(e))