def test_create_fill_default():
    """Check the value read back when no fill value is given to initialize().

    Every hash handed to ``initialize`` is expected to read back as
    ``2**32 - 1`` (the largest unsigned 32-bit integer) before any value
    has been stored for it.
    """
    all_hashes = [random.randint(100, 2**32) for _ in range(100)]
    print(len(all_hashes))

    table = BBHashTable()
    # deliberately do not pass a fill value
    table.initialize(all_hashes)

    # nothing has been assigned yet, so every lookup should give the default
    for hashval in all_hashes:
        assert table[hashval] == 2**32 - 1
def test_create_fill_specify():
    """Check that an explicit ``fill`` passed to initialize() is read back.

    The table is initialized with ``fill=5`` and no values are stored, so
    every hash is expected to read back as 5.
    """
    all_hashes = [random.randint(100, 2**32) for _ in range(100)]
    print(len(all_hashes))

    table = BBHashTable()
    # specify a default value...
    table.initialize(all_hashes, fill=5)

    # nothing has been assigned yet, so every lookup should give the fill
    for hashval in all_hashes:
        assert table[hashval] == 5
def test_create():
    """Store a distinct value per hash in a BBHashTable and read each back."""
    hashes = [random.randint(100, 2**32) for _ in range(100)]
    print(len(hashes))

    tbl = BBHashTable()
    tbl.initialize(hashes)

    # the k-th hash (counting from 0) gets the value 100 + k
    for stored, key in enumerate(hashes, start=100):
        tbl[key] = stored

    # every hash must return exactly the value assigned above
    for stored, key in enumerate(hashes, start=100):
        assert tbl[key] == stored
def test_get_unique_values_set():
    """Call get_unique_values() with a set of hashes rather than a list."""
    hashes = [random.randint(100, 2**32) for _ in range(100)]
    print(len(hashes))

    tbl = BBHashTable()
    tbl.initialize(hashes)

    # values cycle 1, 2, 3, 4, 5, 1, 2, ... across the hashes
    for position, key in enumerate(hashes):
        tbl[key] = position % 5 + 1

    counts = tbl.get_unique_values(set(hashes))
    # only checks that something non-empty came back
    assert counts
def test_get_unique_values():
    """Exercise get_unique_values(): five values, each stored twenty times."""
    hashes = [random.randint(100, 2**32) for _ in range(100)]
    print(len(hashes))

    tbl = BBHashTable()
    tbl.initialize(hashes)

    # values cycle 1, 2, 3, 4, 5, 1, 2, ... across the hashes
    for position, key in enumerate(hashes):
        tbl[key] = position % 5 + 1

    # sanity check: each hash reads back what was stored
    for position, key in enumerate(hashes):
        assert tbl[key] == position % 5 + 1

    counts = tbl.get_unique_values(hashes)
    for stored in (1, 2, 3, 4, 5):
        assert counts[stored] == 20
def test_get_unique_values_noexist():
    """Query get_unique_values() with hashes that were never put in the table.

    The result is compared against a plain per-hash lookup loop, in which
    the unknown hashes are expected to come back as ``None``.
    """
    hashes = [random.randint(100, 2**32) for _ in range(100)]
    print(len(hashes))

    tbl = BBHashTable()
    tbl.initialize(hashes)

    assigned = [1, 2, 3, 4, 5] * 20
    for key, stored in zip(hashes, assigned):
        tbl[key] = stored
    for key, stored in zip(hashes, assigned):
        assert tbl[key] == stored

    # Random draws could, very rarely, repeat a hash that is already in the
    # table; drop any such overlap so the "missing" hashes really are missing.
    missing = {random.randint(100, 2**32) for _ in range(100)}
    missing.difference_update(hashes)

    queried = hashes + list(missing)

    counts = tbl.get_unique_values(queried)
    for stored in range(1, 6):
        assert counts[stored] == 20
    # the missing hashes must not contribute any extra key
    assert len(list(counts)) == 5

    # same tally, done the slow way with one lookup per hash
    tally = defaultdict(int)
    for key in queried:
        tally[tbl[key]] += 1

    assert tally[None] == len(missing)
    for stored in range(1, 6):
        assert tally[stored] == 20
def test_save_load(tmpdir):
    """Save a populated table to two files and check a loaded copy matches."""
    hashes = [random.randint(100, 2**32) for _ in range(100)]
    print(len(hashes))

    original = BBHashTable()
    original.initialize(hashes)

    # the k-th hash (counting from 0) gets the value 100 + k
    for stored, key in enumerate(hashes, start=100):
        original[key] = stored

    mphf_path, array_path = (
        os.path.join(tmpdir, basename)
        for basename in ('table.mphf', 'table.array')
    )
    original.save(mphf_path, array_path)

    reloaded = BBHashTable.load(mphf_path, array_path)
    for stored, key in enumerate(hashes, start=100):
        assert reloaded[key] == stored
def test_get_unique_values_noexist_fail():
    """Check get_unique_values() behaviour for a hash that is not in the table.

    Without ``require_exist`` the unknown hash is expected to yield an empty
    (falsy) result; with ``require_exist=True`` a ``ValueError`` is expected.
    """
    all_hashes = [random.randint(100, 2**32) for _ in range(100)]
    print(len(all_hashes))

    table = BBHashTable()
    table.initialize(all_hashes)

    for hashval, value in zip(all_hashes, [1, 2, 3, 4, 5] * 20):
        table[hashval] = value

    # Walk upwards from a known hash until we hit a value that is not one of
    # the hashes in the table; a set keeps each membership probe cheap.
    known_hashes = set(all_hashes)
    noexist_hash = all_hashes[0] + 1
    while noexist_hash in known_hashes:
        noexist_hash += 1

    value_counts = table.get_unique_values([noexist_hash])
    assert not value_counts

    with pytest.raises(ValueError) as exc:
        # the return value is irrelevant: the call itself must raise
        table.get_unique_values([noexist_hash], require_exist=True)

    # Printed after the block so it actually runs; exc.value is the raised
    # exception itself rather than pytest's ExceptionInfo wrapper.
    print(str(exc.value))
def main(argv=sys.argv[1:]):
    """Filter query sequences by how many of their k-mers occur in subtract files.

    All k-mer hashes from the ``--subtract`` files are loaded into a
    BBHashTable. Each ``--query`` file is then scanned record by record; a
    record is written to ``<basename of query><suffix>`` in the current
    directory when the fraction of its k-mers with a truthy table entry is
    below ``--threshold``. Records shorter than ``--ksize`` are skipped.

    Args:
        argv: command-line arguments, excluding the program name.

    Returns:
        0 on success.

    Raises:
        SystemExit: with code -1 when no ``--query`` or no ``--subtract``
            file is given (argparse may also exit on malformed arguments).
    """
    p = argparse.ArgumentParser()
    p.add_argument('--query', nargs='+', action='append')
    p.add_argument('--subtract', nargs='+', action='append')
    p.add_argument('-o', '--output-suffix')
    p.add_argument('--threshold', type=float, default=DEFAULT_THRESHOLD)
    p.add_argument('-k', '--ksize', type=int, default=31)
    args = p.parse_args(argv)

    if not args.query:
        print('error, must specify at least one query with --query')
        sys.exit(-1)

    if not args.subtract:
        print('error, must specify at least one subtract with --subtract')
        sys.exit(-1)

    # 'append' + nargs='+' yields a list of lists; flatten to plain lists
    args.query = [item for sublist in args.query for item in sublist]
    args.subtract = [item for sublist in args.subtract for item in sublist]

    # construct output filename as {query}.suffix
    output_suffix = args.output_suffix
    if not output_suffix:
        output_suffix = '.donut.fa'

    # load k-mers to subtract
    all_kmers = []
    kh = khmer.Nodetable(args.ksize, 1, 1)

    for subtract_fn in args.subtract:
        print('loading:', subtract_fn)
        for record in screed.open(subtract_fn):
            all_kmers.extend(kh.get_kmer_hashes(record.sequence))

    # now build a minimal perfect hash function for all those k-mers
    print('building bbhash table')
    # NOTE(review): confirm that BBHashTable accepts the hash list and
    # ``fill`` directly in its constructor.
    table = BBHashTable(all_kmers, fill=1)
    # the hash list can be large and is not needed once the table exists
    del all_kmers

    # next, iterate over each input and do subtract
    for queryfile in args.query:
        output = os.path.basename(queryfile) + output_suffix
        print('subtracting from {} -> {}'.format(queryfile, output))

        n_total = 0
        bp = 0
        n_kept = 0
        bp_kept = 0

        # the context manager guarantees the output is flushed and closed,
        # even if reading the query file fails part-way through
        with open(output, 'wt') as outfp:
            for n, record in enumerate(screed.open(queryfile)):
                if n % 100000 == 0:
                    print('...', queryfile, n, n_kept)

                # count records explicitly: the enumerate index is one less
                # than the number of records seen
                n_total += 1
                seq_len = len(record.sequence)
                bp += seq_len

                # too short to contain even a single k-mer
                if seq_len < args.ksize:
                    continue

                kmers = kh.get_kmer_hashes(record.sequence)

                # number of this record's k-mers with a truthy table entry
                present = sum(1 for k in kmers if table[k])

                f = present / len(kmers)
                if f < args.threshold:     # keep?
                    outfp.write('>{}\n{}\n'.format(record.name,
                                                   record.sequence))
                    n_kept += 1
                    bp_kept += seq_len

        print('kept {} ({:.1g} Mbp) of {} ({:.1g} Mbp)'.format(
            n_kept, bp_kept / 1e6, n_total, bp / 1e6))

    return 0
# Benchmark: per-hash lookups in a Python loop versus a single
# BBHashTable.get_unique_values() call over the same hashes.
import random
import time
from collections import defaultdict

from bbhash_table import BBHashTable

# 100 random hashes, repeated 10000 times -> 1,000,000 lookups per timing run
all_kmers = [random.randint(100, 2**32) for i in range(100)] * 10000

table = BBHashTable()
table.initialize(all_kmers)

for kmer_hash in all_kmers:
    table[kmer_hash] = kmer_hash        # as good a value as any ;)

# old style: one table lookup per hash, tallied in Python
# (perf_counter is a monotonic, high-resolution clock meant for timing)
start = time.perf_counter()
value_count = defaultdict(int)
for kmer_hash in all_kmers:
    value = table[kmer_hash]
    value_count[value] += 1
end = time.perf_counter()
old_time = end - start

print('old:', old_time)

# new style: let the table tally the values itself
start = time.perf_counter()
value_count = table.get_unique_values(all_kmers)
end = time.perf_counter()
new_time = end - start

print('new:', new_time)

print('speedup:', old_time / new_time)