Beispiel #1
0
def test_create_fill_default():
    """With no fill given to initialize(), every hash reads back as 2**32 - 1."""
    n_hashes = 100
    hashes = [random.randint(100, 2**32) for _ in range(n_hashes)]
    print(len(hashes))

    # initialize() is deliberately called without a fill argument here.
    bbtable = BBHashTable()
    bbtable.initialize(hashes)

    # Nothing has been assigned yet, so each lookup shows the default fill.
    expected_default = 2**32 - 1
    for hashval in hashes:
        assert bbtable[hashval] == expected_default
Beispiel #2
0
def test_create_fill_specify():
    """With fill=5 given to initialize(), every hash reads back as 5."""
    n_hashes = 100
    hashes = [random.randint(100, 2**32) for _ in range(n_hashes)]
    print(len(hashes))

    # This time the fill value is passed explicitly.
    fill_value = 5
    bbtable = BBHashTable()
    bbtable.initialize(hashes, fill=fill_value)

    # Nothing has been assigned yet, so each lookup shows the chosen fill.
    for hashval in hashes:
        assert bbtable[hashval] == fill_value
Beispiel #3
0
def test_create():
    """Assign a value to each hash in a BBHashTable and read each one back."""
    hashes = [random.randint(100, 2**32) for _ in range(100)]
    print(len(hashes))

    bbtable = BBHashTable()
    bbtable.initialize(hashes)

    # Pair the hashes, in order, with the values 100..199.
    pairs = list(zip(hashes, range(100, 200)))

    for hashval, value in pairs:
        bbtable[hashval] = value

    for hashval, value in pairs:
        assert bbtable[hashval] == value
Beispiel #4
0
def test_get_unique_values_set():
    """get_unique_values() should give a non-empty result for a set of hashes."""
    hashes = [random.randint(100, 2**32) for _ in range(100)]
    print(len(hashes))

    bbtable = BBHashTable()
    bbtable.initialize(hashes)

    # Cycle the values 1..5 across the hashes, 20 times over.
    repeated_values = [1, 2, 3, 4, 5] * 20
    for hashval, value in zip(hashes, repeated_values):
        bbtable[hashval] = value

    # Query with a set rather than the original list.
    counts = bbtable.get_unique_values(set(hashes))
    assert counts
Beispiel #5
0
def test_get_unique_values():
    """Check that get_unique_values() reports 20 hashes for each of 1..5."""
    hashes = [random.randint(100, 2**32) for _ in range(100)]
    print(len(hashes))

    bbtable = BBHashTable()
    bbtable.initialize(hashes)

    # Cycle the values 1..5 across the hashes, 20 times over.
    pairs = list(zip(hashes, [1, 2, 3, 4, 5] * 20))

    for hashval, value in pairs:
        bbtable[hashval] = value

    for hashval, value in pairs:
        assert bbtable[hashval] == value

    counts = bbtable.get_unique_values(hashes)
    for value in (1, 2, 3, 4, 5):
        assert counts[value] == 20
Beispiel #6
0
def test_get_unique_values_noexist():
    """Query with extra hashes that were never added to the table.

    get_unique_values() is expected to report only the five stored values,
    while plain item lookup of an unknown hash is expected to give None.
    """
    hashes = [random.randint(100, 2**32) for _ in range(100)]
    print(len(hashes))

    bbtable = BBHashTable()
    bbtable.initialize(hashes)

    pairs = list(zip(hashes, [1, 2, 3, 4, 5] * 20))

    for hashval, value in pairs:
        bbtable[hashval] = value

    for hashval, value in pairs:
        assert bbtable[hashval] == value

    # The extra hashes are random too, so drop any that happen to collide
    # with a hash already in the table; otherwise the test could fail on a
    # (very unlikely) coincidence.
    missing = {random.randint(100, 2**32) for _ in range(100)}
    missing.difference_update(set(hashes))
    hashes.extend(list(missing))

    counts = bbtable.get_unique_values(hashes)
    for value in (1, 2, 3, 4, 5):
        assert counts[value] == 20
    assert len(list(counts)) == 5

    # Cross-check against a plain per-hash lookup loop.
    manual_counts = defaultdict(int)
    for hashval in hashes:
        manual_counts[bbtable[hashval]] += 1

    assert manual_counts[None] == len(missing)
    for value in (1, 2, 3, 4, 5):
        assert manual_counts[value] == 20
Beispiel #7
0
def test_save_load(tmpdir):
    """Save a populated table to two files, load it back, and compare values."""
    hashes = [random.randint(100, 2**32) for _ in range(100)]
    print(len(hashes))

    original = BBHashTable()
    original.initialize(hashes)

    # Pair the hashes, in order, with the values 100..199.
    pairs = list(zip(hashes, range(100, 200)))
    for hashval, value in pairs:
        original[hashval] = value

    # save() takes two paths: one for the MPHF and one for the value array.
    mphf_path = os.path.join(tmpdir, 'table.mphf')
    array_path = os.path.join(tmpdir, 'table.array')
    original.save(mphf_path, array_path)

    reloaded = BBHashTable.load(mphf_path, array_path)

    for hashval, value in pairs:
        assert reloaded[hashval] == value
Beispiel #8
0
def test_get_unique_values_noexist_fail():
    """require_exist=True should make an unknown hash raise ValueError."""
    hashes = [random.randint(100, 2**32) for _ in range(100)]
    print(len(hashes))

    bbtable = BBHashTable()
    bbtable.initialize(hashes)

    for hashval, value in zip(hashes, [1, 2, 3, 4, 5] * 20):
        bbtable[hashval] = value

    # Walk upward from the first hash until we reach a value not in the list.
    unknown = hashes[0] + 1
    while unknown in hashes:
        unknown += 1

    # By default an unknown hash just yields an empty result...
    counts = bbtable.get_unique_values([unknown])
    assert not counts

    # ...but it must raise once existence is required.
    with pytest.raises(ValueError) as excinfo:
        bbtable.get_unique_values([unknown], require_exist=True)
    print(str(excinfo))
Beispiel #9
0
def main(argv=sys.argv[1:]):
    """Filter query sequences by their k-mer overlap with a set of subtract files.

    All k-mer hashes from the ``--subtract`` files are loaded into a
    BBHashTable. For each ``--query`` file, every record whose fraction of
    k-mers found in that table is below ``--threshold`` is written, in FASTA
    format, to ``basename(query) + suffix`` in the current directory.
    Records shorter than the k-mer size are never written, but they still
    count toward the reported totals.

    Args:
        argv: command-line arguments, excluding the program name.

    Returns:
        0 on completion. Exits the process with status -1 when no query or
        no subtract file is given.
    """
    p = argparse.ArgumentParser()
    p.add_argument('--query', nargs='+', action='append')
    p.add_argument('--subtract', nargs='+', action='append')
    p.add_argument('-o', '--output-suffix')
    p.add_argument('--threshold', type=float, default=DEFAULT_THRESHOLD)
    p.add_argument('-k', '--ksize', type=int, default=31)
    args = p.parse_args(argv)

    if not args.query:
        print('error, must specify at least one query with --query')
        sys.exit(-1)

    if not args.subtract:
        print('error, must specify at least one subtract with --subtract')
        sys.exit(-1)

    # nargs='+' combined with action='append' yields a list of lists; flatten.
    args.query = [item for sublist in args.query for item in sublist]
    args.subtract = [item for sublist in args.subtract for item in sublist]

    # construct output filename as {query}.suffix
    output_suffix = args.output_suffix
    if not output_suffix:
        output_suffix = '.donut.fa'

    # load k-mers to subtract
    all_kmers = list()
    kh = khmer.Nodetable(args.ksize, 1, 1)

    for subtract_fn in args.subtract:
        print('loading:', subtract_fn)
        for record in screed.open(subtract_fn):
            all_kmers.extend(kh.get_kmer_hashes(record.sequence))

    # now build a minimal perfect hash function for all those k-mers
    print('building bbhash table')
    table = BBHashTable(all_kmers, fill=1)
    del all_kmers

    # next, iterate over each input and do subtract
    for queryfile in args.query:
        output = os.path.basename(queryfile) + output_suffix
        print('subtracting from {} -> {}'.format(queryfile, output))

        n_total = 0
        bp = 0
        n_kept = 0
        bp_kept = 0

        # The context manager guarantees the output is flushed and closed,
        # even if reading the query file fails part-way through.
        with open(output, 'wt') as outfp:
            for n, record in enumerate(screed.open(queryfile)):
                if n % 100000 == 0:
                    print('...', queryfile, n, n_kept)

                # enumerate() is zero-based; keep a true record count for
                # the summary line.
                n_total = n + 1
                bp += len(record.sequence)

                # too short to contain even a single k-mer
                if len(record.sequence) < args.ksize:
                    continue

                kmers = kh.get_kmer_hashes(record.sequence)
                present = sum(1 for k in kmers if table[k])

                fraction = present / len(kmers)
                if fraction < args.threshold:  # keep?
                    outfp.write('>{}\n{}\n'.format(record.name,
                                                   record.sequence))
                    n_kept += 1
                    bp_kept += len(record.sequence)

        print('kept {} ({:.1g} Mbp) of {} ({:.1g} Mbp)'.format(
            n_kept, bp_kept / 1e6, n_total, bp / 1e6))

    return 0
Beispiel #10
0
import random, time
from bbhash_table import BBHashTable
from collections import defaultdict

# Benchmark: count values with one table lookup per hash ("old") versus a
# single get_unique_values() call ("new"), over the same list of hashes.
all_kmers = [random.randint(100, 2**32) for i in range(100)] * 10000

table = BBHashTable()
table.initialize(all_kmers)

for kmer_hash in all_kmers:
    table[kmer_hash] = kmer_hash  # as good a value as any ;)

# old style
# perf_counter() is used for timing because it is monotonic and has the
# highest available resolution; time.time() can jump and may be too coarse.
start = time.perf_counter()
value_count = defaultdict(int)
for kmer_hash in all_kmers:
    value = table[kmer_hash]
    value_count[value] += 1
end = time.perf_counter()
old_time = end - start
print('old:', old_time)

# new style
start = time.perf_counter()
value_count = table.get_unique_values(all_kmers)
end = time.perf_counter()
new_time = end - start
print('new:', new_time)

# Guard against a zero-length measurement so the script never dies on the
# final line with ZeroDivisionError.
print('speedup:', old_time / new_time if new_time else float('inf'))