Ejemplo n.º 1
0
    def test_random_20_a_succ_IV_save(self):
        """Partition random-20-a.fa after a save/load round trip.

        The table and its tagset are saved, loaded into a fresh ``Hashbits``
        built with the same arguments, partitioned subset by subset, merged
        in reverse order, and the output must contain exactly one partition.
        """
        ht = khmer.Hashbits(20, 4 ** 7 + 1, 2)
        filename = utils.get_test_data('random-20-a.fa')

        savefile_ht = utils.get_temp_filename('ht')
        savefile_tags = utils.get_temp_filename('tags')
        # Use the temp filename on its own; appending it to the input path
        # glued two paths together instead of naming a temp output file.
        outfile = utils.get_temp_filename('out')

        ht.consume_fasta_and_tag(filename)

        ht.save(savefile_ht)
        ht.save_tagset(savefile_tags)

        # Drop the populated table so everything below works from disk state.
        del ht
        ht = khmer.Hashbits(20, 4 ** 7 + 1, 2)

        ht.load(savefile_ht)
        ht.load_tagset(savefile_tags)

        divvy = ht.divide_tags_into_subsets(1)
        # A trailing 0 closes the final (start, end) pair.
        divvy.append(0)

        subsets = [ht.do_subset_partition(start, end)
                   for start, end in zip(divvy, divvy[1:])]

        for subset in reversed(subsets):
            ht.merge_subset(subset)

        n_partitions = ht.output_partitions(filename, outfile)
        assert n_partitions == 1, n_partitions
Ejemplo n.º 2
0
def test_update_from():
    """Check that update() adds the other table's k-mers to the receiver.

    Two identically sized tables each count a different k-mer; after
    ``receiver.update(donor)`` the receiver reports both k-mers while the
    donor is unchanged.
    """
    receiver = khmer.Hashbits(5, 1000, 4)
    donor = khmer.Hashbits(5, 1000, 4)

    def expect(receiver_counts, donor_counts):
        # Each argument is the expected (AAAAA, GCGCG) pair of get() results.
        for table, counts in ((receiver, receiver_counts),
                              (donor, donor_counts)):
            for kmer, wanted in zip(('AAAAA', 'GCGCG'), counts):
                assert table.get(kmer) == wanted

    expect((0, 0), (0, 0))

    donor.count('AAAAA')
    expect((0, 0), (1, 0))

    receiver.count('GCGCG')
    expect((0, 1), (1, 0))

    receiver.update(donor)
    expect((1, 1), (1, 0))
Ejemplo n.º 3
0
def test_update_from():
    """Verify update() merges one table into another without touching the source.

    The state of both tables is read as a flat list
    ``[left AAAAA, left GCGCG, right AAAAA, right GCGCG]`` after each step.
    """
    left = khmer.Hashbits(5, 1000, 4)
    right = khmer.Hashbits(5, 1000, 4)
    probes = ('AAAAA', 'GCGCG')

    def snapshot():
        return [table.get(kmer) for table in (left, right) for kmer in probes]

    assert snapshot() == [0, 0, 0, 0]

    right.count('AAAAA')
    assert snapshot() == [0, 0, 1, 0]

    left.count('GCGCG')
    assert snapshot() == [0, 1, 1, 0]

    # Only the receiving table changes; the argument keeps its own contents.
    left.update(right)
    assert snapshot() == [1, 1, 1, 0]
Ejemplo n.º 4
0
def test_bloom_c_2():  # simple one
    """Compare unique k-mer counts with one table versus two tables.

    With a single tiny table, colliding k-mers are not counted as new; with
    two tables a k-mer that collides in only one of them still is.
    NOTE(review): per the original author's notes, the requested size of 10
    yields actual table sizes 11 (first table) and 13 (second) -- confirm.
    """
    ksize = 4
    requested_size = 10  # use 11

    # One table only: no Bloom-filter effect.
    single = khmer.Hashbits(ksize, requested_size, 1)
    single.count('AAAA')  # 00 00 00 00 = 0
    for kmer, unique_so_far in (
            ('ACTG', 2),  # 00 10 01 11
            ('AACG', 2),  # 00 00 10 11 = 11; collides with the 1st k-mer
            ('AGAC', 2),  # 00 11 00 10; collides with the 2nd k-mer
    ):
        single.count(kmer)
        assert single.n_unique_kmers() == unique_so_far

    # Two tables (sizes 11 and 13 according to the original notes).
    double = khmer.Hashbits(ksize, requested_size, 2)
    double.count('AAAA')  # 00 00 00 00 = 0
    for kmer, unique_so_far in (
            ('ACTG', 2),  # 00 10 01 11 = 2*16 + 4 + 3 = 39
            ('AACG', 3),  # 00 00 10 11 = 11; collides with only the 1st k-mer
            ('AGAC', 3),  # 00 11 00 10 = 3*16 + 2 = 50; collides with both
                          # the 2nd and 3rd k-mers
    ):
        double.count(kmer)
        assert double.n_unique_kmers() == unique_so_far
Ejemplo n.º 5
0
def test_update_from_diff_tablesize():
    """update() must raise ValueError when the two table sizes differ."""
    small = khmer.Hashbits(5, 100, 4)
    large = khmer.Hashbits(5, 1000, 4)

    try:
        small.update(large)
    except ValueError as err:
        print(str(err))
    else:
        # No exception means the mismatch went undetected.
        assert 0, "should not be reached"
Ejemplo n.º 6
0
def test_update_from_diff_num_tables():
    """update() must raise ValueError when the number of tables differs."""
    three_tables = khmer.Hashbits(5, 1000, 3)
    four_tables = khmer.Hashbits(5, 1000, 4)

    try:
        three_tables.update(four_tables)
    except ValueError as err:
        print(str(err))
    else:
        # Reaching this branch means update() accepted mismatched tables.
        assert 0, "should not be reached"
Ejemplo n.º 7
0
def test_count_within_radius_big():
    """Check count_kmers_within_radius() on random-20-a.fa for K=20 and K=21.

    Each case builds a fresh table, loads the same input file, and counts
    k-mers reachable from the seed k-mer with a radius argument of 1e6.
    """
    inpfile = utils.get_test_data('random-20-a.fa')

    # (k-mer size, seed k-mer, expected count)
    cases = (
        (20, 'CGCAGGCTGGATTCTAGAGG', 3960),
        (21, 'CGCAGGCTGGATTCTAGAGGC', 39),
    )
    for ksize, seed, expected in cases:
        ht = khmer.Hashbits(ksize, 1e6, 4)
        ht.consume_fasta(inpfile)
        assert ht.count_kmers_within_radius(seed, int(1e6)) == expected
Ejemplo n.º 8
0
    def test_save_merge_from_disk(self):
        """Partition test-graph2.fa in two subsets saved to disk, then merge.

        Each subset partition is written to its own pmap file and released
        before the next one is computed; merging both files back must give
        a single combined partition.
        """
        ht = khmer.Hashbits(20, 4 ** 4 + 1, 2)
        filename = utils.get_test_data('test-graph2.fa')

        total_reads, _ = ht.consume_fasta_and_tag(filename)
        assert total_reads == 3, total_reads

        divvy = ht.divide_tags_into_subsets(1)
        print(divvy)
        first, second, _ = divvy  # exactly three boundaries are expected

        pmap_x = utils.get_temp_filename('x.pmap')
        pmap_y = utils.get_temp_filename('y.pmap')

        # The second range ends with 0, as in the original call sequence.
        for start, end, pmap_path in ((first, second, pmap_x),
                                      (second, 0, pmap_y)):
            subset = ht.do_subset_partition(start, end)
            ht.save_subset_partitionmap(subset, pmap_path)
            del subset

        for pmap_path in (pmap_x, pmap_y):
            ht.merge_subset_from_disk(pmap_path)

        outfile = utils.get_temp_filename('out.part')
        n_partitions = ht.output_partitions(filename, outfile)
        assert n_partitions == 1, n_partitions        # combined.
Ejemplo n.º 9
0
def main():
    """Merge all ``<graphbase>.subset.*.pmap`` files into one partition map.

    The merged map is written to ``<graphbase>.pmap.merged``.  When
    ``args.remove_subsets`` is set, the individual pmap files are deleted
    after the merged map has been saved.  Exits with status 1 if no pmap
    file matches the glob pattern.
    """
    info('merge-partitions.py', ['graph'])
    args = get_parser().parse_args()

    output_file = args.graphbase + '.pmap.merged'
    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    # Fail with a readable message rather than an IndexError on
    # pmap_files[0] when the glob matched nothing.
    if not pmap_files:
        print('no pmap files found matching %s.subset.*.pmap' %
              args.graphbase, file=sys.stderr)
        sys.exit(1)

    print('loading %d pmap files (first one: %s)' %
          (len(pmap_files), pmap_files[0]),
          file=sys.stderr)

    ksize = args.ksize
    # Only a holder for the partition maps is needed, hence the 1x1 table.
    htable = khmer.Hashbits(ksize, 1, 1)

    for pmap_file in pmap_files:
        check_input_files(pmap_file, args.force)

    check_space(pmap_files, args.force)

    for pmap_file in pmap_files:
        print('merging', pmap_file, file=sys.stderr)
        htable.merge_subset_from_disk(pmap_file)

    print('saving merged to', output_file, file=sys.stderr)
    htable.save_partitionmap(output_file)

    if args.remove_subsets:
        print('removing pmap files', file=sys.stderr)
        for pmap_file in pmap_files:
            os.unlink(pmap_file)
Ejemplo n.º 10
0
def test_bloom_python_1():
    """Count unique k-mers from Python, using Hashbits as a Bloom filter.

    Every k-mer of every record in random-20-a.fa is looked up before being
    counted; a k-mer not yet present counts as unique.  The tally must
    match both the expected constant and the table's own unique counter.
    """
    filename = utils.get_test_data('random-20-a.fa')

    ksize = 20  # size of kmer
    htable_size = 100000  # size of hashtable
    num_htableables = 3  # number of hashtables

    htableable = khmer.Hashbits(ksize, htable_size, num_htableables)

    n_unique = 0
    # The context manager closes the FASTA handle once all records are read.
    with open(filename) as fasta_file:
        for record in fasta_iter(fasta_file):
            sequence = record['sequence']
            for start in range(len(sequence) + 1 - ksize):
                kmer = sequence[start:start + ksize]
                if not htableable.get(kmer):
                    n_unique += 1
                htableable.count(kmer)

    assert n_unique == 3960
    assert htableable.n_occupied() == 3885, htableable.n_occupied()

    # this number equals n_unique
    assert htableable.n_unique_kmers() == 3960, htableable.n_unique_kmers()
Ejemplo n.º 11
0
def main():
    """Annotate each input file with partitions from ``<graphbase>.pmap.merged``.

    For every input filename, a ``<basename>.part`` file is written via
    ``output_partitions`` and the number of partitions is reported on
    stderr.
    """
    info('annotate-partitions.py', ['graph'])
    args = get_parser().parse_args()

    input_files = args.input_filenames
    # Only the partition map is used here, so a minimal 1x1 table suffices.
    htable = khmer.Hashbits(args.ksize, 1, 1)

    merged_pmap = args.graphbase + '.pmap.merged'

    # Validate the partition map first, then every input file.
    check_input_files(merged_pmap, args.force)
    for path in input_files:
        check_input_files(path, args.force)

    check_space(input_files, args.force)

    print('loading partition map from:', merged_pmap, file=sys.stderr)
    htable.load_partitionmap(merged_pmap)

    for infile in input_files:
        print('outputting partitions for', infile, file=sys.stderr)
        outfile = os.path.basename(infile) + '.part'
        part_count = htable.output_partitions(infile, outfile)
        print('output %d partitions for %s' % (part_count, infile),
              file=sys.stderr)
        print('partitions are in', outfile, file=sys.stderr)
Ejemplo n.º 12
0
    def test_save_merge_from_disk_2(self):
        """Split random-20-a.fa tags in two halves, save each, merge from disk.

        The subset size is half the read count, rounded up.  Both pmap
        files must exist on disk before they are merged, and the merged
        result must be a single partition.
        """
        ht = khmer.Hashbits(20, 4 ** 7 + 1, 2)
        filename = utils.get_test_data('random-20-a.fa')

        total_reads, _ = ht.consume_fasta_and_tag(filename)

        # Half of the reads, plus one when the count is odd.
        half, remainder = divmod(total_reads, 2)
        divvy = ht.divide_tags_into_subsets(half + remainder)

        pmap_paths = (utils.get_temp_filename('x.pmap'),
                      utils.get_temp_filename('y.pmap'))

        # The second range ends with 0, as in the original call sequence.
        ranges = ((divvy[0], divvy[1]), (divvy[1], 0))
        for (start, end), pmap_path in zip(ranges, pmap_paths):
            subset = ht.do_subset_partition(start, end)
            ht.save_subset_partitionmap(subset, pmap_path)
            del subset

        for pmap_path in pmap_paths:
            assert os.path.exists(pmap_path)
        for pmap_path in pmap_paths:
            ht.merge_subset_from_disk(pmap_path)

        outfile = utils.get_temp_filename('out.part')
        n_partitions = ht.output_partitions(filename, outfile)
        assert n_partitions == 1, n_partitions        # combined.
Ejemplo n.º 13
0
def test_save_load_merge_on_graph():
    """Merge partitions where one pmap is loaded straight into the graph.

    Two subset partitions of test-graph2.fa are saved to disk.  The first
    is loaded with ``load_partitionmap`` (directly on the table), the
    second with ``load_subset_partitionmap`` and then merged; the result
    must be a single partition.
    """
    ht = khmer.Hashbits(20, 4 ** 4 + 1, 2)
    filename = utils.get_test_data('test-graph2.fa')

    total_reads, _ = ht.consume_fasta_and_tag(filename)
    assert total_reads == 3, total_reads

    divvy = ht.divide_tags_into_subsets(1)
    print(divvy)
    # Compare by value: ``is`` against an int literal tests identity.
    assert len(divvy) == 3, len(divvy)
    first, second, _ = divvy

    outfile1 = utils.get_temp_filename('x.pmap')
    outfile2 = utils.get_temp_filename('y.pmap')

    x = ht.do_subset_partition(first, second)
    ht.save_subset_partitionmap(x, outfile1)
    del x

    y = ht.do_subset_partition(second, 0)
    ht.save_subset_partitionmap(y, outfile2)
    del y

    ht.load_partitionmap(outfile1)  # <-- this is different
    loaded_subset = ht.load_subset_partitionmap(outfile2)

    ht.merge_subset(loaded_subset)

    outfile = utils.get_temp_filename('out.part')
    n_partitions = ht.output_partitions(filename, outfile)
    assert n_partitions == 1, n_partitions        # combined.
Ejemplo n.º 14
0
def main():
    """Tabulate where reads get trimmed at stop tags, alongside read lengths.

    Stop tags are loaded from ``sys.argv[1]`` and reads from
    ``sys.argv[2]``.  Only reads whose trimmed sequence differs in length
    from the original are tallied: once under the adjusted trim position
    and once under the read length.  One ``index,trim_count,length_count``
    line is printed for each index with a non-zero length count.
    """
    ht = khmer.Hashbits(K, 1, 1)

    # Both tallies are fixed at 255 slots, indexed by position / length.
    trim_counts = [0] * 255
    length_counts = [0] * 255

    ht.load_stop_tags(sys.argv[1])
    for n, record in enumerate(screed.open(sys.argv[2])):
        if n % 10000 == 0:
            sys.stderr.write('... %d\n' % n)

        seq = record.sequence
        trimmed, pos = ht.trim_on_stoptags(seq)

        if len(trimmed) != len(seq):
            # NOTE(review): a reported position of 0 is mapped to 31; the
            # reason is not visible here -- confirm against trim_on_stoptags.
            pos = 31 if pos == 0 else pos + 1

            trim_counts[pos] += 1
            length_counts[len(seq)] += 1

    for index, (n_trimmed, n_reads) in enumerate(
            zip(trim_counts, length_counts)):
        if n_reads:
            print('%d,%d,%d' % (index, n_trimmed, n_reads))
Ejemplo n.º 15
0
def test_tag_across_stoptraverse():
    """Check partition counts around a stop tag, before and after merging.

    Expected counts: 99 on the main table before the merge, 2 within the
    subset, and 1 on the main table after ``merge_subset``.
    """
    filename = utils.get_test_data('random-20-a.fa')

    ksize = 20  # size of kmer
    table_size = 100000  # size of hashtable
    n_tables = 3  # number of hashtables

    ht = khmer.Hashbits(ksize, table_size, n_tables)

    # without tagging/joining across consume, this breaks into two partition;
    # with, it is one partition.
    ht.add_stop_tag('CCGAATATATAACAGCGACG')

    ht.consume_fasta_and_tag_with_stoptags(filename)  # DO join reads across

    subset = ht.do_subset_partition(0, 0)

    # reads only connected by traversal...
    main_count, _ = ht.count_partitions()
    assert main_count == 99

    # ...but need main to cross stoptags.
    subset_count, _ = ht.subset_count_partitions(subset)
    assert subset_count == 2

    ht.merge_subset(subset)

    merged_count, _ = ht.count_partitions()  # ta-da!
    assert merged_count == 1, merged_count
Ejemplo n.º 16
0
def test_extract_unique_paths_2():
    """extract_unique_paths() must skip a leading k-mer that is already known.

    Only the first 10-mer of the query is consumed beforehand, so the
    expected single path is the query minus its first base.
    """
    kh = khmer.Hashbits(10, 1e5, 4)

    kh.consume('ATGGAGAGAC')
    x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
    # Call form of print: valid on Python 3, and on Python 2 for one argument.
    print(x)
    assert x == ['TGGAGAGACACAGATAGACAGGAGTGGCGATG']  # all but the 1st k-mer
Ejemplo n.º 17
0
def test_bloom_python_1():
    """Count unique k-mers from Python, using Hashbits as a Bloom filter.

    Each k-mer of each record in random-20-a.fa is checked with ``get``
    before ``count``; k-mers not yet present are tallied as unique.  The
    tally must equal the table's own ``n_unique_kmers()``.
    """
    filename = utils.get_test_data('random-20-a.fa')

    K = 20  # size of kmer
    HT_SIZE = 100000  # size of hashtable
    N_HT = 3  # number of hashtables

    ht2 = khmer.Hashbits(K, HT_SIZE, N_HT)

    n_unique = 0
    # Open the input in a context manager so the handle is always closed.
    with open(filename) as fasta_file:
        for record in fasta_iter(fasta_file):
            sequence = record['sequence']
            # Distinct index name: the original reused the outer loop's ``n``.
            for offset in range(len(sequence) + 1 - K):
                kmer = sequence[offset:offset + K]
                if not ht2.get(kmer):
                    n_unique += 1
                ht2.count(kmer)

    assert n_unique == 3960
    assert ht2.n_occupied() == 3885, ht2.n_occupied()

    # this number equals n_unique
    assert ht2.n_unique_kmers() == 3960, ht2.n_unique_kmers()
Ejemplo n.º 18
0
def test__get_set_tag_density():
    """_set_tag_density() must change the value _get_tag_density() reports."""
    ht = khmer.Hashbits(32, 1, 1)
    new_density = 2

    # The starting density has to differ, or the check below proves nothing.
    assert ht._get_tag_density() != new_density

    ht._set_tag_density(new_density)
    assert ht._get_tag_density() == new_density
Ejemplo n.º 19
0
def test_tiny_real_partitions():
    """Partition real-partition-tiny.fa and inspect the annotated output.

    The output file must be non-empty and hold two records whose names end
    in the same tab-separated partition label, and that label must not be
    '0'.
    """
    filename = utils.get_test_data('real-partition-tiny.fa')

    ht = khmer.Hashbits(32, 8e2, 4)
    ht.consume_fasta_and_tag(filename)

    subset = ht.do_subset_partition(0, 0)
    ht.merge_subset(subset)

    outfile = utils.get_temp_filename('part')
    ht.output_partitions(filename, outfile)

    # Read through a context manager so the handle is not left open.
    with open(outfile) as part_file:
        data = part_file.read()

    assert len(data)

    names = [record.name for record in screed.open(outfile)]
    # The partition label is whatever follows the last tab in the name.
    parts = [name.rsplit('\t', 1)[1] for name in names]

    assert len(parts) == 2, len(parts)
    assert len(set(parts)) == 1
    assert set(parts) != {'0'}

    test_tiny_real_partitions.runme = True
Ejemplo n.º 20
0
def test_consume_fasta_and_tag_with_badreads_parser():
    """Consuming through a ReadParser on test-empty.fa must raise IOError."""
    presencetable = khmer.Hashbits(6, 1e6, 2)
    readsparser = khmer.ReadParser(utils.get_test_data("test-empty.fa"))
    try:
        presencetable.consume_fasta_and_tag_with_reads_parser(readsparser)
        assert 0, "this should fail"
    # ``except X as e`` and print() work on Python 2.6+ and on Python 3;
    # the comma form and the print statement are Python 2 only.
    except IOError as e:
        print(str(e))
Ejemplo n.º 21
0
def test_find_stoptags():
    """identify_stoptags_by_position() reports every stop-tag start position.

    Only "AAAAA" is registered, yet the all-T sequences are expected to
    match at the same positions (TTTTT is the reverse complement of AAAAA).
    """
    ht = khmer.Hashbits(5, 1, 1)
    ht.add_stop_tag("AAAAA")

    # (query sequence, expected positions)
    expectations = (
        ("AAAAA", [0]),
        ("AAAAAA", [0, 1]),
        ("TTTTT", [0]),
        ("TTTTTT", [0, 1]),
    )
    for sequence, positions in expectations:
        assert ht.identify_stoptags_by_position(sequence) == positions
Ejemplo n.º 22
0
def test_find_radius_for_volume():
    """Check find_radius_for_volume() from 'AAAA' on the all-A.fa input.

    The second argument varies over 0, 1 and 2 while the third stays at
    100; the expected results are 0, 0 and 100 respectively.
    """
    inpfile = utils.get_test_data('all-A.fa')
    ht = khmer.Hashbits(4, 1e6, 2)
    ht.consume_fasta(inpfile)

    # (second argument, expected return value)
    for target, expected in ((0, 0), (1, 0), (2, 100)):
        assert ht.find_radius_for_volume('AAAA', target, 100) == expected
Ejemplo n.º 23
0
def _path_is_present(table, path):
    """Return True when at least THRESHOLD of *path*'s k-mers are in *table*."""
    n_kmers = len(path) - K + 1
    if n_kmers <= 0:
        # A path shorter than K has no k-mers; treat the present fraction
        # as zero instead of dividing by zero when n_kmers == 0.
        return 0.0 >= THRESHOLD
    n_present = sum(1 for i in range(n_kmers) if table.get(path[i:i + K]))
    return n_present / float(n_kmers) >= THRESHOLD


def main():
    """Write the paths present in exactly one of two FASTA-derived tables.

    ``sys.argv[1]`` and ``sys.argv[2]`` are loaded into separate tables;
    each record of ``sys.argv[3]`` is written to ``<basename1>.uniq`` if it
    is present only in the first table, or to ``<basename2>.uniq`` if it
    is present only in the second.  Records present in both or in neither
    are skipped.
    """
    filename1 = sys.argv[1]
    filename2 = sys.argv[2]
    paths = sys.argv[3]

    # Context managers make sure both outputs are flushed and closed.
    with open(os.path.basename(filename1) + '.uniq', 'w') as uniq1, \
            open(os.path.basename(filename2) + '.uniq', 'w') as uniq2:
        kh1 = khmer.Hashbits(K, HASHTABLE_SIZE, N_HT)
        kh1.consume_fasta(filename1)
        kh2 = khmer.Hashbits(K, HASHTABLE_SIZE, N_HT)
        kh2.consume_fasta(filename2)

        for record in screed.open(paths):
            present1 = _path_is_present(kh1, record.sequence)
            present2 = _path_is_present(kh2, record.sequence)

            if present1 and not present2:
                print('>%s\n%s' % (record.name, record.sequence), file=uniq1)
            elif present2 and not present1:
                print('>%s\n%s' % (record.name, record.sequence), file=uniq2)
Ejemplo n.º 24
0
def test_extract_unique_paths_0():
    """extract_unique_paths() returns the whole query, then nothing.

    On an empty table the full sequence comes back as a single path; once
    the same sequence has been consumed, no path is returned.
    """
    kh = khmer.Hashbits(10, 1e5, 4)
    query = 'ATGGAGAGACACAGATAGACAGGAGTGGCGATG'

    assert kh.extract_unique_paths(query, 10, 1) == [query]

    kh.consume(query)
    assert not kh.extract_unique_paths(query, 10, 1)
Ejemplo n.º 25
0
def test_count_kmer_degree():
    """Check kmer_degree() for four 4-mers after loading all-A.fa."""
    inpfile = utils.get_test_data('all-A.fa')
    ht = khmer.Hashbits(4, 1e6, 2)
    ht.consume_fasta(inpfile)

    # (k-mer, expected degree)
    expected_degrees = (
        ('AAAA', 2),
        ('AAAT', 1),
        ('AATA', 0),
        ('TAAA', 1),
    )
    for kmer, degree in expected_degrees:
        assert ht.kmer_degree(kmer) == degree
Ejemplo n.º 26
0
    def test_connected_20_b(self):
        """random-20-b.fa must partition into a single component at K=20."""
        fasta_path = utils.get_test_data('random-20-b.fa')

        graph = khmer.Hashbits(20, 1e4, 4)
        graph.consume_fasta_and_tag(fasta_path)

        whole_subset = graph.do_subset_partition(0, 0)
        # connected @ 20
        assert graph.subset_count_partitions(whole_subset) == (1, 0)
Ejemplo n.º 27
0
    def test_connected_31_c(self):
        """random-31-c.fa must partition into a single component at K=31."""
        fasta_path = utils.get_test_data('random-31-c.fa')

        graph = khmer.Hashbits(31, 1e5, 4)
        graph.consume_fasta_and_tag(fasta_path)

        whole_subset = graph.do_subset_partition(0, 0)
        # connected @ K = 31
        assert graph.subset_count_partitions(whole_subset) == (1, 0)
Ejemplo n.º 28
0
def main():
    """Keep the reads that share k-mers with the input sequences.

    The input file is loaded into a ``Hashbits`` table; every read of at
    least K bases whose ``get_median_count`` first element is non-zero is
    written in FASTA form to ``<basename of read file>.sweep2``.
    """
    parser = build_construct_args()
    parser.add_argument('input_filename')
    parser.add_argument('read_filename')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MAX_TABLESIZE:
            print("** WARNING: hashsize is default!  "
                  "You absodefly want to increase this!\n** "
                  "Please read the docs!", file=sys.stderr)

        print('\nPARAMETERS:', file=sys.stderr)
        print(' - kmer size =    %d \t\t(-k)' % args.ksize, file=sys.stderr)
        print(' - n hashes =     %d \t\t(-N)' % args.n_hashes, file=sys.stderr)
        print(' - min hashsize = %-5.2g \t(-x)' %
              args.min_hashsize, file=sys.stderr)
        print('', file=sys.stderr)
        print('Estimated memory usage is %.2g bytes '
              '(n_hashes x min_hashsize / 8)' % (
                  args.n_hashes * args.min_hashsize / 8.), file=sys.stderr)
        print('-' * 8, file=sys.stderr)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    inp = args.input_filename
    readsfile = args.read_filename

    outfile = os.path.basename(readsfile) + '.sweep2'

    # The context manager flushes and closes the output even on errors.
    with open(outfile, 'w') as outfp:
        # create a hashbits data structure
        ht = khmer.Hashbits(K, HT_SIZE, N_HT)

        # load contigs, connect into N partitions
        print('loading input reads from', inp)
        ht.consume_fasta(inp)

        print('starting sweep.')

        n = 0  # reads examined (those of length >= K)
        m = 0  # reads written to the output
        for record in screed.open(readsfile):
            if len(record.sequence) < K:
                continue

            if n % 10000 == 0:
                print('...', n, m)

            count = ht.get_median_count(record.sequence)[0]
            if count:
                m += 1
                outfp.write('>%s\n%s\n' % (record.name, record.sequence))
            n += 1
Ejemplo n.º 29
0
def create_nodegraph(args, ksize=None, multiplier=1.0):
    """Build a ``khmer.Hashbits`` from parsed command-line arguments.

    *ksize* falls back to ``args.ksize`` when not given.  A k-mer size
    above 32 prints an error and exits with status 1.  The table size comes
    from ``_calculate_tablesize`` and the table count from
    ``args.n_tables``.
    """
    chosen_ksize = args.ksize if ksize is None else ksize

    if chosen_ksize > 32:
        print_error("\n** ERROR: khmer only supports k-mer sizes <= 32.\n")
        sys.exit(1)

    size_per_table = _calculate_tablesize(
        args, 'nodegraph', multiplier=multiplier)
    return khmer.Hashbits(chosen_ksize, size_per_table, args.n_tables)
Ejemplo n.º 30
0
    def test_merge_from_disk_file_version(self):
        """merge_subset_from_disk() must raise OSError on badversion-k12.ht."""
        graph = khmer.Hashbits(20, 4 ** 4 + 1, 2)
        bad_file = utils.get_test_data('badversion-k12.ht')

        try:
            graph.merge_subset_from_disk(bad_file)
        except OSError as e:
            print(str(e))
        else:
            # Loading succeeded although the file should have been rejected.
            assert 0, "this should fail"