Beispiel #1
0
def test_bigcount_save():
    """Save a bigcount-enabled Countgraph, reload it, and count to 1000.

    The graph is saved right after ``set_use_bigcount(True)`` and loaded
    into a freshly built ``Countgraph(1, 1, 1)``. The final assertion only
    holds if the loaded graph can count 'AAAA' all the way up to 1000.
    """
    kmer = 'AAAA'
    n_counts = 1000

    # Build the graph with bigcount on and write it out without counting.
    graph = khmer.Countgraph(4, 4**4, 4)
    graph.set_use_bigcount(True)
    savepath = utils.get_temp_filename('tempcountingsave.ht')
    graph.save(savepath)

    # Reload into a graph constructed with different parameters.
    graph = khmer.Countgraph(1, 1, 1)
    try:
        graph.load(savepath)
    except OSError as err:
        assert 0, "Should not produce an OSError: " + str(err)

    # set_use_bigcount should still be True after load (i.e. should be saved)
    assert graph.get(kmer) == 0

    for _ in range(n_counts):
        graph.count(kmer)
        graph.get(kmer)

    assert graph.get(kmer) == n_counts
Beispiel #2
0
def test_badget_2():
    """Query present, absent, and wrong-length k-mers on a K=6 Countgraph.

    ``DNA`` is a module-level sequence defined outside this function.
    """
    graph = khmer.Countgraph(6, 1e6, 2)
    graph.consume(DNA)

    # (k-mer, expected count) pairs checked after consuming DNA.
    for kmer, expected in (("AGCTTT", 1), ("GATGAG", 0)):
        assert graph.get(kmer) == expected

    # A 5-character query on a K=6 graph is expected to raise ValueError.
    try:
        graph.get("AGCTT")
    except ValueError as err:
        print(str(err))
    else:
        assert 0, "this should fail"
Beispiel #3
0
def test_partition_on_abundance_1():
    """Tag sequences ``a`` and ``b`` ten times each and expect one partition.

    ``a`` and ``b`` are module-level sequences defined outside this function.
    """
    print((a, ))
    print((b, ))
    graph = khmer.Countgraph(20, 1e3, 4)

    # Consume 'a' ten times, then 'b' ten times, echoing each return value.
    for seq in (a, b):
        for _ in range(10):
            print(graph.consume_and_tag(seq))

    # all paths in 'a' and 'b'
    subset = graph.do_subset_partition_with_abundance(10, 50)
    result = subset.count_partitions()
    assert result == (1, 0)  # one partition, no remainders
Beispiel #4
0
def test_align_nothing():
    """Align a read after loading the graph with another sequence only.

    The alignment is expected to be flagged as truncated, with empty graph
    and read alignment strings.
    """
    graph = khmer.Countgraph(10, 1048576, 1)
    read = "ACCAAGGCTCGAGATTTACC"
    graph_seq = "AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT"

    aligner = khmer.ReadAligner(graph, 0, 0)
    for _ in range(20):
        graph.consume(graph_seq)

    score, graph_aln, read_aln, truncated = aligner.align(read)
    print(score, graph_aln, read_aln)

    assert truncated
    assert len(graph_aln) == 0
    assert len(read_aln) == 0
Beispiel #5
0
def test_bigcount_abund_dist():
    """Check the abundance distribution of a bigcount-enabled Countgraph.

    Consumes 'test-abund-read-2.fa' and asserts that the distribution
    holds exactly one entry at abundance 1002.
    """
    ksize = 18
    counts = khmer.Countgraph(ksize, 1e2, 4)
    tracking = khmer.Nodegraph(ksize, 1e2, 4)
    counts.set_use_bigcount(True)

    seqpath = utils.get_test_data('test-abund-read-2.fa')
    counts.consume_seqfile(seqpath)

    dist = counts.abundance_distribution(seqpath, tracking)
    print(counts.get('GGTTGACGGGGCTCAGGG'))

    # Non-zero (abundance, value) bins; only shown if the assertion fails.
    nonzero_bins = [(abund, n) for abund, n in enumerate(dist) if n]
    assert dist[1002] == 1, nonzero_bins
Beispiel #6
0
def test_align_middle():
    """Align a read that was itself consumed into the graph.

    Both alignment strings are expected to equal the read, and the result
    must not be flagged as truncated.
    """
    graph = khmer.Countgraph(10, 1048576, 1)
    read = "TCGACAAGTCCTTGACAGAT"
    aligner = khmer.ReadAligner(graph, trusted_cov_cutoff=0, bits_theta=0)

    graph_seq = "AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT"
    for _ in range(20):
        graph.consume(graph_seq)
    graph.consume(read)

    _score, graph_aln, read_aln, truncated = aligner.align(read)

    # should be the same
    eq_(read_aln, read)
    eq_(graph_aln, read)
    assert not truncated
Beispiel #7
0
def test_save_load_large(ctfile):
    """Round-trip a 2**31-sized Countgraph through save/load.

    ``ctfile`` is the file name handed to ``utils.get_temp_filename`` for
    the saved graph. The occupied-bin count must be 3966 and must be the
    same after loading.
    """
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename(ctfile)

    # Populate the graph from the test data and persist it.
    original = khmer.Countgraph(12, 2**31, 1)
    original.consume_seqfile(inpath)
    original.save(savepath)

    reloaded = Countgraph.load(savepath)

    n_before = original.n_occupied()
    n_after = reloaded.n_occupied()
    assert n_before == 3966, n_before
    assert n_after == n_before, n_after
Beispiel #8
0
def test_readalign():
    """Align a read against a graph built from two sequences.

    The first sequence is consumed twenty times and the second once; the
    returned read alignment must equal the read, and the graph alignment
    must equal the expected graph path.
    """
    graph = khmer.Countgraph(10, 1048576, 1)
    aligner = khmer.ReadAligner(graph, 1, 0)

    main_seq = "AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT"
    for _ in range(20):
        graph.consume(main_seq)

    read = "ACCTAGGTTCGACATGTACC"
    #                      ^^            ^  ^

    graph.consume("GCTTTTAAAAAGGTTCGACAAAGGCCCGGG")

    _score, graph_aln, read_aln, _ = aligner.align(read)

    eq_(read_aln, read)
    eq_(graph_aln, 'AGCTAGGTTCGACAAGTCCT')
Beispiel #9
0
def test_find_spectral_error_positions_err():
    """find_spectral_error_positions must raise ValueError on bad input.

    Two inputs are tried on a K=8 graph: the first six characters of the
    module-level ``DNA`` and a sequence that contains an 'N'.
    """
    graph = khmer.Countgraph(8, 1e6, 2)

    # (bad sequence, failure message used if no ValueError is raised)
    cases = (
        (DNA[:6], "should raise ValueError; too short"),
        ("ACGTACGN", "should raise ValueError; contains N"),
    )

    for bad_seq, message in cases:
        try:
            graph.find_spectral_error_positions(bad_seq, 1)
        except ValueError:
            continue
        assert 0, message
Beispiel #10
0
def test_save_load_occupied(ctfile):
    """Round-trip a Countgraph through save/load_countgraph.

    ``ctfile`` is the file name handed to ``utils.get_temp_filename`` for
    the saved graph. The occupied-bin count must be 3886 and must be the
    same after loading.
    """
    print('working with', ctfile)
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename(ctfile)

    # Populate the graph from the test data and persist it.
    original = khmer.Countgraph(12, 1e5, 4)
    original.consume_seqfile(inpath)
    original.save(savepath)

    reloaded = khmer.load_countgraph(savepath)

    n_before = original.n_occupied()
    n_after = reloaded.n_occupied()
    assert n_before == 3886, n_before
    assert n_after == n_before, n_after
Beispiel #11
0
def test_count_2():
    """Counting by k-mer string and by hash value must hit the same count.

    ``ARGS_1m`` is a module-level argument tuple defined outside this
    function.
    """
    graph = khmer.Countgraph(12, *ARGS_1m)
    kmer = 'G' * 12
    hashval = graph.hash(kmer)

    def check_both(expected):
        # The string lookup and the hash lookup must agree.
        assert graph.get(kmer) == expected
        assert graph.get(hashval) == expected

    check_both(0)

    graph.count(kmer)
    check_both(1)

    graph.count(hashval)  # count hashes same as strings
    check_both(2)
Beispiel #12
0
def test_consume_absentfasta_with_reads_parser():
    """consume_seqfile_with_reads_parser must fail on missing or empty input.

    First the method is called with no parser (TypeError expected), then
    with a ReadParser built on the 'empty-file' test data (OSError or
    ValueError expected).
    """
    graph = khmer.Countgraph(4, 4**4, 4)

    try:
        graph.consume_seqfile_with_reads_parser()
    except TypeError as err:
        print(str(err))
    else:
        assert 0, "this should fail"

    try:
        parser = ReadParser(utils.get_test_data('empty-file'))
        graph.consume_seqfile_with_reads_parser(parser)
    except (OSError, ValueError) as err:
        print(str(err))
    else:
        assert 0, "this should fail"
Beispiel #13
0
def test_badget():
    """Query present, absent, and wrong-length k-mers on a K=6 Countgraph."""
    graph = khmer.Countgraph(6, 4**10, 1)

    DNA = "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAG"
    graph.consume(DNA)

    # (k-mer, expected count) pairs checked after consuming DNA.
    for kmer, expected in (("AGCTTT", 1), ("GATGAG", 0)):
        assert graph.get(kmer) == expected

    # A 5-character query on a K=6 graph is expected to raise ValueError.
    try:
        graph.get("AGCTT")
    except ValueError as err:
        print(str(err))
    else:
        assert 0, "this should fail"
Beispiel #14
0
def test_align_fwd_middle_trunc_2():
    """align_forward on a read whose first 12 bases were not consumed.

    Only ``read[12:]`` is added to the graph, next to a sequence that
    starts with a run of 'A' instead of 'G'. The test expects empty
    alignments and a set truncation flag.
    """
    graph = khmer.Countgraph(10, 1048576, 1)
    read = "GGGGGGGGGGGGTCGACAAGTCCTTGACAGAT"
    aligner = khmer.ReadAligner(graph, 0, 0)

    graph_seq = "AAAAAAAAAAAATCGACAAGTCCTTGACAGAT"
    for _ in range(20):
        graph.consume(graph_seq)

    # omit prefix from graph
    read_suffix = read[12:]
    graph.consume(read_suffix)

    result = aligner.align_forward(read)
    _, graph_aln, read_aln, truncated, _ = result

    # this will fail, because align_forward chooses the first kmer as the
    # seed.
    assert not read_aln
    assert not graph_aln
    assert truncated
Beispiel #15
0
def test_maxcount():
    """With bigcount disabled, counting 'AAAA' must stop at MAX_COUNT.

    ``MAX_COUNT`` is a module-level constant defined outside this function.
    """
    # hashtable should saturate at some point so as not to overflow counter
    kmer = 'AAAA'
    graph = khmer.Countgraph(4, 4**4, 4)
    graph.set_use_bigcount(False)

    previous = None
    for _ in range(1000):
        graph.count(kmer)
        c = graph.get(kmer)

        # Stop as soon as another count() no longer changes the value.
        if previous == c:
            break
        previous = c

    assert c != 1000, "should not be able to count to 1000: %d" % c
    assert c == MAX_COUNT, c  # this will depend on HashcountType...
Beispiel #16
0
def test_maxcount_with_bigcount():
    """With bigcount enabled, counting 'AAAA' must reach 1000.

    ``MAX_COUNT`` is a module-level constant defined outside this function.
    """
    # hashtable should not saturate, if use_bigcount is set.
    kmer = 'AAAA'
    graph = khmer.Countgraph(4, 4**4, 4)
    graph.set_use_bigcount(True)

    previous = None
    for _ in range(1000):
        graph.count(kmer)
        c = graph.get(kmer)

        # A repeated value would mean the counter stopped growing.
        if previous == c:
            break
        previous = c

    assert c == 1000, "should be able to count to 1000: %d" % c
    assert c != MAX_COUNT, c
Beispiel #17
0
def test_maxcount():
    """Counting 'AAAA' up to 10000 times must stop at MAX_COUNT.

    ``MAX_COUNT`` is a module-level constant defined outside this function.
    """
    # hashtable should saturate at some point so as not to overflow counter
    kmer = 'AAAA'
    graph = khmer.Countgraph(4, 100, 1)

    previous = None
    for _ in range(10000):
        graph.count(kmer)
        c = graph.get(kmer)

        print(previous, c)
        # Stop as soon as another count() no longer changes the value.
        if previous == c:
            break
        previous = c

    assert c != 10000, "should not be able to count to 10000"
    assert c == MAX_COUNT  # this will depend on HashcountType...
Beispiel #18
0
def test_median_at_least_comp():
    """median_at_least must agree with comparing get_median_count to C.

    Each sequence is consumed three times before the two results are
    compared with an identity check.
    """
    K = 20
    C = 4
    graph = khmer.Countgraph(K, 1e6, 2)

    seqs = [
        'ATCGATCGATCGATCGATCGCCC',
        'GTACGTACGTACGTACGTACCCC',
        'TTAGTTAGTTAGTTAGTTAGCCC',
    ]

    for seq in seqs:
        for _ in range(3):
            graph.consume(seq)

        median, _, _ = graph.get_median_count(seq)
        meets_cutoff = median >= C
        assert graph.median_at_least(seq, C) is meets_cutoff
Beispiel #19
0
def test_partition_on_abundance_3():
    """Partition twice at different abundance ranges and check the second.

    ``first`` is tagged ten times and ``second`` five times; both are
    module-level sequences defined outside this function. Only the result
    of the second partitioning call is inspected.
    """
    graph = khmer.Countgraph(20, 1e4, 4)

    # (sequence, number of times it is consumed), in consumption order.
    for seq, repeats in ((first, 10), (second, 5)):
        for _ in range(repeats):
            print(graph.consume_and_tag(seq))

    # this will get paths only in 'a'
    subset = graph.do_subset_partition_with_abundance(10, 50)

    # this will get paths only in 'b'
    subset = graph.do_subset_partition_with_abundance(5, 10)

    counts = subset.count_partitions()
    print(counts)
    assert counts == (2, 2)                  # two partitions, two ignored tags
Beispiel #20
0
def test_simple_readalign():
    """Align a read against a graph built from two sequences.

    The first sequence is consumed twenty times and the second once; the
    graph alignment must equal the expected graph path and the read
    alignment must equal the read.
    """
    graph = khmer.Countgraph(10, 1048576, 1)
    aligner = khmer.ReadAligner(graph, 2, 0)

    main_seq = "AGAGGGAAAGCTAGGTTCGACATGTCCTTGACAGAT"
    for _ in range(20):
        graph.consume(main_seq)

    read = "ACCTAGGTTCGACAAGTACC"
    #                      ^^            ^  ^
    graph.consume("GCTTTTAAAAAGGTTCGACAAAGGCCCGGG")
    # CCCGGGCCTTTGTCGAACCTTTTTAAAAGC

    _score, graph_aln, read_aln, _trunc = aligner.align(read)

    #                        AGCTAGGTTCGACAAGT CCT
    #                        ACCTAGGTTCGACAAGTaCC
    #                        --CTAGGTTCGACATGT-CC
    eq_(graph_aln, 'AGCTAGGTTCGACATGTCCT')
    eq_(read_aln, read)
Beispiel #21
0
def main():
    """Stream FASTQ reads into a Countgraph and print ORFs from labeled paths.

    Reads shorter than ``K`` are skipped. Every other read is handled
    according to its median k-mer count (``cov``) in the graph:

    * ``cov < 20``: the read is consumed into the graph.
    * ``20 <= cov < 30``: the read is trimmed with
      ``trim_on_abundance(..., 3)``; if at least ``K`` bases remain it is
      consumed and labeled across its high-degree nodes with a new label.
    * ``cov == 30``: contigs are assembled from the read's first ``K``
      bases, translated, and every ORF not printed before goes to stdout.
    * anything else: the read is ignored.

    ``K``, ``translate`` and ``extract_orfs`` are defined outside this
    function.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('fastq_files', nargs='+')
    args = parser.parse_args()

    cg = khmer.Countgraph(K, 1e8, 4)

    kept = 0
    hdn = khmer.HashSet(K)
    labeler = khmer._GraphLabels(cg)
    next_label = 1
    next_orf = 1
    # Hashes of the ORFs already printed, so each ORF is emitted once.
    seen_orfs = set()

    for filename in args.fastq_files:
        for n, record in enumerate(screed.open(filename)):
            # Progress message on stderr every 10000 records.
            if n and n % 10000 == 0:
                print('...', n, file=sys.stderr)

            sequence = record.sequence
            if len(sequence) < K:
                continue

            cov, _, _ = cg.get_median_count(sequence)

            if cov < 20:
                kept += 1
                cg.consume(sequence)
                continue

            if cov < 30:
                trimmed, pos = cg.trim_on_abundance(sequence, 3)
                if len(trimmed) < K:
                    continue

                cg.consume(trimmed)
                hdn = cg.find_high_degree_nodes(trimmed)
                labeler.label_across_high_degree_nodes(trimmed, hdn,
                                                       next_label)
                next_label += 1
                continue

            if cov != 30:
                continue

            for contig in labeler.assemble_labeled_path(sequence[:K]):
                for translation in translate(contig):
                    for orf in extract_orfs(translation):
                        orf_key = hash(orf)
                        if orf_key in seen_orfs:
                            continue
                        seen_orfs.add(orf_key)
                        print('>orf%d\n%s' % (next_orf, orf))
                        next_orf += 1
Beispiel #22
0
def test_find_all_tags_list_error():
    """find_all_tags_list must raise ValueError for 3- and 5-character input.

    The graph is built with K=4, so neither query matches the k-mer size.
    """
    graph = khmer.Countgraph(4, 4**4, 4)

    # load each sequence but do not build tags - everything should be empty.
    seqfile = utils.get_test_data('test-graph2.fa')
    for record in screed.open(seqfile):
        graph.consume(record.sequence)

    for bad_kmer in ("ATA", "ATAGA"):
        try:
            graph.find_all_tags_list(bad_kmer)
        except ValueError:
            continue
        assert False, "a ValueError should be raised for incorrect k-mer size"
def test_badfasta_count_kmers_by_position():
    """fasta_count_kmers_by_position must reject bad argument lists.

    Three calls are checked: one with no arguments (TypeError expected) and
    two with a negative value in the second or third position (ValueError
    expected). Each call fails the test if it returns normally.
    """
    countgraph = khmer.Countgraph(4, 4**4, 4)

    # Without an explicit failure assertion this check would pass silently
    # when no TypeError is raised.
    try:
        countgraph.fasta_count_kmers_by_position()
    except TypeError as err:
        print(str(err))
    else:
        assert 0, "this should fail"

    filename = utils.get_test_data("test-short.fa")

    # One negative argument at a time, in the second and third positions.
    for bad_args in ((filename, -1, 0), (filename, 0, -1)):
        try:
            countgraph.fasta_count_kmers_by_position(*bad_args)
        except ValueError as err:
            print(str(err))
        else:
            assert 0, "this should fail"
Beispiel #24
0
def test_fakelump_load_stop_tags_trunc():
    """Loading a truncated stop-tags file must raise OSError.

    The test partitions 'fakelump.fa', repartitions the largest partition,
    saves the stop tags, cuts the saved file down to its first 10 bytes,
    and then expects ``load_stop_tags`` on that file to fail.
    """
    fakelump_fa = utils.get_test_data('fakelump.fa')
    fakelump_fa_foo = utils.get_temp_filename('fakelump.fa.stopfoo')

    ht = khmer.Nodegraph(32, 1e5, 4)
    ht.consume_seqfile_and_tag(fakelump_fa)

    subset = ht.do_subset_partition(0, 0)
    ht.merge_subset(subset)

    (n_partitions, _) = ht.count_partitions()
    assert n_partitions == 1, n_partitions

    # now, break partitions on any k-mer that you see more than once
    # on big excursions, where big excursions are excursions 40 out
    # that encounter more than 82 k-mers.  This should specifically
    # identify our connected sequences in fakelump...

    EXCURSION_DISTANCE = 40
    EXCURSION_KMER_THRESHOLD = 82
    EXCURSION_KMER_COUNT_THRESHOLD = 1
    counting = khmer.Countgraph(32, 1, 1, primes=[5, 7, 11, 13])

    ht.repartition_largest_partition(counting, EXCURSION_DISTANCE,
                                     EXCURSION_KMER_THRESHOLD,
                                     EXCURSION_KMER_COUNT_THRESHOLD)

    ht.save_stop_tags(fakelump_fa_foo)

    # Truncate the saved file to its first 10 bytes. Context managers make
    # sure both handles are closed even if reading or writing raises.
    with open(fakelump_fa_foo, 'rb') as fp:
        data = fp.read()

    with open(fakelump_fa_foo, 'wb') as fp:
        fp.write(data[:10])

    # ok, now try loading these stop tags; should fail.
    ht = khmer.Nodegraph(32, 1, 1, primes=[5, 7, 11, 13])
    ht.consume_seqfile_and_tag(fakelump_fa)

    try:
        ht.load_stop_tags(fakelump_fa_foo)
        assert 0, "this test should fail"
    except OSError:
        pass
Beispiel #25
0
def test_complete_2_collision():
    """Fill a tiny one-table Countgraph and expect every 4-mer to look present.

    One k-mer is counted per table entry; afterwards all k-mers obtained
    from ``reverse_hash(i, 4)`` for i in 0..127 must have a non-zero count.
    """
    ksize = 4
    graph = khmer.Countgraph(ksize, 7, 1)

    n_entries = graph.hashsizes()[0]
    for hashval in range(n_entries):
        graph.count(khmer.reverse_hash(hashval, ksize))

    # string hashing is rc aware
    n_rc_filled = sum(
        1 for hashval in range(128)
        if graph.get(khmer.reverse_hash(hashval, ksize))
    )

    assert n_rc_filled == 128, n_rc_filled
Beispiel #26
0
def test_partition_overlap_1():
    """Two identical abundance partitionings must compare as (0, 0, 14).

    ``a`` and ``b`` are module-level sequences defined outside this
    function; each is tagged ten times before partitioning.
    """
    graph = khmer.Countgraph(20, 1e3, 4)

    for seq in (a, b):
        for _ in range(10):
            graph.consume_and_tag(seq)

    # this will get paths only in 'a'
    first_subset = graph.do_subset_partition_with_abundance(10, 50)

    # this will get paths only in 'a', again -- should be the same!
    second_subset = graph.do_subset_partition_with_abundance(10, 50)

    comparison = first_subset.compare_partitions(3, second_subset, 3)
    assert comparison == (0, 0, 14), comparison
Beispiel #27
0
def test_maxcount_with_bigcount_save():
    """A count of 1000 made with bigcount on must survive save and reload.

    ``MAX_COUNT`` is a module-level constant defined outside this function.
    """
    # hashtable should not saturate, if use_bigcount is set.
    kmer = 'AAAA'
    graph = khmer.Countgraph(4, 4**4, 4)
    graph.set_use_bigcount(True)

    for _ in range(1000):
        graph.count(kmer)
        count = graph.get(kmer)

    savepath = utils.get_temp_filename('tempcountingsave.ht')
    graph.save(savepath)

    try:
        graph = khmer.load_countgraph(savepath)
    except OSError as err:
        assert 0, "Should not produce an OSError: " + str(err)

    # Re-read the count from the reloaded graph.
    count = graph.get(kmer)
    assert count == 1000, "should be able to count to 1000: %d" % count
    assert count != MAX_COUNT, count
Beispiel #28
0
def get_composition(ksize, seq, kmers, norm):
    """Return the k-mer composition profile of ``seq``.

    Args:
        ksize: k-mer size used to build the khmer Countgraph.
        seq: sequence consumed into the graph.
        kmers: iterable of k-mers to look up; the result follows its order.
        norm: when equal to ``True``, each count is divided by the sum of
            all counts.

    Returns:
        A list with one entry per k-mer in ``kmers``: raw counts, or
        fractions of the total when ``norm`` is set. ``None`` is returned
        if any step raises; the exception is logged.
    """
    try:
        # A single table with room for every possible k-mer plus 100 slots.
        tablesize = 4**ksize + 100
        counting_hash = khmer.Countgraph(ksize, tablesize, 1)
        counting_hash.consume(seq)
        composition = [counting_hash.get(kmer) for kmer in kmers]

        # Equality test kept on purpose so that non-bool flags behave as
        # they did before.
        if norm == True:  # noqa: E712
            total = float(sum(composition))
            # Zero counts stay 0.0, which also avoids dividing by a zero
            # total when nothing was found.
            composition = [
                float(item) / total if item != 0 else 0.0
                for item in composition
            ]
        return composition
    except Exception:
        # Only ordinary errors are swallowed; KeyboardInterrupt and
        # SystemExit propagate to the caller.
        logging.exception("Could not calculate composition using khmer")
        return None
Beispiel #29
0
    def test_two_components(self, random_sequence, K):
        """Each sequence becomes a component on its sixth consumption.

        ``random_sequence`` is called to produce two sequences (the second
        with ``exclude=comp1``) and ``K`` is the k-mer size of the graph.
        The partitioner is driven by a PartitionCoverage function built
        with ``coverage_cutoff=5``.
        """
        comp1 = Sequence(name='Comp1', sequence=random_sequence())
        comp2 = Sequence(name='Comp2', sequence=random_sequence(exclude=comp1))

        cg = khmer.Countgraph(K, 1e5, 4)
        ptnr = ConditionalPartitioner(cg)
        func = PartitionCoverage(coverage_cutoff=5, graph=cg,
                                 partitioner=ptnr)

        # Passes 0-4 must leave the component count at 0; pass 5 makes it 1.
        for pass_no in range(6):
            print(ptnr.consume(comp1, func=func))
            expected = 0 if pass_no < 5 else 1
            assert ptnr.n_components == expected

        # Same pattern for the second sequence: 1 for five passes, then 2.
        for pass_no in range(6):
            ptnr.consume(comp2, func=func)
            expected = 1 if pass_no < 5 else 2
            assert ptnr.n_components == expected
Beispiel #30
0
def count_query_kmers(read_file, kmer_hash, tmpdir, final_outfile):
    """Count query k-mers in a read file and write them to an output file.

    For every key ``k`` in ``kmer_hash`` the external ``khmer_path``
    command is run to build a graph file for ``read_file`` under
    ``tmpdir``. That file is loaded into a Countgraph, and one
    tab-separated "k-mer, count" line is written per k-mer listed under
    ``k``.

    Args:
        read_file: path passed to the external command as its input.
        kmer_hash: mapping from k-mer size to an iterable of k-mers.
        tmpdir: directory in which the per-k graph files are created.
        final_outfile: path of the output file; it is overwritten.

    ``khmer_path`` and ``echo`` are defined outside this function.
    """
    # The context manager closes the output file even when indexing,
    # loading or counting raises part-way through.
    with open(final_outfile, 'wb') as outfh:
        for k, current_kmer in kmer_hash.items():
            echo("k = %d" % (k))

            ## indexing CMD: python khmerEnv/bin/load-into-counting.py -k 15 -M 1e10 -T 1 -q kmers.graph readfile
            kmer_graph = tmpdir + "/" + str(k) + "mers.graph"
            # NOTE(review): the command line is assembled from unquoted
            # paths and run through the shell, and its exit status is
            # ignored. Confirm that read_file and tmpdir are trusted.
            os.system("%s -k %d -M 16G -T 1 -q %s %s" %
                      (khmer_path, k, kmer_graph, read_file))
            counts = khmer.Countgraph(k, 100000000000, 1)

            counts.load(kmer_graph)
            for mer in current_kmer:
                mer_count = counts.get(mer)
                line = "%s\t%d\n" % (mer, mer_count)
                # The file is opened in binary mode, so the text line has
                # to be encoded; writing str here fails on Python 3.
                outfh.write(line.encode("ascii"))