Example #1
0
def test_simple_kadian_2():
    """Check ``get_kadian_count`` of a read after consuming variant reads.

    Every scenario starts from a fresh table (k=6) that has consumed the
    reference read once; a variant read may then be consumed before the
    reference is queried with the default kadian parameter.
    """
    reference = "ACTGCTATCTCTAGAGCTATG"

    def kadian_after(*variants):
        # Fresh table per scenario so counts never carry over.
        table = khmer.CountingHash(6, 1e6, 2)
        table.consume(reference)
        for read in variants:
            table.consume(read)
        return table.get_kadian_count(reference)

    # Reference alone.
    assert kadian_after() == 1

    # One substitution near the start (index 2, T -> A).
    x = kadian_after("ACAGCTATCTCTAGAGCTATG")
    assert x == 2, x

    # Two substitutions (index 2, T -> A and index 15, G -> C).
    x = kadian_after("ACAGCTATCTCTAGACCTATG")
    assert x == 1, x

    # One substitution in the middle (index 9, T -> G).
    x = kadian_after("ACTGCTATCGCTAGAGCTATG")
    assert x == 2, x
Example #2
0
def test_save_load_gz():
    """Save a counting table to a ``.gz`` path, reload it, and compare.

    The abundance distribution computed from the reloaded table must be
    identical to the one computed from the table that was saved.
    """
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('tempcountingsave2.ht.gz')

    table_sizes = list(PRIMES_1m) + [1000005]

    saved = khmer.CountingHash(12, table_sizes)
    saved.consume_fasta(inpath)
    saved.save(savepath)

    reloaded = khmer.CountingHash(12, table_sizes)
    try:
        reloaded.load(savepath)
    except IOError as err:
        assert 0, 'Should not produce an IOError: ' + str(err)

    # Each distribution gets its own, empty tracking structure.
    dist_saved = saved.abundance_distribution(
        inpath, khmer._Hashbits(12, table_sizes))
    dist_reloaded = reloaded.abundance_distribution(
        inpath, khmer._Hashbits(12, table_sizes))

    total = sum(dist_saved)
    assert total == 3966, total
    assert dist_saved == dist_reloaded, (dist_saved, dist_reloaded)
Example #3
0
def test_load_gz():
    """Load a counting table from a file that was gzip-compressed by hand.

    The table is saved uncompressed, compressed with the ``gzip`` module,
    and then loaded from the ``.gz`` path. The abundance distribution of the
    loaded table must match that of the original table.
    """
    inpath = utils.get_test_data('random-20-a.fa')

    savepath = utils.get_temp_filename('tempcountingsave1.ht')
    loadpath = utils.get_temp_filename('tempcountingsave1.ht.gz')

    sizes = list(PRIMES_1m)
    sizes.append(1000005)

    # save uncompressed hashtable.
    hi = khmer.CountingHash(12, sizes)
    hi.consume_fasta(inpath)
    hi.save(savepath)

    # compress. Context managers guarantee both handles are closed (and the
    # gzip stream is finalised) even if the copy raises part-way through.
    with open(savepath, 'rb') as in_file:
        with gzip.open(loadpath, 'wb') as out_file:
            out_file.writelines(in_file)

    # load compressed hashtable.
    ht = khmer.CountingHash(12, sizes)
    ht.load(loadpath)

    tracking = khmer._Hashbits(12, sizes)
    x = hi.abundance_distribution(inpath, tracking)

    tracking = khmer._Hashbits(12, sizes)
    y = ht.abundance_distribution(inpath, tracking)

    assert sum(x) == 3966, sum(x)
    assert x == y, (x, y)
Example #4
0
def test_2_kadian():
    """Check ``get_kadian_count(read, 2)`` after consuming variant reads.

    Every scenario starts from a fresh table (k=6) that has consumed the
    reference read once; a variant read may then be consumed before the
    reference is queried with a kadian parameter of 2.
    """
    reference = "ACTGCTATCTCTAGAGCTATG"

    def kadian_after(*variants):
        # Fresh table per scenario so counts never carry over.
        table = khmer.CountingHash(6, 1e6, 2)
        table.consume(reference)
        for read in variants:
            table.consume(read)
        return table.get_kadian_count(reference, 2)

    # Reference alone.
    assert kadian_after() == 1

    # One substitution (index 15, G -> C).
    x = kadian_after("ACTGCTATCTCTAGACCTATG")
    assert x == 2, x

    # Same variant read as the previous scenario.
    assert kadian_after("ACTGCTATCTCTAGACCTATG") == 2

    # Three adjacent substitutions (indices 13-15, GAG -> CTC).
    x = kadian_after("ACTGCTATCTCTACTCCTATG")
    assert x == 2, x

    # As above plus one more substitution (index 6, A -> G).
    x = kadian_after("ACTGCTGTCTCTACTCCTATG")
    assert x == 1, x
Example #5
0
def test_very_short_read():
    """Consume ``test-short.fa`` with two k sizes and check the returned counts.

    ``consume_fasta`` must report one read in both cases, with zero k-mers
    at k=9 and exactly one k-mer at k=8.
    """
    short_filename = utils.get_test_data('test-short.fa')

    for ksize, expected_kmers in ((9, 0), (8, 1)):
        table = khmer.CountingHash(ksize, 4, 1)
        n_reads, n_kmers = table.consume_fasta(short_filename)
        assert n_reads == 1, n_reads
        assert n_kmers == expected_kmers, n_kmers
Example #6
0
def test_get_kmer_hashes():
    """Check that hashes from ``get_kmer_hashes`` can be looked up with ``get``.

    A single table (k=6) accumulates reads step by step; after each step the
    hashes of a query sequence are fetched and the count stored under each
    hash is compared with the expected value.
    """
    table = khmer.CountingHash(6, 1e6, 2)

    # (read to consume, sequence to hash, expected count per returned hash)
    steps = [
        ("AAAAAA", "AAAAAA", [1]),
        ("AAAAAA", "AAAAAA", [2]),
        ("AAAAAT", "AAAAAAT", [2, 1]),
        ("AAAAAT", "AAAAAAT", [2, 2]),
        ("AAAAAT", "AAAAAAT", [2, 3]),
    ]

    for read, query, expected in steps:
        table.consume(read)
        hashes = table.get_kmer_hashes(query)
        print(hashes)
        assert len(hashes) == len(expected)
        for hashval, want in zip(hashes, expected):
            assert table.get(hashval) == want
Example #7
0
def test_consume_absentfasta():
    """``consume_fasta`` on a file name that does not exist must raise OSError."""
    table = khmer.CountingHash(4, 4 ** 4, 4)
    try:
        table.consume_fasta("absent_file.fa")
    except OSError as err:
        print(str(err))
    else:
        assert 0, "This should fail"
Example #8
0
def test_badconsume_and_tag():
    """``consume_and_tag`` called without arguments must raise TypeError."""
    table = khmer.CountingHash(4, 4 ** 4, 4)
    try:
        table.consume_and_tag()
    except TypeError as err:
        print(str(err))
    else:
        assert 0, "this should fail"
Example #9
0
    def test_abund(self):
        """Exercise ``consume_fasta`` and ``output_fasta_kmer_pos_freq``.

        Checks that both methods raise ``TypeError`` when called without
        arguments, that ``consume_fasta`` raises ``OSError`` for a missing
        file, and that the per-position frequency output for
        ``test-abund-read.fa`` is a single line of ``'1'`` tokens.
        """
        ht = khmer.CountingHash(10, 4**10, 1)

        filename = utils.get_test_data('test-abund-read.fa')
        outname = utils.get_temp_filename('test_abund.out')

        ht.consume_fasta(filename)
        try:
            ht.consume_fasta()
            assert 0, "should fail"
        except TypeError as err:
            print(str(err))
        try:
            ht.consume_fasta("nonexistent")
            assert 0, "should fail"
        except OSError as err:
            print(str(err))
        ht.output_fasta_kmer_pos_freq(filename, outname)
        try:
            ht.output_fasta_kmer_pos_freq()
            assert 0, "should fail"
        except TypeError as err:
            print(str(err))

        # Read the file inside a context manager so the handle is released
        # even when one of the assertions below fails.
        with open(outname, "r") as fd:
            output = fd.readlines()

        assert len(output) == 1

        output = output[0]
        output = output.strip().split()

        # 114 - 10 + 1 tokens: presumably one per 10-mer of a 114-base read
        # (the read length is not visible here -- confirm against the data).
        assert ['1'] * (114 - 10 + 1) == output
Example #10
0
def test_3_tables():
    """Check ``get`` on a three-table counting hash with engineered collisions.

    The table sizes are ``PRIMES_1m`` plus 1000005. Three k-mers are chosen
    so that each one's forward hash equals ``hash(GG)`` modulo one of the
    sizes named in the comments below. The reported count of ``GG`` must
    stay at 1 until all three have been consumed, and is then 2.
    """
    table_sizes = list(PRIMES_1m) + [1000005]
    table = khmer.CountingHash(12, table_sizes)

    GG = 'G' * 12
    collision_1 = 'AAACGTATGACT'   # hash(GG) % 1000003 == hash(collision_1)
    collision_2 = 'AAATACCGAGCG'   # hash(GG) % 1009837 == hash(collision_2)
    collision_3 = 'AAACGTATCGAG'   # hash(GG) % 1000005 == hash(collision_3)

    # Pin the forward hashes the collision construction relies on.
    expected_hashes = [
        (GG, 11184810),
        (collision_1, 184777),
        (collision_2, 76603),
        (collision_3, 184755),
    ]
    for kmer, hashval in expected_hashes:
        assert khmer.forward_hash(kmer, 12) == hashval

    # Count reported for GG after consuming each k-mer in turn.
    consume_plan = [
        (GG, 1),
        (collision_1, 1),
        (collision_2, 1),
        (collision_3, 2),
    ]
    for kmer, gg_count in consume_plan:
        table.consume(kmer)
        assert table.get(GG) == gg_count
Example #11
0
def test_get_raw_tables():
    """Each raw table must be a ``memoryview`` as long as its hash size."""
    counting = khmer.CountingHash(20, 1e5, 4)
    raw_tables = counting.get_raw_tables()
    expected_sizes = counting.hashsizes()

    for expected_len, view in zip(expected_sizes, raw_tables):
        assert isinstance(view, memoryview)
        assert expected_len == len(view)
Example #12
0
def test_find_spectral_error_positions_4():
    """After consuming ``DNA`` once, a cutoff of 2 must yield no positions."""
    table = khmer.CountingHash(8, 1e6, 2)
    table.consume(DNA)

    positions = table.find_spectral_error_positions(DNA, 2)
    assert positions == [], positions
Example #13
0
def test_badhashsizes():
    """``hashsizes`` called with an argument must raise TypeError."""
    table = khmer.CountingHash(4, 4 ** 4, 4)
    try:
        table.hashsizes(True)
    except TypeError as err:
        print(str(err))
    else:
        assert 0, "this should fail"
Example #14
0
def test_get_kmer_counts():
    """Check the per-k-mer counts returned by ``get_kmer_counts``.

    A single table (k=6) accumulates reads step by step; after each step the
    counts for a query sequence are compared with the expected list.
    """
    table = khmer.CountingHash(6, 1e6, 2)

    # (read to consume, sequence to query, expected counts in order)
    steps = [
        ("AAAAAA", "AAAAAA", [1]),
        ("AAAAAA", "AAAAAA", [2]),
        ("AAAAAT", "AAAAAAT", [2, 1]),
        ("AAAAAT", "AAAAAAT", [2, 2]),
        ("AAAAAT", "AAAAAAT", [2, 3]),
    ]

    for read, query, expected in steps:
        table.consume(read)
        counts = table.get_kmer_counts(query)
        print(counts)
        assert len(counts) == len(expected)
        for position, want in enumerate(expected):
            assert counts[position] == want
Example #15
0
def test_partition_overlap_2():
    """Partition by abundance range and compare the resulting subsets.

    Sequence ``a`` is consumed 10 times and ``b`` 5 times, then two subset
    partitions are built with different abundance windows and their
    overlap, sizes and average coverages are checked.
    """
    counting = khmer.CountingHash(20, 1e4, 4)
    for _ in range(10):
        counting.consume_and_tag(a)
    for _ in range(5):
        counting.consume_and_tag(b)

    # this will get paths only in 'a'
    subset_a = counting.do_subset_partition_with_abundance(10, 50)

    # this will get paths only in 'b'
    subset_b = counting.do_subset_partition_with_abundance(5, 10)

    overlap = subset_a.compare_partitions(3, subset_b, 3)
    assert overlap == (8, 6, 0), overlap

    overlap = subset_a.compare_partitions(3, subset_b, 5)
    assert overlap == (2, 0, 6), overlap

    sizes = subset_a.partition_sizes()
    assert sizes == ([(3, 8)], 0), sizes

    sizes = subset_b.partition_sizes()
    assert sizes == ([(3, 6), (5, 6)], 2), sizes

    coverages = subset_a.partition_average_coverages(counting)
    assert coverages == [(3, 11)]

    coverages = subset_b.partition_average_coverages(counting)
    assert coverages == [(3, 5), (5, 10)], coverages
Example #16
0
def test_median_at_least_exception():
    """``median_at_least`` on a 13-base sequence with k=20 must raise ValueError."""
    table = khmer.CountingHash(20, 1e6, 2)
    try:
        table.median_at_least('ATGGCTGATCGAT', 1)
    except ValueError:
        pass
    else:
        assert 0, "should have thrown ValueError"
Example #17
0
def test_64bitshift():
    """With k=25, a substring of a consumed read must have a non-zero min count."""
    table = khmer.CountingHash(25, 4, 1)
    fullstr = "GTATGCCAGCTCCAACTGGGCCGGTACGAGCAGGCCATTGCCTCTTGCCGCGATGCGTCGGCG"
    substr = "ATGCCAGCTCCAACTGGGCCGGTACGAGCAGGCCATTGCCTCTTGC"

    table.consume(fullstr)
    assert 0 < table.get_min_count(substr), table.get_min_count(substr)
Example #18
0
def test_counting_load_bigcount():
    """With bigcount enabled, counting a k-mer 500 times must report 500."""
    kmer = 'ATATATATAT'
    table = khmer.CountingHash(10, 1e5, 4)
    table.set_use_bigcount(True)

    for iteration in range(500):
        returned = table.count(kmer)
        print(iteration, returned)

    total = table.get(kmer)
    assert total == 500
def main():
    """Collect high-abundance k-mers from a sequence file and save stop tags.

    Command-line arguments, added on top of the parser returned by
    ``build_construct_args()`` (from which ``quiet``, ``ksize``,
    ``n_hashes`` and ``min_hashsize`` are read):

    * ``-l`` / ``--lower-cutoff``: int, default ``DEFAULT_LOWER_CUTOFF``.
    * ``-u`` / ``--upper-cutoff``: int, default ``DEFAULT_UPPER_CUTOFF``.
    * ``output_filename``: file the stop tags are saved to.
    * ``input_filename``: file the sequences are loaded from.

    Unless ``quiet`` is set, a parameter summary is written to stderr.
    Progress messages go to stdout.
    """
    parser = build_construct_args()
    parser.add_argument('-l',
                        '--lower-cutoff',
                        type=int,
                        dest='lower_cutoff',
                        default=DEFAULT_LOWER_CUTOFF)
    parser.add_argument('-u',
                        '--upper-cutoff',
                        type=int,
                        dest='upper_cutoff',
                        default=DEFAULT_UPPER_CUTOFF)

    parser.add_argument('output_filename')
    parser.add_argument('input_filename')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MAX_TABLESIZE:
            print("** WARNING: hashsize is default!  "
                  "You absodefly want to increase this!\n** "
                  "Please read the docs!", file=sys.stderr)

        print('\nPARAMETERS:', file=sys.stderr)
        print(' - kmer size =    %d \t\t(-k)' % args.ksize, file=sys.stderr)
        print(' - n hashes =     %d \t\t(-N)' % args.n_hashes, file=sys.stderr)
        print(' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize,
              file=sys.stderr)
        print('', file=sys.stderr)
        # The two literals are joined by implicit concatenation. The previous
        # version mixed quote styles with a backslash continuation inside the
        # literal, which leaked stray quotes and padding into the message.
        print('Estimated memory usage is %.2g bytes '
              '(n_hashes x min_hashsize)' % (
                  args.n_hashes * args.min_hashsize),
              file=sys.stderr)
        print('-' * 8, file=sys.stderr)

    # Descriptive local names; avoids shadowing the ``input`` builtin.
    output_filename = args.output_filename
    input_filename = args.input_filename

    print('lower cutoff:', args.lower_cutoff)
    print('upper cutoff:', args.upper_cutoff)
    print('Saving stoptags to %s' % output_filename)
    print('Loading sequences in %s' % input_filename)

    ###

    print('making hashtable')
    ht = khmer.CountingHash(args.ksize, args.min_hashsize, args.n_hashes)
    ht.set_use_bigcount(True)

    print('consuming input', input_filename)
    hb = ht.collect_high_abundance_kmers(input_filename, args.lower_cutoff,
                                         args.upper_cutoff)

    print('saving stoptags', output_filename)
    hb.save_stop_tags(output_filename)
Example #20
0
def test_bigcount_overflow():
    """Counting a k-mer 70000 times with bigcount on must give ``MAX_BIGCOUNT``."""
    kmer = 'GGTTGACGGGGCTCAGGG'
    table = khmer.CountingHash(18, 1e7, 4)
    table.set_use_bigcount(True)

    for _ in range(70000):
        table.count(kmer)

    assert table.get(kmer) == MAX_BIGCOUNT
Example #21
0
def test_trim_full():
    """After consuming ``DNA`` twice, trimming at abundance 2 returns it whole."""
    table = khmer.CountingHash(6, 1e6, 2)
    for _ in range(2):
        table.consume(DNA)

    # The second element of the result is not checked by this test.
    trimmed, _position = table.trim_on_abundance(DNA, 2)
    assert DNA == trimmed, trimmed
Example #22
0
def test_consume_fasta_and_tag():
    """``consume_fasta_and_tag`` needs an argument and accepts a test file.

    The call without arguments must raise TypeError; the call with
    ``test-graph2.fa`` is then expected to run without raising.
    """
    table = khmer.CountingHash(4, 4 ** 4, 4)
    try:
        table.consume_fasta_and_tag()
    except TypeError as err:
        print(str(err))
    else:
        assert 0, "this should fail"

    graph_file = utils.get_test_data("test-graph2.fa")
    table.consume_fasta_and_tag(graph_file)
Example #23
0
def test_get_raw_tables_view():
    """Raw tables fetched before a ``consume`` must reflect the update.

    The views are obtained once, up front. Each must sum to 0 on the empty
    table and to 1 after a single 20-base read is consumed with k=20.
    """
    counting = khmer.CountingHash(20, 1e5, 4)
    views = counting.get_raw_tables()

    for view in views:
        assert sum(view.tolist()) == 0

    counting.consume('AAAATTTTCCCCGGGGAAAA')

    # Same view objects as above -- they are not re-fetched.
    for view in views:
        assert sum(view.tolist()) == 1
Example #24
0
def test_get_kmers():
    """``get_kmers`` must return the 6-mers of a sequence in order."""
    table = khmer.CountingHash(6, 1e6, 2)

    single = table.get_kmers("AAAAAA")
    assert single == ["AAAAAA"]

    pair = table.get_kmers("AAAAAAT")
    assert pair == ["AAAAAA", "AAAAAT"]
Example #25
0
def test_64bitshift_2():
    """Every 25-mer of a consumed read must have a positive count at k=25."""
    table = khmer.CountingHash(25, 4, 1)
    fullstr = "GTATGCCAGCTCCAACTGGGCCGGTACGAGCAGGCCATTGCCTCTTGCCGCGATGCGTCGGCG"

    table.consume(fullstr)

    window_count = len(fullstr) - 25 + 1
    for start in range(window_count):
        window = fullstr[start:start + 25]
        assert table.get(window) > 0
Example #26
0
def test_load_gz_notexist_should_fail():
    """Loading from a temp ``.gz`` path nothing was saved to must raise OSError."""
    missing_path = utils.get_temp_filename('tempcountingsave0.ht.gz')

    table = khmer.CountingHash(12, 1000, 2)
    try:
        table.load(missing_path)
    except OSError as err:
        print(str(err))
    else:
        assert 0, "load should fail"
Example #27
0
def test_median_too_short():
    """``get_median_count`` on a 1-base sequence with k=6 must raise ValueError."""
    table = khmer.CountingHash(6, 1e6, 2)
    table.consume("AAAAAA")

    try:
        table.get_median_count("A")
    except ValueError:
        pass
    else:
        assert 0, "this should fail"
Example #28
0
def test_bad_use_bigcount():
    """``get_use_bigcount`` reflects the setter and rejects arguments."""
    table = khmer.CountingHash(4, 4 ** 4, 4)
    table.set_use_bigcount(True)
    assert table.get_use_bigcount()

    try:
        table.get_use_bigcount(True)
    except TypeError as err:
        print(str(err))
    else:
        assert 0, "this should fail"
Example #29
0
def test_find_spectral_error_positions_5():
    """Skip index 10 when re-consuming ``DNA`` and expect it to be reported.

    ``DNA`` is consumed whole once, then again as the two pieces on either
    side of index 10. With a cutoff of 1 the reported positions must be
    ``[10]``.
    """
    table = khmer.CountingHash(8, 1e6, 2)

    head, tail = DNA[:10], DNA[11:]
    for fragment in (DNA, head, tail):
        table.consume(fragment)

    positions = table.find_spectral_error_positions(DNA, 1)
    assert positions == [10], positions
Example #30
0
def create_countgraph(args, ksize=None, multiplier=1.0):
    """Build a ``khmer.CountingHash`` from parsed command-line arguments.

    Parameters:
        args: object providing ``n_tables``, plus ``ksize`` when the
            ``ksize`` parameter is left as None; it is also passed on to
            ``_calculate_tablesize``.
        ksize: k-mer size to use; falls back to ``args.ksize`` when None.
        multiplier: forwarded to ``_calculate_tablesize``.

    Returns:
        The newly constructed ``khmer.CountingHash``.

    Prints an error and exits with status 1 when the k-mer size exceeds 32.
    """
    kmer_size = args.ksize if ksize is None else ksize

    if kmer_size > 32:
        print_error("\n** ERROR: khmer only supports k-mer sizes <= 32.\n")
        sys.exit(1)

    table_size = _calculate_tablesize(
        args, 'countgraph', multiplier=multiplier)
    return khmer.CountingHash(kmer_size, table_size, args.n_tables)