def test_simple_kadian_2():
    hi = khmer.new_counting_hash(6, 1e6, 2)
    hi.consume("ACTGCTATCTCTAGAGCTATG")
    assert hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG") == 1

    hi = khmer.new_counting_hash(6, 1e6, 2)
    hi.consume("ACTGCTATCTCTAGAGCTATG")
    # hi.consume("ACaGCTATCTCTAGAGCTATG")
    hi.consume("ACAGCTATCTCTAGAGCTATG")
    #           --^
    x = hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG")
    assert x == 2, x

    hi = khmer.new_counting_hash(6, 1e6, 2)
    hi.consume("ACTGCTATCTCTAGAGCTATG")
    # hi.consume("ACaGCTATCTCTAGAcCTATG")
    hi.consume("ACAGCTATCTCTAGACCTATG")
    #           --^          --^
    x = hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG")
    assert x == 1, x

    hi = khmer.new_counting_hash(6, 1e6, 2)
    hi.consume("ACTGCTATCTCTAGAGCTATG")
    # hi.consume("ACTGCTATCgCTAGAGCTATG")
    hi.consume("ACTGCTATCGCTAGAGCTATG")
    #                  --^
    x = hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG")
    assert x == 2, x
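
These assertions pin down what get_kadian_count does: as I read them, it sorts the counts of every k-mer in the query and returns roughly the (nk * k)-th smallest, so a single point mutation (which disturbs at most k overlapping k-mers) cannot drag the result down, and larger nk tolerates more errors. A pure-Python sketch of that reading, consistent with the asserts in these tests but not khmer's actual implementation:

def kadian_model(get_count, seq, k, nk=1):
    # collect and sort the count of every k-mer in the sequence
    counts = sorted(get_count(seq[i:i + k])
                    for i in range(len(seq) - k + 1))
    # skip past the nk * k smallest counts; one point mutation can
    # depress at most k of them, so the result survives nk errors
    return counts[nk * k]
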
def test_2_kadian():
    hi = khmer.new_counting_hash(6, 1e6, 2)
    hi.consume("ACTGCTATCTCTAGAGCTATG")
    assert hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG", 2) == 1

    hi = khmer.new_counting_hash(6, 1e6, 2)
    hi.consume("ACTGCTATCTCTAGAGCTATG")
    # hi.consume("ACTGCTATCTCTAGAcCTATG")
    hi.consume("ACTGCTATCTCTAGACCTATG")
    #           ---------------^
    x = hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG", 2)
    assert x == 2, x

    hi = khmer.new_counting_hash(6, 1e6, 2)
    hi.consume("ACTGCTATCTCTAGAGCTATG")
    # hi.consume("ACTGCTATCTCTAGAcCTAtG")
    hi.consume("ACTGCTATCTCTAGACCTATG")
    #           ---------------^---^
    assert hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG", 2) == 2

    hi = khmer.new_counting_hash(6, 1e6, 2)
    hi.consume("ACTGCTATCTCTAGAGCTATG")
    # hi.consume("ACTGCTATCTCTACtcCTAtG")
    hi.consume("ACTGCTATCTCTACTCCTATG")
    #           --------------^^---^
    x = hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG", 2)
    assert x == 2, x

    hi = khmer.new_counting_hash(6, 1e6, 2)
    hi.consume("ACTGCTATCTCTAGAGCTATG")
    # hi.consume("ACTGCTgTCTCTACtcCTAtG")
    hi.consume("ACTGCTGTCTCTACTCCTATG")
    #           ------^-------^^---^
    x = hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG", 2)
    assert x == 1, x
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
    parser.add_argument('-s', '--sampling-rate', type=int, default=10000)
    parser.add_argument('-M', '--max-reads', type=int, default=None)
    parser.add_argument('-m', '--min-kmer-count', type=int, default=2)
    parser.add_argument('rrna_file')
    parser.add_argument('reads_file')
    parser.add_argument('output')
    
    args = parser.parse_args()

    K = args.ksize
    rrna_file = args.rrna_file
    reads_file = args.reads_file
    output = args.output

    print 'reading', rrna_file
    ht = khmer.new_counting_hash(K, 1e8)
    ht.consume_fasta(rrna_file)

    print 'iterating over kmers'
    unique_kmers = set()
    for record in screed.open(rrna_file):
        seq = record.sequence
        for i in range(len(seq) - K + 1):
            kmer = seq[i:i+K]
            count = ht.get(kmer)
            if count >= args.min_kmer_count:
                unique_kmers.add(kmer)

    print len(unique_kmers), 'unique kmers'

    ###

    fp = open(output, 'w')
    ht = khmer.new_counting_hash(K, 1e10)
    total_bp = 0
    for n, record in enumerate(screed.open(reads_file)):
        ht.consume(record.sequence)
        total_bp += len(record.sequence)

        if n % args.sampling_rate == 0:
            if args.max_reads and n > args.max_reads:
                break
            i = 0
            for kmer in unique_kmers:
                if ht.get(kmer) > 0:
                    i += 1

            pct = float(i) / float(len(unique_kmers)) * 100.
            print '...', n, total_bp, i, pct
            print >>fp, n, total_bp, i, pct
def main():
    htReads_filename = sys.argv[1]
    htExons_filename = sys.argv[2]
    contig_filename = sys.argv[3]

    print >>sys.stderr, 'loading ht from', htReads_filename
    htReads = khmer.new_counting_hash(K, 1, N_HT)
    htReads.load(htReads_filename)
    print >> sys.stderr, 'loading ht from', htExons_filename
    htExons = khmer.new_counting_hash(K, 1, N_HT)
    htExons.load(htExons_filename)

#    countExons = htExons.n_entries()
#    countReads = htReads.n_entries()
#    print countExons
#    print countReads

    print >> sys.stderr, 'Beginning kmer count'

    for record in screed.open(contig_filename):
        seq = record.sequence.upper()
        if 'N' in seq:
            seq = seq.replace('N', 'G')
    
        medianCounts = []
        for i in range(len(seq) - K + 1):
            a, b, c = htReads.get_median_count(seq[i:i + K])
            d, e, f = htExons.get_median_count(seq[i:i + K])
            # keep read-table counts only for k-mers rare in the exon table
            if d < 2 and a > 0:
                medianCounts.append(a)

        if len(medianCounts) > len(seq) / 10:
            medianCounts.sort()
            average = float(sum(medianCounts)) / len(medianCounts)
            mid = len(medianCounts) // 2
            if len(medianCounts) % 2:
                median = medianCounts[mid]
            else:
                median = (medianCounts[mid - 1] + medianCounts[mid]) / 2.0
        else:
            median = -1
            average = -1

        print '%s %6.0f %6.0f %1.0f' % (record.name, median, average, len(seq))
def main():
    parser = build_common_args()
    parser.add_argument('output_filename')
    parser.add_argument('input_filenames', nargs='+')

    args = parse_args(parser)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    base = args.output_filename
    filenames = args.input_filenames

    print 'Saving hashtable to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    ###
    
    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)
    ht.set_use_bigcount(True)

    for n, filename in enumerate(filenames):
        print 'consuming input', filename
        ht.consume_fasta(filename)

        if n > 0 and n % 10 == 0:
            print 'mid-save', base
            ht.save(base)
            open(base + '.info', 'w').write('through %s' % filename)

    print 'saving', base
    ht.save(base)
    open(base + '.info', 'w').write('through end: %s' % filename)
def test_counting_load_bigcount():
    count_table = khmer.new_counting_hash(10, 1e5, 4)
    count_table.set_use_bigcount(True)
    for i in range(500):
        print i, count_table.count('ATATATATAT')
    count = count_table.get('ATATATATAT')
    assert count == 500
def main():
    counting_ht = sys.argv[1]
    infiles = sys.argv[2:]
        
    print 'file with ht: %s' % counting_ht
    print '-- settings:'
    print 'N THREADS', WORKER_THREADS
    print '--'

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, 1, 1)
    ht.load(counting_ht)

    for infile in infiles:
        print 'filtering', infile
        outfile = infile + '.abundfilt'

        outfp = open(outfile, 'w')

        def process_fn(record, ht=ht):
            name = record['name']
            seq = record['sequence']
            if 'N' in seq:
                return None, None

            trim_seq, trim_at = ht.trim_on_abundance(seq, 2)

            if trim_at >= K:
                return name, trim_seq

            return None, None

        tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE)

        tsp.start(verbose_fasta_iter(infile), outfp)
def main(filename):
    global ht

    basename = os.path.basename(filename)

    print 'input file to partition: %s' % filename
    print '-- settings:'
    print 'K', K
    print 'HASHTABLE SIZE %g' % HASHTABLE_SIZE
    print 'N HASHTABLES %d' % N_HT
    print '--'

    ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)

    ht.consume_fasta(filename)

    counting = khmer.new_counting_hash(K, COUNTING_SIZE, N_HT)
    ht.traverse_from_reads(filename, 100, 5000, 5, counting)

    print 'saving stoptags binary'
    ht.save_stop_tags(basename + '.stoptags')
    print 'saving stoptags text'
    ht.print_stop_tags(basename + '.stoptags.txt')

    sys.exit(0)
def test_partition_overlap_2():
    kh = khmer.new_counting_hash(20, 1e4, 4)
    for i in range(10):
        kh.consume_and_tag(a)

    for i in range(5):
        kh.consume_and_tag(b)

    # this will get paths only in 'a'
    p1 = kh.do_subset_partition_with_abundance(10, 50)

    # this will get paths only in 'b'
    p2 = kh.do_subset_partition_with_abundance(5, 10)

    # p1.report_on_partitions()
    # p2.report_on_partitions()

    x = p1.compare_partitions(3, p2, 3)
    assert x == (8, 6, 0), x

    x = p1.compare_partitions(3, p2, 5)
    assert x == (2, 0, 6), x

    x = p1.partition_sizes()
    assert x == ([(3, 8)], 0), x

    x = p2.partition_sizes()
    assert x == ([(3, 6), (5, 6)], 2), x

    x = p1.partition_average_coverages(kh)
    assert x == [(3, 11)]

    x = p2.partition_average_coverages(kh)
    assert x == [(3, 5), (5, 10)], x
def test_find_spectral_error_positions_4():
    hi = khmer.new_counting_hash(8, 1e6, 2)

    hi.consume(DNA)

    posns = hi.find_spectral_error_positions(DNA, 2)
    assert posns == [], posns
def main():
    info("filter-abund-single.py", ["counting"])
    args = get_parser().parse_args()
    check_file_status(args.datafile)
    check_space([args.datafile])
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize)
    report_on_config(args)

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.threads * 64 * 1024)

    print "making k-mer counting table"
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables, args.threads)

    # first, load reads into hash table
    rparser = khmer.ReadParser(args.datafile, args.threads)
    threads = []
    print "consuming input, round 1 --", args.datafile
    for _ in xrange(args.threads):
        cur_thread = threading.Thread(
            target=htable.consume_fasta_with_reads_parser,
            args=(rparser, ))
        threads.append(cur_thread)
        cur_thread.start()

    for thread in threads:
        thread.join()

    fp_rate = khmer.calc_expected_collisions(htable)
    print "fp rate estimated to be %1.3f" % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record["name"]
        seq = record["sequence"]
        if "N" in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    print "filtering", args.datafile
    outfile = os.path.basename(args.datafile) + ".abundfilt"
    outfp = open(outfile, "w")

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print "output in", outfile

    if args.savetable:
        print "Saving k-mer counting table filename", args.savetable
        print "...saving to", args.savetable
        htable.save(args.savetable)
def test_get_raw_tables():
    ht = khmer.new_counting_hash(20, 1e5, 4)
    tables = ht.get_raw_tables()

    for size, table in zip(ht.hashsizes(), tables):
        assert isinstance(table, buffer)
        assert size == len(table)
def test_consume_absentfasta():
    countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)
    try:
        countingtable.consume_fasta("absent_file.fa")
        assert 0, "This should fail"
    except IOError as err:
        print str(err)
def test_get_badkadian_count():
    countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)
    try:
        countingtable.get_kadian_count()
        assert 0, "this should fail"
    except TypeError as err:
        print str(err)
def test_badhashsizes():
    countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)
    try:
        countingtable.hashsizes(True)
        assert 0, "this should fail"
    except TypeError as err:
        print str(err)
def test_badsave():
    countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)
    try:
        countingtable.save()
        assert 0, "this should fail"
    except TypeError as err:
        print str(err)
def main():
    ht_filename = sys.argv[1]
    contig_filename = sys.argv[2]

    print >>sys.stderr, 'loading ht from', ht_filename
    ht = khmer.new_counting_hash(K, 1, N_HT)
    ht.load(ht_filename)

    partition_counts = {}

    for record in screed.open(contig_filename):
        seq = record.sequence.upper()
        if 'N' in seq:
            seq = seq.replace('N', 'G')

        a, b, c = ht.get_median_count(seq)

        partition = record.name.strip().split()[-1]

        x = partition_counts.get(partition, [])
        x.append(a)
        partition_counts[partition] = x

    for k, x in partition_counts.iteritems():
        if len(x) < PARTITION_SIZE_LIMIT:
            continue

        fp = open('partition%s.counts' % k, 'w')
        for i in x:
            fp.write("%s\n" % i)
        fp.close()
def test_consume_fasta_and_tag():
    countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)
    try:
        countingtable.consume_fasta_and_tag()
        assert 0, "this should fail"
    except TypeError as err:
        print str(err)
def test_badconsume_high_abund_kmers():
    countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)
    try:
        countingtable.consume_high_abund_kmers()
        assert 0, "this should fail"
    except TypeError as err:
        print str(err)
def count(contigs1, contigs2):
    ht = khmer.new_counting_hash(K, HASHTABLE_SIZE, N_HT)
    count = 0
    count2 = 0
    count3 = 0
    for n in contigs1:
        if len(n) >= LENGTH_THRESHOLD:
            kmer1 = slidingWindow(n, K)
            for x in kmer1:
                count += 1
                if ht.get(x):
                    continue
                ht.consume(x)

    for n in contigs2:
        if len(n) >= LENGTH_THRESHOLD:
            kmer2 = slidingWindow(n, K)
            for x in kmer2:
                count2 += 1
                if ht.get(x) > 0:
                    count3 += 1

    # 'count' is the total number of k-mers in the first file,
    # 'count2' the total number of k-mers in the second file, and
    # 'count3' the number of k-mers from the second file that also
    # appear in the first.
    print count, count2, count3, "%.1f%%" % (count3 / float(count) * 100.)
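
The helper slidingWindow is not included in this example; a minimal generator consistent with how it is used here, yielding each K-length substring in turn, might look like this (an assumption, not the original helper):

def slidingWindow(seq, K):
    # yield every K-length window of seq, advancing one base at a time
    for i in range(len(seq) - K + 1):
        yield seq[i:i + K]
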
def test_consume_absentfasta_with_reads_parser():
    countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)
    try:
        countingtable.consume_fasta_with_reads_parser()
        assert 0, "this should fail"
    except TypeError as err:
        print str(err)
def main():
    parser = build_common_args()
    parser.add_argument('input_filenames', nargs='+')
    parser.add_argument('-C', '--cutoff', type=int, dest='cutoff',
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash',
                        default='')

    args = parse_args(parser)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    DESIRED_COVERAGE = args.cutoff

    input_name_list = args.input_filenames

    if args.loadhash:
        print 'loading hashtable from', args.loadhash
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print 'making hashtable'
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    total = 0
    discarded = 0
    for input_filename in input_name_list:
        output_name = os.path.basename(input_filename) + '.keep'
        outfp = open(output_name, 'w')

        for n, record in enumerate(screed.open(input_filename)):
            if n > 0 and n % 10000 == 0:
                print '... kept', total - discarded, 'of', total, ', or', \
                    int(100. - discarded / float(total) * 100.), '%'
                print '... in file', input_filename

            total += 1

            if len(record.sequence) < K:
                continue

            seq = record.sequence.replace('N', 'A')

            med, _, _ = ht.get_median_count(seq)

            if med < DESIRED_COVERAGE:
                ht.consume(seq)
                outfp.write('>%s\n%s\n' % (record.name, record.sequence))
            else:
                discarded += 1

        print 'DONE with', input_filename, '; kept', total - discarded, 'of', \
            total, 'or', int(100. - discarded / float(total) * 100.), '%'

    if args.savehash:
        print 'Saving hashfile through', input_filename
        print '...saving to', args.savehash
        ht.save(os.path.basename(args.savehash))
def test_trim_full():
    hi = khmer.new_counting_hash(6, 1e6, 2)

    hi.consume(DNA)
    hi.consume(DNA)

    seq, pos = hi.trim_on_abundance(DNA, 2)
    assert DNA == seq, seq
def test_consume_fasta_and_tag():
    countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)
    try:
        countingtable.consume_fasta_and_tag()
        assert 0, "this should fail"
    except TypeError as err:
        print str(err)
    countingtable.consume_fasta_and_tag(utils.get_test_data("test-graph2.fa"))
def test_bigcount_overflow():
    kh = khmer.new_counting_hash(18, 1e7, 4)
    kh.set_use_bigcount(True)

    for i in range(0, 70000):
        kh.count('GGTTGACGGGGCTCAGGG')

    assert kh.get('GGTTGACGGGGCTCAGGG') == MAX_BIGCOUNT
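
For contrast, without set_use_bigcount(True) the counters should saturate much earlier; a hedged sketch of the expected default behavior, assuming MAX_COUNT is khmer's one-byte ceiling of 255:

kh = khmer.new_counting_hash(18, 1e7, 4)
for i in range(0, 70000):
    kh.count('GGTTGACGGGGCTCAGGG')
# without bigcount, the 8-bit counter stops at MAX_COUNT, not MAX_BIGCOUNT
assert kh.get('GGTTGACGGGGCTCAGGG') == MAX_COUNT
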
def main():
    parser = argparse.ArgumentParser(
        description="Find an initial set of highly connected k-mers.")

    parser.add_argument('--n_hashes', '-N', type=int, dest='n_hashes',
                        default=DEFAULT_COUNTING_HT_N,
                        help='number of counting hash tables to use')
    parser.add_argument('--hashsize', '-x', type=float, dest='min_hashsize',
                        default=DEFAULT_COUNTING_HT_SIZE,
                        help='lower bound on counting hashsize to use')
    parser.add_argument('--subset-size', '-s', default=DEFAULT_SUBSET_SIZE,
                        dest='subset_size', type=float,
                        help='Set subset size (default 1e4 is prob ok)')
    parser.add_argument('--stoptags', '-S', dest='stoptags', default='',
                        help="Use stoptags in this file during partitioning")

    parser.add_argument('graphbase')

    args = parser.parse_args()

    graphbase = args.graphbase

    print 'loading ht %s.ht' % graphbase
    ht = khmer.load_hashbits(graphbase + '.ht')

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print 'loading stoptags from', args.stoptags
        ht.load_stop_tags(args.stoptags)

    print 'loading tagset %s.tagset...' % graphbase
    ht.load_tagset(graphbase + '.tagset')

    K = ht.ksize()
    counting = khmer.new_counting_hash(K, args.min_hashsize, args.n_hashes)

    # divide up into SUBSET_SIZE fragments
    divvy = ht.divide_tags_into_subsets(args.subset_size)

    # pick off the first one
    if len(divvy) == 1:
        start, end = 0, 0
    else:
        start, end = divvy[:2]

    # partition!
    print 'doing pre-partitioning from', start, 'to', end
    subset = ht.do_subset_partition(start, end)

    # now, repartition...
    print 'repartitioning to find HCKs.'
    ht.repartition_largest_partition(subset, counting,
                                     EXCURSION_DISTANCE,
                                     EXCURSION_KMER_THRESHOLD,
                                     EXCURSION_KMER_COUNT_THRESHOLD)

    print 'saving stop tags'
    ht.save_stop_tags(graphbase + '.stoptags')
def test_load_notexist_should_fail():
    savepath = utils.get_temp_filename('tempcountingsave0.ht')

    hi = khmer.new_counting_hash(12, 1000)
    try:
        hi.load(savepath)
        assert 0, "load should fail"
    except IOError as e:
        print str(e)
def test_load_notexist_should_fail():
    savepath = utils.get_temp_filename('temphashbitssave0.ht')

    hi = khmer.new_counting_hash(12, 2)
    try:
        hi.load(savepath)
        assert 0, "load should fail"
    except IOError:
        pass
def test_badtrim():
    countingtable = khmer.new_counting_hash(6, 1e6, 2)

    countingtable.consume(DNA)
    try:
        countingtable.trim_on_abundance()
        assert 0, "this should fail"
    except TypeError as err:
        print str(err)
def test_find_spectral_error_positions_5():
    hi = khmer.new_counting_hash(8, 1e6, 2)

    hi.consume(DNA)
    hi.consume(DNA[:10])
    hi.consume(DNA[11:])

    posns = hi.find_spectral_error_positions(DNA, 1)
    assert posns == [10], posns
def test_bad_use_bigcount():
    countingtable = khmer.new_counting_hash(4, 4**4, 4)
    countingtable.set_use_bigcount(True)
    assert countingtable.get_use_bigcount()
    try:
        countingtable.get_use_bigcount(True)
        assert 0, "this should fail"
    except TypeError as err:
        print str(err)
def test_maxcount_with_bigcount_save():
    # hashtable should not saturate, if use_bigcount is set.
    kh = khmer.new_counting_hash(4, 4 ** 4, 4)
    kh.set_use_bigcount(True)

    for i in range(0, 1000):
        kh.count('AAAA')
        c = kh.get('AAAA')

    savepath = utils.get_temp_filename('tempcountingsave.ht')
    kh.save(savepath)

    kh = khmer.new_counting_hash(1, 1, 1)
    kh.load(savepath)

    c = kh.get('AAAA')
    assert c == 1000, "should be able to count to 1000: %d" % c
    assert c != MAX_COUNT, c
def test_median_too_short():
    hi = khmer.new_counting_hash(6, 1e6, 2)

    hi.consume("AAAAAA")
    try:
        hi.get_median_count("A")
        assert 0, "this should fail"
    except ValueError:
        pass
def test_badtrim():
    countingtable = khmer.new_counting_hash(6, 1e6, 2)

    countingtable.consume(DNA)
    try:
        countingtable.trim_on_abundance()
        assert 0, "this should fail"
    except TypeError as err:
        print str(err)
def test_load_notexist_should_fail():
    savepath = utils.get_temp_filename('temphashbitssave0.ht')

    hi = khmer.new_counting_hash(12, 2)
    try:
        hi.load(savepath)
        assert 0, "load should fail"
    except IOError:
        pass
def test_bad_use_bigcount():
    countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)
    countingtable.set_use_bigcount(True)
    assert countingtable.get_use_bigcount()
    try:
        countingtable.get_use_bigcount(True)
        assert 0, "this should fail"
    except TypeError as err:
        print str(err)
def test_load_gz_notexist_should_fail():
    savepath = utils.get_temp_filename('tempcountingsave0.ht.gz')

    hi = khmer.new_counting_hash(12, 1000)
    try:
        hi.load(savepath)
        assert 0, "load should fail"
    except IOError as e:
        print str(e)
def test_nobigcount_save():
    kh = khmer.new_counting_hash(4, 4**4, 4)
    # kh.set_use_bigcount(False) <-- this is the default

    savepath = utils.get_temp_filename('tempcountingsave.ht')
    kh.save(savepath)

    kh = khmer.new_counting_hash(1, 1, 1)
    kh.load(savepath)

    # set_use_bigcount should still be False after load (i.e. should be saved)

    assert kh.get('AAAA') == 0

    for i in range(0, 1000):
        kh.count('AAAA')
        kh.get('AAAA')

    assert kh.get('AAAA') == MAX_COUNT
def test_counting_gz_file_version_check():
    ht = khmer.new_counting_hash(12, 1, 1)

    inpath = utils.get_test_data('badversion-k12.ct.gz')

    try:
        ht.load(inpath)
        assert 0, "this should fail"
    except IOError as e:
        print str(e)
def test_get_raw_tables_view():
    ht = khmer.new_counting_hash(20, 1e5, 4)
    tables = ht.get_raw_tables()
    for tab in tables:
        memv = memoryview(tab)
        assert sum(memv.tolist()) == 0
    ht.consume('AAAATTTTCCCCGGGGAAAA')
    for tab in tables:
        memv = memoryview(tab)
        assert sum(memv.tolist()) == 1
def test_counting_file_type_check():
    inpath = utils.get_test_data('goodversion-k12.ht')

    kh = khmer.new_counting_hash(12, 1, 1)

    try:
        kh.load(inpath)
        assert 0, "this should fail"
    except IOError as e:
        print str(e)
def test_trim_short():
    hi = khmer.new_counting_hash(6, 1e6, 2)

    hi.consume(DNA)
    hi.consume(DNA[:50])

    seq, pos = hi.trim_on_abundance(DNA, 2)
    assert DNA[:50] == seq, (seq, pos)
    assert hi.get(seq[-6:]) == 2
    assert hi.get(DNA[:51][-6:]) == 1
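
Taken together, the two trim tests suggest trim_on_abundance(seq, cutoff) returns the longest prefix whose k-mers all have count >= cutoff, along with the trim position. A pure-Python model consistent with these assertions (my reading of the tests, not khmer's code):

def trim_on_abundance_model(get_count, seq, k, cutoff):
    # scan k-mers left to right; cut just before the first low-count one
    for i in range(len(seq) - k + 1):
        if get_count(seq[i:i + k]) < cutoff:
            trim_at = i + k - 1
            return seq[:trim_at], trim_at
    return seq, len(seq)
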
def main():
    parser = build_construct_args()
    parser.add_argument('-l',
                        '--lower-cutoff',
                        type=int,
                        dest='lower_cutoff',
                        default=DEFAULT_LOWER_CUTOFF)
    parser.add_argument('-u',
                        '--upper-cutoff',
                        type=int,
                        dest='upper_cutoff',
                        default=DEFAULT_UPPER_CUTOFF)

    parser.add_argument('output_filename')
    parser.add_argument('input_filename')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print >> sys.stderr, "** WARNING: hashsize is default!  You absodefly want to increase this!\n** Please read the docs!"

        print >> sys.stderr, '\nPARAMETERS:'
        print >> sys.stderr, ' - kmer size =    %d \t\t(-k)' % args.ksize
        print >> sys.stderr, ' - n hashes =     %d \t\t(-N)' % args.n_hashes
        print >> sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize
        print >> sys.stderr, ''
        print >> sys.stderr, 'Estimated memory usage is %.2g bytes (n_hashes x min_hashsize)' % (
            args.n_hashes * args.min_hashsize)
        print >> sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    output = args.output_filename
    input_filename = args.input_filename

    print 'lower cutoff:', args.lower_cutoff
    print 'upper cutoff:', args.upper_cutoff
    print 'Saving stoptags to %s' % output
    print 'Loading sequences in %s' % input_filename

    ###

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)
    ht.set_use_bigcount(True)

    print 'consuming input', input_filename
    hb = ht.collect_high_abundance_kmers(input_filename, args.lower_cutoff,
                                         args.upper_cutoff)

    print 'saving stoptags', output
    hb.save_stop_tags(output)
def count_median(K, HT_SIZE, N_HT, filename, fileout):
    # count the sequences so we can report at every 5% of the file
    count = 0
    for n, record in enumerate(screed.open(filename)):
        count += 1
    max_count = count / 20
    print max_count

    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)
    ht.set_use_bigcount(True)

    seq_count = 0
    median_array = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    med = dict((median, 0) for median in median_array)

    count = 0
    for n, record in enumerate(screed.open(filename)):
        sequence = record['sequence']
        ht.consume(sequence)
        seq_count += 1
        if seq_count == max_count:
            count += 1
            number_of_sequence_consumed = max_count * count

            # re-scan the sequences consumed so far and tally how many
            # have a median k-mer count equal to each value in median_array
            counted_sequence = 0
            for n2, record2 in enumerate(screed.open(filename)):
                counted_sequence += 1
                sequence2 = record2['sequence']
                a, b, c = ht.get_median_count(sequence2)
                if a in med:
                    med[a] += 1
                if counted_sequence == number_of_sequence_consumed:
                    break

            fileout_obj = open(fileout, 'a')
            print_line = str(number_of_sequence_consumed)
            for median in median_array:
                print_line = print_line + '\t' + str(med[median]) + '\t'
            print_line = print_line + '\n'
            fileout_obj.write(print_line)
            fileout_obj.close()

            seq_count = 0
            med = dict((median, 0) for median in median_array)
def test_find_spectral_error_positions_6():
    hi = khmer.new_counting_hash(8, 1e6, 2)

    hi.consume(DNA)
    hi.consume(DNA[1:])

    for n in range(len(DNA) - 8 + 1):
        print n, hi.get(DNA[n:n + 8])

    posns = hi.find_spectral_error_positions(DNA, 1)
    assert posns == [0], posns
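
Across the find_spectral_error_positions examples in this collection, the return value looks like the error positions implied by transitions between trusted and untrusted runs of k-mer counts; notably, when every k-mer is untrusted (as in the cutoff-2 case above), no position is reported at all. A sketch of that reading, inferred from the asserts rather than from khmer's algorithm:

def spectral_errors_model(get_count, seq, k, max_count):
    counts = [get_count(seq[i:i + k]) for i in range(len(seq) - k + 1)]
    posns = []
    for i in range(1, len(counts)):
        if counts[i - 1] > max_count and counts[i] <= max_count:
            posns.append(i + k - 1)   # entering an untrusted run
        elif counts[i - 1] <= max_count and counts[i] > max_count:
            if not posns or posns[-1] != i - 1:
                posns.append(i - 1)   # leaving an untrusted run
    return posns
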
def main():

    info('make-initial-stoptags.py', ['graph'])
    args = get_parser().parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase + '.pt', graphbase + '.tagset']
    if args.stoptags:
        infiles.append(args.stoptags)
    for _ in infiles:
        check_file_status(_)

    check_space(infiles)

    print >>sys.stderr, 'loading htable %s.pt' % graphbase
    htable = khmer.load_hashbits(graphbase + '.pt')

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print >>sys.stderr, 'loading stoptags from', args.stoptags
        htable.load_stop_tags(args.stoptags)

    print >>sys.stderr, 'loading tagset %s.tagset...' % graphbase
    htable.load_tagset(graphbase + '.tagset')

    ksize = htable.ksize()
    counting = khmer.new_counting_hash(ksize, args.min_tablesize,
                                       args.n_tables)

    # divide up into SUBSET_SIZE fragments
    divvy = htable.divide_tags_into_subsets(args.subset_size)

    # pick off the first one
    if len(divvy) == 1:
        start, end = 0, 0
    else:
        start, end = divvy[:2]

    # partition!
    print >>sys.stderr, 'doing pre-partitioning from', start, 'to', end
    subset = htable.do_subset_partition(start, end)

    # now, repartition...
    print >>sys.stderr, 'repartitioning to find HCKs.'
    htable.repartition_largest_partition(subset, counting,
                                         EXCURSION_DISTANCE,
                                         EXCURSION_KMER_THRESHOLD,
                                         EXCURSION_KMER_COUNT_THRESHOLD)

    print >>sys.stderr, 'saving stop tags'
    htable.save_stop_tags(graphbase + '.stoptags')
    print >> sys.stderr, 'wrote to:', graphbase + '.stoptags'
def main():
    parser = build_construct_args()
    add_threading_args(parser)

    parser.add_argument('datafile')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    n_threads = int(args.n_threads)

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)

    filename = args.datafile

    ### first, load reads into hash table
    rparser = khmer.ReadParser(filename, n_threads)
    threads = []
    print 'consuming input, round 1 --', filename
    for tnum in xrange(n_threads):
        t = threading.Thread(
            target=ht.consume_fasta_with_reads_parser,
            args=(rparser, ))
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    ### now, count.
    total = 0
    total_unique = 0
    for n, record in enumerate(screed.open(filename)):
        total += 1
        last_kmer = record.sequence[-K:]
        count = ht.get(last_kmer)
        if count == 1:
            total_unique += 1

    print 'singletons: %d unique; of %d total; %.3f' % \
        (total_unique, total, total_unique/float(total))
def test_bigcount_save():
    # hashtable should not saturate, if use_bigcount is set.
    kh = khmer.new_counting_hash(4, 4**4, 4)
    kh.set_use_bigcount(True)

    savepath = utils.get_temp_filename('tempcountingsave.ht')
    kh.save(savepath)

    kh = khmer.new_counting_hash(1, 1, 1)
    kh.load(savepath)

    # set_use_bigcount should still be True after load (i.e. should be saved)

    assert kh.get('AAAA') == 0

    for i in range(0, 1000):
        kh.count('AAAA')
        kh.get('AAAA')

    assert kh.get('AAAA') == 1000
def get_composition(seq, kmers, norm):
    counting_hash = khmer.new_counting_hash(4, 2000, 1)
    counting_hash.consume(seq)
    composition = [counting_hash.get(kmer) for kmer in kmers]
    if norm:
        total = sum(composition)
        composition_norm = [
            str(number * 1.0 / total) for number in composition
        ]
        composition = composition_norm
    return composition
def test_median_at_least_single_lt():
    K = 20
    hi = khmer.new_counting_hash(K, 1e6, 2)

    kmers = [
        'ATCGATCGATCGATCGATCG', 'GTACGTACGTACGTACGTAC', 'TTAGTTAGTTAGTTAGTTAG'
    ]

    for kmer in kmers:
        hi.consume(kmer)
        assert hi.median_at_least(kmer, 2) is False
def process_file(filename, HT_SIZE_array):

    N_HT = 4
    K = 12

    list_average_miscount = []
    list_average_miscount_perc = []
    list_fp_miscount0 = []

    print filename
    for HT_SIZE in HT_SIZE_array:
        print HT_SIZE
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)
        ht.consume_fasta(filename)

        ktable = khmer.new_ktable(K)
        f = screed.open(filename)
        for record in f:
            sequence = record['sequence']
            ktable.consume(sequence)

        list_miscount = []
        list_miscount_perc = []
        total_kmer = 0  # total number of unique k-mers
        miscount0 = 0

        for i in range(0, ktable.n_entries()):
            n = ktable.get(i)
            if n:
                total_kmer += 1
                kmer2 = ktable.reverse_hash(i)
                # the counting hash can only overestimate due to
                # collisions, so miscount should never be negative
                miscount = ht.get(kmer2) - ktable.get(kmer2)
                miscount_perc = miscount / float(ktable.get(kmer2))
                list_miscount.append(miscount)
                list_miscount_perc.append(miscount_perc)
                if miscount > 0:
                    miscount0 += 1

        average_miscount = float(sum(list_miscount)) / len(list_miscount)
        list_average_miscount.append(average_miscount)
        average_miscount_perc = float(
            sum(list_miscount_perc)) / len(list_miscount_perc)
        list_average_miscount_perc.append(average_miscount_perc)

        fp_miscount0 = float(miscount0) / total_kmer
        list_fp_miscount0.append(fp_miscount0)

    to_return = [
        list_average_miscount, list_fp_miscount0, total_kmer,
        list_average_miscount_perc
    ]
    return to_return
def test_alignerrorregion():
    ch = khmer.new_counting_hash(10, 1048576, 1)
    read = "AAAAAGTTCGAAAAAGGCACG"
    aligner = khmer.new_readaligner(ch, 1, 20, 11)
    for i in range(20):
        ch.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
    ch.consume("ACTATTAAAAAAGTTCGAAAAAGGCACGGG")
    graphAlign, readAlign = aligner.align(read)

    assert readAlign == ''
    assert graphAlign == ''
def test_median_at_least_even_lt():
    K = 20
    hi = khmer.new_counting_hash(K, 1e6, 2)

    seqs = [
        'ATCGATCGATCGATCGATCGCCC', 'GTACGTACGTACGTACGTACCCC',
        'TTAGTTAGTTAGTTAGTTAGCCC'
    ]

    for seq in seqs:
        hi.consume(seq)
        assert hi.median_at_least(seq, 2) is False
def test_partition_on_abundance_2():
    kh = khmer.new_counting_hash(20, 1e3, 4)
    for i in range(10):
        print kh.consume_and_tag(a)

    for i in range(5):
        print kh.consume_and_tag(b)

    # all paths in 'a'
    p = kh.do_subset_partition_with_abundance(10, 50)
    x = p.count_partitions()
    assert x == (1, 6)  # one partition, six disconnected
def get_composition(seq, kmers, norm):
    """ get the composition profile, add one extra count to avoid 0 count"""
    counting_hash = khmer.new_counting_hash(4, 2000, 1)
    counting_hash.consume(seq)
    composition = [counting_hash.get(kmer) + 1 for kmer in kmers]
    if norm:
        total = sum(composition)
        composition_norm = [
            str(number * 1.0 / total) for number in composition
        ]
        composition = composition_norm
    return composition
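
A hypothetical usage sketch for either get_composition variant; the k-mer list and sequence here are illustrative, not from the original:

import itertools

# enumerate all 256 tetramers and profile a sequence against them
kmers = [''.join(p) for p in itertools.product('ACGT', repeat=4)]
profile = get_composition('ACGTACGTGGAACCTTACGT', kmers, norm=True)
# with norm=True, 'profile' is a list of 256 stringified frequencies
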
def test_alignnocov():
    ch = khmer.new_counting_hash(10, 1048576, 1)
    read = "ACCTAGGTTCGACATGTACC"
    aligner = khmer.new_readaligner(ch)
    for i in range(20):
        ch.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
    ch.consume("ACCTAGGTTCGACATGTACC")
    graphAlign, readAlign = aligner.align(read)

    # should be the same
    assert readAlign == 'ACCTAGGTTCGACATGTACC'
    assert graphAlign == 'ACCTAGGTTCGACATGTACC'
def test_get_badkadian_count():
    countingtable = khmer.new_counting_hash(4, 4**4, 4)
    try:
        countingtable.get_kadian_count()
        assert 0, "this should fail"
    except TypeError as err:
        print str(err)
    try:
        countingtable.get_kadian_count("AAA")
        assert 0, "this should fail"
    except ValueError as err:
        print str(err)
def test_alignnocov():
    ch = khmer.new_counting_hash(10, 1048576, 1)
    read = "ACCTAGGTTCGACATGTACC"
    aligner = khmer.ReadAligner(ch, 0, 0)
    for i in range(20):
        ch.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
    ch.consume("ACCTAGGTTCGACATGTACC")
    score, graphAlign, readAlign, trunc = aligner.align(read)

    # should be the same
    eq_(readAlign, 'ACCTAGGTTCGACATGTACC')
    eq_(graphAlign, 'ACCTAGGTTCGACATGTACC')
def test_hashbits_file_type_check():
    kh = khmer.new_counting_hash(12, 1, 1)
    savepath = utils.get_temp_filename('tempcountingsave0.kh')
    kh.save(savepath)

    ht = khmer.new_hashbits(12, 1, 1)

    try:
        ht.load(savepath)
        assert 0, "this should fail"
    except IOError as e:
        print str(e)
def test_find_spectral_error_locs7():
    K = 8
    hi = khmer.new_counting_hash(K, 1e6, 2)

    hi.consume(DNA)
    hi.consume(DNA[K:])

    for n in range(len(DNA) - 8 + 1):
        print(n, hi.get(DNA[n:n + 8]))

    posns = hi.find_spectral_error_positions(DNA, 1)
    assert posns == [7], posns