def test_simple_kadian_2():
    hi = khmer.new_counting_hash(6, 1e6, 2)
    hi.consume("ACTGCTATCTCTAGAGCTATG")
    assert hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG") == 1

    hi = khmer.new_counting_hash(6, 1e6, 2)
    hi.consume("ACTGCTATCTCTAGAGCTATG")
    # hi.consume("ACaGCTATCTCTAGAGCTATG")
    hi.consume("ACAGCTATCTCTAGAGCTATG")  # --^
    x = hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG")
    assert x == 2, x

    hi = khmer.new_counting_hash(6, 1e6, 2)
    hi.consume("ACTGCTATCTCTAGAGCTATG")
    # hi.consume("ACaGCTATCTCTAGAcCTATG")
    hi.consume("ACAGCTATCTCTAGACCTATG")  # --^          --^
    x = hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG")
    assert x == 1, x

    hi = khmer.new_counting_hash(6, 1e6, 2)
    hi.consume("ACTGCTATCTCTAGAGCTATG")
    # hi.consume("ACTGCTATCgCTAGAGCTATG")
    hi.consume("ACTGCTATCGCTAGAGCTATG")  # --------^
    x = hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG")
    assert x == 2, x
def test_2_kadian():
    hi = khmer.new_counting_hash(6, 1e6, 2)
    hi.consume("ACTGCTATCTCTAGAGCTATG")
    assert hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG", 2) == 1

    hi = khmer.new_counting_hash(6, 1e6, 2)
    hi.consume("ACTGCTATCTCTAGAGCTATG")
    # hi.consume("ACTGCTATCTCTAGAcCTATG")
    hi.consume("ACTGCTATCTCTAGACCTATG")  # ---------------^
    x = hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG", 2)
    assert x == 2, x

    hi = khmer.new_counting_hash(6, 1e6, 2)
    hi.consume("ACTGCTATCTCTAGAGCTATG")
    # hi.consume("ACTGCTATCTCTAGAcCTAtG")
    hi.consume("ACTGCTATCTCTAGACCTATG")  # ---------------^---^
    assert hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG", 2) == 2

    hi = khmer.new_counting_hash(6, 1e6, 2)
    hi.consume("ACTGCTATCTCTAGAGCTATG")
    # hi.consume("ACTGCTATCTCTACtcCTAtG")
    hi.consume("ACTGCTATCTCTACTCCTATG")  # --------------^^---^
    x = hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG", 2)
    assert x == 2, x

    hi = khmer.new_counting_hash(6, 1e6, 2)
    hi.consume("ACTGCTATCTCTAGAGCTATG")
    # hi.consume("ACTGCTgTCTCTACtcCTAtG")
    hi.consume("ACTGCTGTCTCTACTCCTATG")  # ------^-------^^---^
    x = hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG", 2)
    assert x == 1, x
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
    parser.add_argument('-s', '--sampling-rate', type=int, default=10000)
    parser.add_argument('-M', '--max-reads', type=int, default=None)
    parser.add_argument('-m', '--min-kmer-count', type=int, default=2)
    parser.add_argument('rrna_file')
    parser.add_argument('reads_file')
    parser.add_argument('output')
    args = parser.parse_args()

    K = args.ksize
    rrna_file = args.rrna_file
    reads_file = args.reads_file
    output = args.output

    print 'reading', rrna_file
    ht = khmer.new_counting_hash(K, 1e8)
    ht.consume_fasta(rrna_file)

    print 'iterating over kmers'
    unique_kmers = set()
    for record in screed.open(rrna_file):
        seq = record.sequence
        for i in range(len(seq) - K + 1):
            kmer = seq[i:i + K]
            count = ht.get(kmer)
            if count >= args.min_kmer_count:
                unique_kmers.add(kmer)

    print len(unique_kmers), 'unique kmers'

    ###

    fp = open(output, 'w')

    ht = khmer.new_counting_hash(K, 1e10)
    total_bp = 0
    for n, record in enumerate(screed.open(reads_file)):
        ht.consume(record.sequence)
        total_bp += len(record.sequence)

        if n % args.sampling_rate == 0:
            if args.max_reads and n > args.max_reads:
                break

            i = 0
            for kmer in unique_kmers:
                if ht.get(kmer) > 0:
                    i += 1

            print '...', n, total_bp, i, float(i) / float(len(unique_kmers)) * 100.
            print >>fp, n, total_bp, i, float(i) / float(len(unique_kmers)) * 100.
def main():
    htReads_filename = sys.argv[1]
    htExons_filename = sys.argv[2]
    contig_filename = sys.argv[3]

    print >>sys.stderr, 'loading ht from', htReads_filename
    htReads = khmer.new_counting_hash(K, 1, N_HT)
    htReads.load(htReads_filename)

    print >>sys.stderr, 'loading ht from', htExons_filename
    htExons = khmer.new_counting_hash(K, 1, N_HT)
    htExons.load(htExons_filename)

    # countExons = htExons.n_entries()
    # countReads = htReads.n_entries()
    # print countExons
    # print countReads

    print >>sys.stderr, 'Beginning kmer count'

    for record in screed.open(contig_filename):
        seq = record.sequence.upper()
        if 'N' in seq:
            seq = seq.replace('N', 'G')

        medianCounts = []
        sum = 0
        for i in range(0, len(seq) - K):
            a, b, c = htReads.get_median_count(seq[i:i + K])
            d, e, f = htExons.get_median_count(seq[i:i + K])
            if d < 2:
                if a > 0:
                    medianCounts.append(a)
            # medianCounts.append(a / d) if a and d else medianCounts.append(a)

        if len(medianCounts) > len(seq) / 10:
            medianCounts.sort()
            for i in range(0, len(medianCounts)):
                sum += medianCounts[i]
            average = sum / len(medianCounts)
            if len(medianCounts) % 2:
                median = medianCounts[len(medianCounts) // 2]
            else:
                median = float(medianCounts[len(medianCounts) / 2 - 1] +
                               medianCounts[len(medianCounts) / 2]) / 2
        else:
            median = -1
            average = -1

        print '%s %6.0f %6.0f %1.0f' % (record.name, median, average, len(seq))
def main():
    parser = build_common_args()
    parser.add_argument('output_filename')
    parser.add_argument('input_filenames', nargs='+')

    args = parse_args(parser)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    base = args.output_filename
    filenames = args.input_filenames

    print 'Saving hashtable to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    ###

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)
    ht.set_use_bigcount(True)

    for n, filename in enumerate(filenames):
        print 'consuming input', filename
        ht.consume_fasta(filename)

        if n > 0 and n % 10 == 0:
            print 'mid-save', base
            ht.save(base)
            open(base + '.info', 'w').write('through %s' % filename)

    print 'saving', base
    ht.save(base)
    open(base + '.info', 'w').write('through end: %s' % filename)
def test_counting_load_bigcount():
    count_table = khmer.new_counting_hash(10, 1e5, 4)
    count_table.set_use_bigcount(True)
    for i in range(500):
        print i, count_table.count('ATATATATAT')
    count = count_table.get('ATATATATAT')
    assert count == 500
def main():
    counting_ht = sys.argv[1]
    infiles = sys.argv[2:]

    print 'file with ht: %s' % counting_ht
    print '-- settings:'
    print 'N THREADS', WORKER_THREADS
    print '--'

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, 1, 1)
    ht.load(counting_ht)

    for infile in infiles:
        print 'filtering', infile
        outfile = infile + '.abundfilt'
        outfp = open(outfile, 'w')

        def process_fn(record, ht=ht):
            name = record['name']
            seq = record['sequence']
            if 'N' in seq:
                return None, None

            trim_seq, trim_at = ht.trim_on_abundance(seq, 2)

            if trim_at >= K:
                return name, trim_seq

            return None, None

        tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE)
        tsp.start(verbose_fasta_iter(infile), outfp)
def main(filename):
    global ht

    basename = os.path.basename(filename)

    print 'input file to partition: %s' % filename
    print '-- settings:'
    print 'K', K
    print 'HASHTABLE SIZE %g' % HASHTABLE_SIZE
    print 'N HASHTABLES %d' % N_HT
    print '--'

    ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)
    ht.consume_fasta(filename)

    counting = khmer.new_counting_hash(K, COUNTING_SIZE, N_HT)
    ht.traverse_from_reads(filename, 100, 5000, 5, counting)

    print 'saving stoptags binary'
    ht.save_stop_tags(basename + '.stoptags')
    print 'saving stoptags text'
    ht.print_stop_tags(basename + '.stoptags.txt')

    sys.exit(0)
def test_partition_overlap_2():
    kh = khmer.new_counting_hash(20, 1e4, 4)
    for i in range(10):
        kh.consume_and_tag(a)

    for i in range(5):
        kh.consume_and_tag(b)

    # this will get paths only in 'a'
    p1 = kh.do_subset_partition_with_abundance(10, 50)

    # this will get paths only in 'b'
    p2 = kh.do_subset_partition_with_abundance(5, 10)

    # p1.report_on_partitions()
    # p2.report_on_partitions()

    x = p1.compare_partitions(3, p2, 3)
    assert x == (8, 6, 0), x

    x = p1.compare_partitions(3, p2, 5)
    assert x == (2, 0, 6), x

    x = p1.partition_sizes()
    assert x == ([(3, 8)], 0), x

    x = p2.partition_sizes()
    assert x == ([(3, 6), (5, 6)], 2), x

    x = p1.partition_average_coverages(kh)
    assert x == [(3, 11)]

    x = p2.partition_average_coverages(kh)
    assert x == [(3, 5), (5, 10)], x
def test_find_spectral_error_positions_4():
    hi = khmer.new_counting_hash(8, 1e6, 2)
    hi.consume(DNA)

    posns = hi.find_spectral_error_positions(DNA, 2)
    assert posns == [], posns
def main(): info("filter-abund-single.py", ["counting"]) args = get_parser().parse_args() check_file_status(args.datafile) check_space([args.datafile]) if args.savetable: check_space_for_hashtable(args.n_tables * args.min_tablesize) report_on_config(args) config = khmer.get_config() config.set_reads_input_buffer_size(args.threads * 64 * 1024) print "making k-mer counting table" htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables, args.threads) # first, load reads into hash table rparser = khmer.ReadParser(args.datafile, args.threads) threads = [] print "consuming input, round 1 --", args.datafile for _ in xrange(args.threads): cur_thread = threading.Thread(target=htable.consume_fasta_with_reads_parser, args=(rparser,)) threads.append(cur_thread) cur_thread.start() for _ in threads: _.join() fp_rate = khmer.calc_expected_collisions(htable) print "fp rate estimated to be %1.3f" % fp_rate # now, trim. # the filtering function. def process_fn(record): name = record["name"] seq = record["sequence"] if "N" in seq: return None, None trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff) if trim_at >= args.ksize: return name, trim_seq return None, None # the filtering loop print "filtering", args.datafile outfile = os.path.basename(args.datafile) + ".abundfilt" outfp = open(outfile, "w") tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(args.datafile), outfp) print "output in", outfile if args.savetable: print "Saving k-mer counting table filename", args.savetable print "...saving to", args.savetable htable.save(args.savetable)
def test_get_raw_tables():
    ht = khmer.new_counting_hash(20, 1e5, 4)
    tables = ht.get_raw_tables()

    for size, table in zip(ht.hashsizes(), tables):
        assert isinstance(table, buffer)
        assert size == len(table)
def test_consume_absentfasta():
    countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)

    try:
        countingtable.consume_fasta("absent_file.fa")
        assert 0, "This should fail"
    except IOError as err:
        print str(err)
def test_get_badkadian_count():
    countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)

    try:
        countingtable.get_kadian_count()
        assert 0, "this should fail"
    except TypeError as err:
        print str(err)
def test_badhashsizes():
    countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)

    try:
        countingtable.hashsizes(True)
        assert 0, "this should fail"
    except TypeError as err:
        print str(err)
def test_badsave():
    countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)

    try:
        countingtable.save()
        assert 0, "this should fail"
    except TypeError as err:
        print str(err)
def main():
    ht_filename = sys.argv[1]
    contig_filename = sys.argv[2]

    print >>sys.stderr, 'loading ht from', ht_filename
    ht = khmer.new_counting_hash(K, 1, N_HT)
    ht.load(ht_filename)

    partition_counts = {}
    for record in screed.open(contig_filename):
        seq = record.sequence.upper()
        if 'N' in seq:
            seq = seq.replace('N', 'G')

        a, b, c = ht.get_median_count(seq)

        partition = record.name.strip().split()[-1]
        x = partition_counts.get(partition, [])
        x.append(a)
        partition_counts[partition] = x

    for k, x in partition_counts.iteritems():
        if len(x) < PARTITION_SIZE_LIMIT:
            continue

        fp = open('partition%s.counts' % k, 'w')
        for i in x:
            fp.write("%s\n" % i)
        fp.close()
def test_consume_fasta_and_tag():
    countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)

    try:
        countingtable.consume_fasta_and_tag()
        assert 0, "this should fail"
    except TypeError as err:
        print str(err)
def test_badconsume_high_abund_kmers():
    countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)

    try:
        countingtable.consume_high_abund_kmers()
        assert 0, "this should fail"
    except TypeError as err:
        print str(err)
def count(contigs1, contigs2):
    ht = khmer.new_counting_hash(K, HASHTABLE_SIZE, N_HT)

    count = 0
    count2 = 0
    count3 = 0

    for n in contigs1:
        if len(n) >= LENGTH_THRESHOLD:
            kmer1 = slidingWindow(n, K)
            for x in kmer1:
                count += 1
                if ht.get(x):
                    continue
                ht.consume(x)

    for n in contigs2:
        if len(n) >= LENGTH_THRESHOLD:
            kmer2 = slidingWindow(n, K)
            for x in kmer2:
                count2 += 1
                if ht.get(x) > 0:
                    count3 += 1

    # 'count' is the total number of kmers in the first file
    # 'count2' is the total number of kmers in the second file
    # 'count3' is the total number of kmers shared between the two files.
    print count, count2, count3, "%.1f%%" % (count3 / float(count) * 100.)
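# Note: count() above relies on a slidingWindow(seq, K) helper plus module-level
# K, HASHTABLE_SIZE, N_HT and LENGTH_THRESHOLD constants that are not shown in this
# snippet. The following is a minimal, hypothetical sketch of what such a helper
# could look like -- it is not part of khmer's API, just a generator over k-mers.
def slidingWindow(sequence, ksize):
    """Yield every k-mer of length `ksize` from `sequence`, left to right."""
    for i in range(len(sequence) - ksize + 1):
        yield sequence[i:i + ksize]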
def test_consume_absentfasta_with_reads_parser():
    countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)

    try:
        countingtable.consume_fasta_with_reads_parser()
        assert 0, "this should fail"
    except TypeError as err:
        print str(err)
def main():
    parser = build_common_args()
    parser.add_argument('input_filenames', nargs='+')
    parser.add_argument('-C', '--cutoff', type=int, dest='cutoff',
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash', default='')

    args = parse_args(parser)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    DESIRED_COVERAGE = args.cutoff

    input_name_list = args.input_filenames

    if args.loadhash:
        print 'loading hashtable from', args.loadhash
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print 'making hashtable'
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    total = 0
    discarded = 0

    for input_filename in input_name_list:
        output_name = os.path.basename(input_filename) + '.keep'
        outfp = open(output_name, 'w')

        for n, record in enumerate(screed.open(input_filename)):
            if n > 0 and n % 10000 == 0:
                print '... kept', total - discarded, 'of', total, ', or', \
                    int(100. - discarded / float(total) * 100.), '%'
                print '... in file', input_filename

            total += 1

            if len(record.sequence) < K:
                continue

            seq = record.sequence.replace('N', 'A')
            med, _, _ = ht.get_median_count(seq)

            if med < DESIRED_COVERAGE:
                ht.consume(seq)
                outfp.write('>%s\n%s\n' % (record.name, record.sequence))
            else:
                discarded += 1

        print 'DONE with', input_filename, '; kept', total - discarded, 'of', \
            total, 'or', int(100. - discarded / float(total) * 100.), '%'

    if args.savehash:
        print 'Saving hashfile through', input_filename
        print '...saving to', args.savehash
        ht.save(os.path.basename(args.savehash))
def test_trim_full():
    hi = khmer.new_counting_hash(6, 1e6, 2)

    hi.consume(DNA)
    hi.consume(DNA)

    seq, pos = hi.trim_on_abundance(DNA, 2)
    assert DNA == seq, seq
def test_consume_fasta_and_tag():
    countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)

    try:
        countingtable.consume_fasta_and_tag()
        assert 0, "this should fail"
    except TypeError as err:
        print str(err)

    countingtable.consume_fasta_and_tag(utils.get_test_data("test-graph2.fa"))
def test_bigcount_overflow():
    kh = khmer.new_counting_hash(18, 1e7, 4)
    kh.set_use_bigcount(True)

    for i in range(0, 70000):
        kh.count('GGTTGACGGGGCTCAGGG')

    assert kh.get('GGTTGACGGGGCTCAGGG') == MAX_BIGCOUNT
def main(): parser = argparse.ArgumentParser( description="Find an initial set of highly connected k-mers.") parser.add_argument('--n_hashes', '-N', type=int, dest='n_hashes', default=DEFAULT_COUNTING_HT_N, help='number of counting hash tables to use') parser.add_argument('--hashsize', '-x', type=float, dest='min_hashsize', default=DEFAULT_COUNTING_HT_SIZE, help='lower bound on counting hashsize to use') parser.add_argument('--subset-size', '-s', default=DEFAULT_SUBSET_SIZE, dest='subset_size', type=float, help='Set subset size (default 1e4 is prob ok)') parser.add_argument('--stoptags', '-S', dest='stoptags', default='', help="Use stoptags in this file during partitioning") parser.add_argument('graphbase') args = parser.parse_args() graphbase = args.graphbase print 'loading ht %s.ht' % graphbase ht = khmer.load_hashbits(graphbase + '.ht') # do we want to load stop tags, and do they exist? if args.stoptags: print 'loading stoptags from', args.stoptags ht.load_stop_tags(args.stoptags) print 'loading tagset %s.tagset...' % graphbase ht.load_tagset(graphbase + '.tagset') K = ht.ksize() counting = khmer.new_counting_hash(K, args.min_hashsize, args.n_hashes) # divide up into SUBSET_SIZE fragments divvy = ht.divide_tags_into_subsets(args.subset_size) # pick off the first one if len(divvy) == 1: start, end = 0, 0 else: start, end = divvy[:2] # partition! print 'doing pre-partitioning from', start, 'to', end subset = ht.do_subset_partition(start, end) # now, repartition... print 'repartitioning to find HCKs.' ht.repartition_largest_partition(subset, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD, EXCURSION_KMER_COUNT_THRESHOLD) print 'saving stop tags' ht.save_stop_tags(graphbase + '.stoptags')
def test_load_notexist_should_fail():
    savepath = utils.get_temp_filename('tempcountingsave0.ht')

    hi = khmer.new_counting_hash(12, 1000)
    try:
        hi.load(savepath)
        assert 0, "load should fail"
    except IOError as e:
        print str(e)
def test_load_notexist_should_fail():
    savepath = utils.get_temp_filename('temphashbitssave0.ht')

    hi = khmer.new_counting_hash(12, 2)
    try:
        hi.load(savepath)
        assert 0, "load should fail"
    except IOError:
        pass
def test_badtrim():
    countingtable = khmer.new_counting_hash(6, 1e6, 2)

    countingtable.consume(DNA)
    try:
        countingtable.trim_on_abundance()
        assert 0, "this should fail"
    except TypeError as err:
        print str(err)
def test_find_spectral_error_positions_5():
    hi = khmer.new_counting_hash(8, 1e6, 2)
    hi.consume(DNA)
    hi.consume(DNA[:10])
    hi.consume(DNA[11:])

    posns = hi.find_spectral_error_positions(DNA, 1)
    assert posns == [10], posns
def test_bad_use_bigcount():
    countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)
    countingtable.set_use_bigcount(True)
    assert countingtable.get_use_bigcount()
    try:
        countingtable.get_use_bigcount(True)
        assert 0, "this should fail"
    except TypeError as err:
        print str(err)
def test_maxcount_with_bigcount_save():
    # hashtable should not saturate, if use_bigcount is set.
    kh = khmer.new_counting_hash(4, 4 ** 4, 4)
    kh.set_use_bigcount(True)

    for i in range(0, 1000):
        kh.count('AAAA')
        c = kh.get('AAAA')

    savepath = utils.get_temp_filename('tempcountingsave.ht')
    kh.save(savepath)

    kh = khmer.new_counting_hash(1, 1, 1)
    kh.load(savepath)

    c = kh.get('AAAA')
    assert c == 1000, "should be able to count to 1000: %d" % c
    assert c != MAX_COUNT, c
def test_median_too_short():
    hi = khmer.new_counting_hash(6, 1e6, 2)

    hi.consume("AAAAAA")
    try:
        hi.get_median_count("A")
        assert 0, "this should fail"
    except ValueError:
        pass
def test_bad_use_bigcount():
    countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)
    countingtable.set_use_bigcount(True)
    assert countingtable.get_use_bigcount()
    try:
        countingtable.get_use_bigcount(True)
        assert 0, "this should fail"
    except TypeError as err:
        print str(err)
def test_load_gz_notexist_should_fail():
    savepath = utils.get_temp_filename('tempcountingsave0.ht.gz')

    hi = khmer.new_counting_hash(12, 1000)
    try:
        hi.load(savepath)
        assert 0, "load should fail"
    except IOError as e:
        print str(e)
def test_nobigcount_save():
    kh = khmer.new_counting_hash(4, 4 ** 4, 4)
    # kh.set_use_bigcount(False) <-- this is the default

    savepath = utils.get_temp_filename('tempcountingsave.ht')
    kh.save(savepath)

    kh = khmer.new_counting_hash(1, 1, 1)
    kh.load(savepath)

    # set_use_bigcount should still be False after load (i.e. should be saved)

    assert kh.get('AAAA') == 0

    for i in range(0, 1000):
        kh.count('AAAA')
        kh.get('AAAA')

    assert kh.get('AAAA') == MAX_COUNT
def test_counting_gz_file_version_check():
    ht = khmer.new_counting_hash(12, 1, 1)

    inpath = utils.get_test_data('badversion-k12.ct.gz')

    try:
        ht.load(inpath)
        assert 0, "this should fail"
    except IOError as e:
        print str(e)
def test_get_raw_tables_view():
    ht = khmer.new_counting_hash(20, 1e5, 4)
    tables = ht.get_raw_tables()

    for tab in tables:
        memv = memoryview(tab)
        assert sum(memv.tolist()) == 0

    ht.consume('AAAATTTTCCCCGGGGAAAA')

    for tab in tables:
        memv = memoryview(tab)
        assert sum(memv.tolist()) == 1
def test_counting_file_type_check():
    inpath = utils.get_test_data('goodversion-k12.ht')

    kh = khmer.new_counting_hash(12, 1, 1)

    try:
        kh.load(inpath)
        assert 0, "this should fail"
    except IOError as e:
        print str(e)
def test_trim_short():
    hi = khmer.new_counting_hash(6, 1e6, 2)

    hi.consume(DNA)
    hi.consume(DNA[:50])

    seq, pos = hi.trim_on_abundance(DNA, 2)
    assert DNA[:50] == seq, (seq, pos)
    assert hi.get(seq[-6:]) == 2
    assert hi.get(DNA[:51][-6:]) == 1
def main():
    parser = build_construct_args()
    parser.add_argument('-l', '--lower-cutoff', type=int, dest='lower_cutoff',
                        default=DEFAULT_LOWER_CUTOFF)
    parser.add_argument('-u', '--upper-cutoff', type=int, dest='upper_cutoff',
                        default=DEFAULT_UPPER_CUTOFF)
    parser.add_argument('output_filename')
    parser.add_argument('input_filename')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print >> sys.stderr, "** WARNING: hashsize is default!  " \
                "You absodefly want to increase this!\n** Please read the docs!"

        print >> sys.stderr, '\nPARAMETERS:'
        print >> sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize
        print >> sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_hashes
        print >> sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize
        print >> sys.stderr, ''
        print >> sys.stderr, 'Estimated memory usage is %.2g bytes ' \
            '(n_hashes x min_hashsize)' % (args.n_hashes * args.min_hashsize)
        print >> sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    output = args.output_filename
    input = args.input_filename

    print 'lower cutoff:', args.lower_cutoff
    print 'upper cutoff:', args.upper_cutoff
    print 'Saving stoptags to %s' % output
    print 'Loading sequences in %s' % input

    ###

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)
    ht.set_use_bigcount(True)

    print 'consuming input', input
    hb = ht.collect_high_abundance_kmers(input, args.lower_cutoff,
                                         args.upper_cutoff)

    print 'saving stoptags', output
    hb.save_stop_tags(output)
def count_median(K, HT_SIZE, N_HT, filename, fileout):
    count = 0
    for n, record in enumerate(screed.open(filename)):
        count = count + 1
    max_count = count / 20
    print max_count

    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)
    ht.set_use_bigcount(True)

    # seq_array = []
    seq_count = 0
    median_array = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
    med = {}
    for median in median_array:
        med[median] = 0
    # print med

    count = 0
    for n, record in enumerate(screed.open(filename)):
        sequence = record['sequence']
        ht.consume(sequence)
        # seq_array.append(sequence)
        seq_count = seq_count + 1

        if seq_count == max_count:
            count = count + 1
            number_of_sequence_consumed = max_count * count
            counted_sequence = 0
            # print number_of_sequence_consumed

            for n2, record2 in enumerate(screed.open(filename)):
                counted_sequence = counted_sequence + 1
                sequence2 = record2['sequence']
                # print sequence2
                # for seq in seq_array:
                a, b, c = ht.get_median_count(sequence2)
                # print a, b, c
                for median in median_array:
                    if a == median:
                        # print "hit!"
                        med[a] = med[a] + 1
                if counted_sequence == number_of_sequence_consumed:
                    break

            # print med
            fileout_obj = open(fileout, 'a')
            print_line = str(number_of_sequence_consumed)
            for median in median_array:
                print_line = print_line + '\t' + str(med[median]) + '\t'
            print_line = print_line + '\n'
            fileout_obj.write(print_line)
            fileout_obj.close()

            seq_count = 0
            med = {}
            for median in median_array:
                med[median] = 0
def test_find_spectral_error_positions_6():
    hi = khmer.new_counting_hash(8, 1e6, 2)
    hi.consume(DNA)
    hi.consume(DNA[1:])

    for n in range(len(DNA) - 8 + 1):
        print n, hi.get(DNA[n:n + 8])

    posns = hi.find_spectral_error_positions(DNA, 1)
    assert posns == [0], posns
def main():
    info('make-initial-stoptags.py', ['graph'])
    args = get_parser().parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase + '.pt', graphbase + '.tagset']
    if args.stoptags:
        infiles.append(args.stoptags)
    for _ in infiles:
        check_file_status(_)

    check_space(infiles)

    print >>sys.stderr, 'loading htable %s.pt' % graphbase
    htable = khmer.load_hashbits(graphbase + '.pt')

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print >>sys.stderr, 'loading stoptags from', args.stoptags
        htable.load_stop_tags(args.stoptags)

    print >>sys.stderr, 'loading tagset %s.tagset...' % graphbase
    htable.load_tagset(graphbase + '.tagset')

    ksize = htable.ksize()
    counting = khmer.new_counting_hash(ksize, args.min_tablesize,
                                       args.n_tables)

    # divide up into SUBSET_SIZE fragments
    divvy = htable.divide_tags_into_subsets(args.subset_size)

    # pick off the first one
    if len(divvy) == 1:
        start, end = 0, 0
    else:
        start, end = divvy[:2]

    # partition!
    print >>sys.stderr, 'doing pre-partitioning from', start, 'to', end
    subset = htable.do_subset_partition(start, end)

    # now, repartition...
    print >>sys.stderr, 'repartitioning to find HCKs.'
    htable.repartition_largest_partition(subset, counting,
                                         EXCURSION_DISTANCE,
                                         EXCURSION_KMER_THRESHOLD,
                                         EXCURSION_KMER_COUNT_THRESHOLD)

    print >>sys.stderr, 'saving stop tags'
    htable.save_stop_tags(graphbase + '.stoptags')

    print >> sys.stderr, 'wrote to:', graphbase + '.stoptags'
def main():
    parser = build_construct_args()
    add_threading_args(parser)
    parser.add_argument('datafile')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    n_threads = int(args.n_threads)

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)

    filename = args.datafile

    ### first, load reads into hash table
    rparser = khmer.ReadParser(filename, n_threads)
    threads = []
    print 'consuming input, round 1 --', filename
    for tnum in xrange(n_threads):
        t = \
            threading.Thread(
                target=ht.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    ### now, count.
    total = 0
    total_unique = 0
    for n, record in enumerate(screed.open(filename)):
        total += 1
        last_kmer = record.sequence[-K:]
        count = ht.get(last_kmer)
        if count == 1:
            total_unique += 1

    print 'singletons: %d unique; of %d total; %.3f' % \
        (total_unique, total, total_unique / float(total))
def test_bigcount_save():
    # hashtable should not saturate, if use_bigcount is set.
    kh = khmer.new_counting_hash(4, 4 ** 4, 4)
    kh.set_use_bigcount(True)

    savepath = utils.get_temp_filename('tempcountingsave.ht')
    kh.save(savepath)

    kh = khmer.new_counting_hash(1, 1, 1)
    kh.load(savepath)

    # set_use_bigcount should still be True after load (i.e. should be saved)

    assert kh.get('AAAA') == 0

    for i in range(0, 1000):
        kh.count('AAAA')
        kh.get('AAAA')

    assert kh.get('AAAA') == 1000
def get_composition(seq, kmers, norm):
    counting_hash = khmer.new_counting_hash(4, 2000, 1)
    counting_hash.consume(seq)
    composition = [counting_hash.get(kmer) for kmer in kmers]
    if norm == True:
        total = sum(composition)
        composition_norm = [str(number * 1.0 / total) for number in composition]
        composition = composition_norm
    return composition
def test_median_at_least_single_lt():
    K = 20
    hi = khmer.new_counting_hash(K, 1e6, 2)

    kmers = ['ATCGATCGATCGATCGATCG',
             'GTACGTACGTACGTACGTAC',
             'TTAGTTAGTTAGTTAGTTAG']

    for kmer in kmers:
        hi.consume(kmer)
        assert hi.median_at_least(kmer, 2) is False
def process_file(filename, HT_SIZE_array):
    N_HT = 4
    K = 12

    list_average_miscount = []
    list_average_miscount_perc = []
    list_fp_miscount0 = []

    print filename
    for HT_SIZE in HT_SIZE_array:
        print HT_SIZE
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)
        ht.consume_fasta(filename)

        ktable = khmer.new_ktable(K)
        f = screed.open(filename)
        for record in f:
            sequence = record['sequence']
            ktable.consume(sequence)

        list_miscount = []
        list_miscount_perc = []
        total_kmer = 0          # total number of unique k-mers
        miscount0 = 0

        for i in range(0, ktable.n_entries()):
            n = ktable.get(i)
            if n:
                total_kmer = total_kmer + 1
                kmer2 = ktable.reverse_hash(i)
                miscount = ht.get(kmer2) - ktable.get(kmer2)
                ######
                # if ht.get(kmer2) < ktable.get(kmer2):
                #     print kmer2, ht.get(kmer2), ktable.get(kmer2)
                miscount_perc = miscount / ktable.get(kmer2)
                list_miscount.append(miscount)
                list_miscount_perc.append(miscount_perc)
                if miscount > 0:
                    miscount0 = miscount0 + 1

        average_miscount = float(sum(list_miscount)) / len(list_miscount)
        list_average_miscount.append(average_miscount)

        average_miscount_perc = float(sum(list_miscount_perc)) / \
            len(list_miscount_perc)
        list_average_miscount_perc.append(average_miscount_perc)

        fp_miscount0 = float(miscount0) / total_kmer
        list_fp_miscount0.append(fp_miscount0)

    to_return = [list_average_miscount, list_fp_miscount0, total_kmer,
                 list_average_miscount_perc]
    return to_return
def test_alignerrorregion():
    ch = khmer.new_counting_hash(10, 1048576, 1)
    read = "AAAAAGTTCGAAAAAGGCACG"
    aligner = khmer.new_readaligner(ch, 1, 20, 11)
    for i in range(20):
        ch.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
    ch.consume("ACTATTAAAAAAGTTCGAAAAAGGCACGGG")

    graphAlign, readAlign = aligner.align(read)

    assert readAlign == ''
    assert graphAlign == ''
def test_median_at_least_even_lt():
    K = 20
    hi = khmer.new_counting_hash(K, 1e6, 2)

    seqs = ['ATCGATCGATCGATCGATCGCCC',
            'GTACGTACGTACGTACGTACCCC',
            'TTAGTTAGTTAGTTAGTTAGCCC']

    for seq in seqs:
        hi.consume(seq)
        assert hi.median_at_least(seq, 2) is False
def test_partition_on_abundance_2():
    kh = khmer.new_counting_hash(20, 1e3, 4)
    for i in range(10):
        print kh.consume_and_tag(a)

    for i in range(5):
        print kh.consume_and_tag(b)

    # all paths in 'a'
    p = kh.do_subset_partition_with_abundance(10, 50)

    x = p.count_partitions()
    assert x == (1, 6)                   # one partition, six disconnected
def get_composition(seq, kmers, norm):
    """Get the composition profile, adding one extra count to avoid 0 counts."""
    counting_hash = khmer.new_counting_hash(4, 2000, 1)
    counting_hash.consume(seq)
    composition = [counting_hash.get(kmer) + 1 for kmer in kmers]
    if norm == True:
        total = sum(composition)
        composition_norm = [str(number * 1.0 / total) for number in composition]
        composition = composition_norm
    return composition
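# A minimal usage sketch for get_composition() above, assuming khmer is imported as in
# the rest of this code. The k-mer list is built with itertools.product and the input
# sequence is made-up example data; example_composition() itself is hypothetical and
# only illustrates how the function might be called.
import itertools

def example_composition():
    # all 256 tetranucleotides, matching the k=4 counting hash used above
    kmers = [''.join(p) for p in itertools.product('ACGT', repeat=4)]
    seq = 'ACGTACGTGGCCAATTACGTACGTGGCCAATT'
    # returns a normalized 256-element composition profile for seq
    return get_composition(seq, kmers, norm=True)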
def test_alignnocov():
    ch = khmer.new_counting_hash(10, 1048576, 1)
    read = "ACCTAGGTTCGACATGTACC"
    aligner = khmer.new_readaligner(ch)
    for i in range(20):
        ch.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
    ch.consume("ACCTAGGTTCGACATGTACC")

    graphAlign, readAlign = aligner.align(read)

    # should be the same
    assert readAlign == 'ACCTAGGTTCGACATGTACC'
    assert graphAlign == 'ACCTAGGTTCGACATGTACC'
def test_get_badkadian_count():
    countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)

    try:
        countingtable.get_kadian_count()
        assert 0, "this should fail"
    except TypeError as err:
        print str(err)

    try:
        countingtable.get_kadian_count("AAA")
        assert 0, "this should fail"
    except ValueError as err:
        print str(err)
def test_alignnocov():
    ch = khmer.new_counting_hash(10, 1048576, 1)
    read = "ACCTAGGTTCGACATGTACC"
    aligner = khmer.ReadAligner(ch, 0, 0)
    for i in range(20):
        ch.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
    ch.consume("ACCTAGGTTCGACATGTACC")

    score, graphAlign, readAlign, trunc = aligner.align(read)

    # should be the same
    eq_(readAlign, 'ACCTAGGTTCGACATGTACC')
    eq_(graphAlign, 'ACCTAGGTTCGACATGTACC')
def test_hashbits_file_type_check():
    kh = khmer.new_counting_hash(12, 1, 1)
    savepath = utils.get_temp_filename('tempcountingsave0.kh')
    kh.save(savepath)

    ht = khmer.new_hashbits(12, 1, 1)

    try:
        ht.load(savepath)
        assert 0, "this should fail"
    except IOError as e:
        print str(e)
def test_find_spectral_error_locs7():
    K = 8
    hi = khmer.new_counting_hash(K, 1e6, 2)
    hi.consume(DNA)
    hi.consume(DNA[K:])

    for n in range(len(DNA) - 8 + 1):
        print(n, hi.get(DNA[n:n + 8]))

    posns = hi.find_spectral_error_positions(DNA, 1)
    assert posns == [7], posns