def test_bigcount_save():
    """Save a bigcount-enabled Countgraph, reload it, and count 'AAAA' to 1000."""
    # hashtable should not saturate, if use_bigcount is set.
    graph = khmer.Countgraph(4, 4**4, 4)
    graph.set_use_bigcount(True)

    saved_to = utils.get_temp_filename('tempcountingsave.ht')
    graph.save(saved_to)

    # Load into a deliberately tiny graph so every setting must come from
    # the saved file.
    graph = khmer.Countgraph(1, 1, 1)
    try:
        graph.load(saved_to)
    except OSError as err:
        assert 0, "Should not produce an OSError: " + str(err)

    # set_use_bigcount should still be True after load (i.e. should be saved)
    assert graph.get('AAAA') == 0

    for _ in range(1000):
        graph.count('AAAA')
        graph.get('AAAA')

    assert graph.get('AAAA') == 1000
def test_badget_2():
    """Check get() on two 6-mers after consuming DNA, and that a 5-mer raises."""
    graph = khmer.Countgraph(6, 1e6, 2)
    graph.consume(DNA)

    assert graph.get("AGCTTT") == 1
    assert graph.get("GATGAG") == 0

    # The query is one base shorter than the graph's k of 6.
    try:
        graph.get("AGCTT")
    except ValueError as err:
        print(str(err))
    else:
        assert 0, "this should fail"
def test_partition_on_abundance_1():
    """Tag sequences a and b ten times each and expect a single partition."""
    print((a, ))
    print((b, ))

    graph = khmer.Countgraph(20, 1e3, 4)
    # Consume 'a' ten times first, then 'b' ten times.
    for sequence in (a, b):
        for _ in range(10):
            print(graph.consume_and_tag(sequence))

    # all paths in 'a' and 'b'
    subset = graph.do_subset_partition_with_abundance(10, 50)
    partition_counts = subset.count_partitions()
    assert partition_counts == (1, 0)  # one partition, no remainders
def test_align_nothing():
    """Align a read against a graph built from another sequence; expect empty, truncated output."""
    graph = khmer.Countgraph(10, 1048576, 1)
    query = "ACCAAGGCTCGAGATTTACC"

    aligner = khmer.ReadAligner(graph, 0, 0)
    for _ in range(20):
        graph.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")

    score, graph_aln, read_aln, truncated = aligner.align(query)
    print(score, graph_aln, read_aln)

    assert truncated
    assert len(graph_aln) == 0
    assert len(read_aln) == 0
def test_bigcount_abund_dist():
    """With bigcount enabled, the abundance distribution has one entry at 1002."""
    counts = khmer.Countgraph(18, 1e2, 4)
    tracker = khmer.Nodegraph(18, 1e2, 4)
    counts.set_use_bigcount(True)

    reads_path = utils.get_test_data('test-abund-read-2.fa')
    counts.consume_seqfile(reads_path)

    dist = counts.abundance_distribution(reads_path, tracker)
    print(counts.get('GGTTGACGGGGCTCAGGG'))

    # Only the non-empty bins, reported if the assertion fails.
    nonzero_bins = [(abund, n) for abund, n in enumerate(dist) if n]
    assert dist[1002] == 1, nonzero_bins
def test_align_middle():
    """A read consumed into the graph aligns to itself without truncation."""
    graph = khmer.Countgraph(10, 1048576, 1)
    read = "TCGACAAGTCCTTGACAGAT"

    aligner = khmer.ReadAligner(graph, trusted_cov_cutoff=0, bits_theta=0)
    for _ in range(20):
        graph.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
    graph.consume(read)

    _, graph_aln, read_aln, truncated = aligner.align(read)

    # should be the same
    eq_(read_aln, read)
    eq_(graph_aln, read)
    assert not truncated
def test_save_load_large(ctfile):
    """Save a 2**31-sized Countgraph and check occupancy survives Countgraph.load()."""
    reads_path = utils.get_test_data('random-20-a.fa')
    save_path = utils.get_temp_filename(ctfile)

    original = khmer.Countgraph(12, 2**31, 1)
    original.consume_seqfile(reads_path)
    original.save(save_path)

    restored = Countgraph.load(save_path)

    n_original = original.n_occupied()
    n_restored = restored.n_occupied()
    assert n_original == 3966, n_original
    assert n_restored == n_original, n_restored
def test_readalign():
    """Align a read with mismatches and check both alignment strings."""
    graph = khmer.Countgraph(10, 1048576, 1)
    aligner = khmer.ReadAligner(graph, 1, 0)
    for _ in range(20):
        graph.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")

    # The read differs from the expected graph alignment at a few positions.
    query = "ACCTAGGTTCGACATGTACC"

    graph.consume("GCTTTTAAAAAGGTTCGACAAAGGCCCGGG")

    _, graph_aln, read_aln, _ = aligner.align(query)

    eq_(read_aln, 'ACCTAGGTTCGACATGTACC')
    eq_(graph_aln, 'AGCTAGGTTCGACAAGTCCT')
def test_find_spectral_error_positions_err():
    """find_spectral_error_positions raises ValueError for short or N-containing input."""
    graph = khmer.Countgraph(8, 1e6, 2)

    # Six bases is shorter than the graph's k of 8.
    try:
        graph.find_spectral_error_positions(DNA[:6], 1)
    except ValueError:
        pass
    else:
        assert 0, "should raise ValueError; too short"

    try:
        graph.find_spectral_error_positions("ACGTACGN", 1)
    except ValueError:
        pass
    else:
        assert 0, "should raise ValueError; contains N"
def test_save_load_occupied(ctfile):
    """Occupancy is preserved across save() and khmer.load_countgraph()."""
    print('working with', ctfile)

    reads_path = utils.get_test_data('random-20-a.fa')
    save_path = utils.get_temp_filename(ctfile)

    original = khmer.Countgraph(12, 1e5, 4)
    original.consume_seqfile(reads_path)
    original.save(save_path)

    restored = khmer.load_countgraph(save_path)

    n_original = original.n_occupied()
    n_restored = restored.n_occupied()
    assert n_original == 3886, n_original
    assert n_restored == n_original, n_restored
def test_count_2():
    """Counting by k-mer string and by hash value increments the same counter."""
    graph = khmer.Countgraph(12, *ARGS_1m)

    kmer = 'G' * 12
    hashval = graph.hash('G' * 12)

    assert graph.get(kmer) == 0
    assert graph.get(hashval) == 0

    # First count by string, then by hash: count hashes same as strings.
    for expected, key in ((1, kmer), (2, hashval)):
        graph.count(key)
        assert graph.get(kmer) == expected
        assert graph.get(hashval) == expected
def test_consume_absentfasta_with_reads_parser():
    """consume_seqfile_with_reads_parser fails with no argument and with an empty file."""
    graph = khmer.Countgraph(4, 4**4, 4)

    # No parser argument at all.
    try:
        graph.consume_seqfile_with_reads_parser()
    except TypeError as err:
        print(str(err))
    else:
        assert 0, "this should fail"

    # Parser built on an empty file; either error type is accepted.
    try:
        parser = ReadParser(utils.get_test_data('empty-file'))
        graph.consume_seqfile_with_reads_parser(parser)
    except (OSError, ValueError) as err:
        print(str(err))
    else:
        assert 0, "this should fail"
def test_badget():
    """Check get() on two 6-mers of a consumed sequence, and that a 5-mer raises."""
    graph = khmer.Countgraph(6, 4**10, 1)

    dna = "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAG"
    graph.consume(dna)

    assert graph.get("AGCTTT") == 1
    assert graph.get("GATGAG") == 0

    # The query is one base shorter than the graph's k of 6.
    try:
        graph.get("AGCTT")
    except ValueError as err:
        print(str(err))
    else:
        assert 0, "this should fail"
def test_align_fwd_middle_trunc_2():
    """align_forward yields empty, truncated output when the read's prefix is not in the graph."""
    graph = khmer.Countgraph(10, 1048576, 1)
    read = "GGGGGGGGGGGGTCGACAAGTCCTTGACAGAT"

    aligner = khmer.ReadAligner(graph, 0, 0)
    for _ in range(20):
        graph.consume("AAAAAAAAAAAATCGACAAGTCCTTGACAGAT")

    # omit prefix from graph
    graph.consume(read[12:])

    _, graph_aln, read_aln, truncated, _ = aligner.align_forward(read)

    # this will fail, because align_forward chooses the first kmer as the
    # seed.
    assert not read_aln
    assert not graph_aln
    assert truncated
def test_maxcount():
    """Without bigcount, repeated counting of 'AAAA' stops at MAX_COUNT."""
    # hashtable should saturate at some point so as not to overflow counter
    graph = khmer.Countgraph(4, 4**4, 4)
    graph.set_use_bigcount(False)

    previous = None
    for _ in range(1000):
        graph.count('AAAA')
        c = graph.get('AAAA')

        # Stop once another count() no longer changes the stored value.
        if c == previous:
            break
        previous = c

    assert c != 1000, "should not be able to count to 1000: %d" % c
    assert c == MAX_COUNT, c  # this will depend on HashcountType...
def test_maxcount_with_bigcount():
    """With bigcount enabled, 'AAAA' can be counted to 1000, past MAX_COUNT."""
    # hashtable should not saturate, if use_bigcount is set.
    graph = khmer.Countgraph(4, 4**4, 4)
    graph.set_use_bigcount(True)

    previous = None
    for _ in range(1000):
        graph.count('AAAA')
        c = graph.get('AAAA')

        # Stop early if another count() no longer changes the stored value.
        if c == previous:
            break
        previous = c

    assert c == 1000, "should be able to count to 1000: %d" % c
    assert c != MAX_COUNT, c
def test_maxcount():
    """Repeated counting of 'AAAA' in a small single-table graph stops at MAX_COUNT."""
    # hashtable should saturate at some point so as not to overflow counter
    graph = khmer.Countgraph(4, 100, 1)

    previous = None
    for _ in range(10000):
        graph.count('AAAA')
        c = graph.get('AAAA')

        print(previous, c)
        # Stop once another count() no longer changes the stored value.
        if c == previous:
            break
        previous = c

    assert c != 10000, "should not be able to count to 10000"
    assert c == MAX_COUNT  # this will depend on HashcountType...
def test_median_at_least_comp():
    """median_at_least(seq, C) agrees with comparing get_median_count's median to C."""
    K = 20
    C = 4
    graph = khmer.Countgraph(K, 1e6, 2)

    sequences = [
        'ATCGATCGATCGATCGATCGCCC',
        'GTACGTACGTACGTACGTACCCC',
        'TTAGTTAGTTAGTTAGTTAGCCC',
    ]

    for sequence in sequences:
        # Each sequence is consumed three times before being queried.
        for _ in range(3):
            graph.consume(sequence)

        median, _, _ = graph.get_median_count(sequence)
        assert graph.median_at_least(sequence, C) is (median >= C)
def test_partition_on_abundance_3():
    """Partition at two abundance ranges; count_partitions on the second returns (2, 2)."""
    graph = khmer.Countgraph(20, 1e4, 4)

    # 'first' is tagged ten times, then 'second' five times.
    for sequence, repeats in ((first, 10), (second, 5)):
        for _ in range(repeats):
            print(graph.consume_and_tag(sequence))

    # this will get paths only in 'a'
    subset = graph.do_subset_partition_with_abundance(10, 50)

    # this will get paths only in 'b'
    subset = graph.do_subset_partition_with_abundance(5, 10)

    partition_counts = subset.count_partitions()
    print(partition_counts)
    assert partition_counts == (2, 2)  # two partitions, two ignored tags
def test_simple_readalign():
    """Align a read against a graph whose sequence differs slightly; check both strings."""
    graph = khmer.Countgraph(10, 1048576, 1)
    aligner = khmer.ReadAligner(graph, 2, 0)
    for _ in range(20):
        graph.consume("AGAGGGAAAGCTAGGTTCGACATGTCCTTGACAGAT")

    query = "ACCTAGGTTCGACAAGTACC"

    graph.consume("GCTTTTAAAAAGGTTCGACAAAGGCCCGGG")
    # reverse complement: CCCGGGCCTTTGTCGAACCTTTTTAAAAGC

    _, graph_aln, read_aln, _ = aligner.align(query)

    eq_(graph_aln, 'AGCTAGGTTCGACATGTCCT')
    eq_(read_aln, 'ACCTAGGTTCGACAAGTACC')
def main():
    """Stream reads from FASTQ files through a Countgraph and print new ORFs.

    Reads shorter than K are skipped. For each remaining read, the median
    k-mer count decides what happens:

    * below 20: the read is consumed into the graph;
    * 20 to 29: the read is abundance-trimmed (cutoff 3), consumed, and
      labeled across its high-degree nodes with a fresh label;
    * exactly 30: labeled paths are assembled from the read's first k-mer,
      translated, and each ORF not yet seen is printed in FASTA form.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('fastq_files', nargs='+')
    args = parser.parse_args()

    graph = khmer.Countgraph(K, 1e8, 4)

    kept = 0
    hdn = khmer.HashSet(K)
    labeler = khmer._GraphLabels(graph)
    next_label = 1
    next_orf = 1
    seen_orf_hashes = set()

    for filename in args.fastq_files:
        for n, record in enumerate(screed.open(filename)):
            # Progress report on stderr every 10000 records.
            if n and n % 10000 == 0:
                print('...', n, file=sys.stderr)

            sequence = record.sequence
            if len(sequence) < K:
                continue

            cov, _, _ = graph.get_median_count(sequence)

            if cov < 20:
                kept += 1
                graph.consume(sequence)
            elif cov < 30:
                trimmed, _ = graph.trim_on_abundance(sequence, 3)
                if len(trimmed) < K:
                    continue

                graph.consume(trimmed)
                hdn = graph.find_high_degree_nodes(trimmed)
                labeler.label_across_high_degree_nodes(trimmed, hdn,
                                                       next_label)
                next_label += 1
            elif cov == 30:
                for contig in labeler.assemble_labeled_path(sequence[:K]):
                    for translation in translate(contig):
                        for orf in extract_orfs(translation):
                            orf_hash = hash(orf)
                            # Only the hash is stored, not the ORF itself.
                            if orf_hash not in seen_orf_hashes:
                                seen_orf_hashes.add(orf_hash)
                                print('>orf%d\n%s' % (next_orf, orf))
                                next_orf += 1
def test_find_all_tags_list_error():
    """find_all_tags_list raises ValueError for k-mers that are not length 4."""
    graph = khmer.Countgraph(4, 4**4, 4)

    # load each sequence but do not build tags - everything should be empty.
    for record in screed.open(utils.get_test_data('test-graph2.fa')):
        graph.consume(record.sequence)

    # One query shorter than k and one longer than k.
    for bad_kmer in ("ATA", "ATAGA"):
        try:
            graph.find_all_tags_list(bad_kmer)
        except ValueError:
            pass
        else:
            assert False, "a ValueError should be raised for incorrect k-mer size"
def test_badfasta_count_kmers_by_position():
    """fasta_count_kmers_by_position rejects missing and negative arguments."""
    graph = khmer.Countgraph(4, 4**4, 4)

    # A call with no arguments may raise TypeError; nothing is asserted if
    # it does not.
    try:
        graph.fasta_count_kmers_by_position()
    except TypeError as err:
        print(str(err))

    filename = utils.get_test_data("test-short.fa")

    # A negative value in either numeric argument must raise ValueError.
    for bad_args in ((-1, 0), (0, -1)):
        try:
            graph.fasta_count_kmers_by_position(filename, *bad_args)
        except ValueError as err:
            print(str(err))
        else:
            assert 0, "this should fail"
def test_fakelump_load_stop_tags_trunc():
    """Loading a stop-tags file truncated to 10 bytes must raise OSError.

    Builds stop tags from fakelump.fa, saves them, truncates the saved file
    in place, and checks that load_stop_tags() rejects it.
    """
    fakelump_fa = utils.get_test_data('fakelump.fa')
    fakelump_fa_foo = utils.get_temp_filename('fakelump.fa.stopfoo')

    ht = khmer.Nodegraph(32, 1e5, 4)
    ht.consume_seqfile_and_tag(fakelump_fa)

    subset = ht.do_subset_partition(0, 0)
    ht.merge_subset(subset)

    (n_partitions, _) = ht.count_partitions()
    assert n_partitions == 1, n_partitions

    # now, break partitions on any k-mer that you see more than once
    # on big excursions, where big excursions are excursions 40 out
    # that encounter more than 82 k-mers.  This should specifically
    # identify our connected sequences in fakelump...

    EXCURSION_DISTANCE = 40
    EXCURSION_KMER_THRESHOLD = 82
    EXCURSION_KMER_COUNT_THRESHOLD = 1
    counting = khmer.Countgraph(32, 1, 1, primes=[5, 7, 11, 13])

    ht.repartition_largest_partition(counting,
                                     EXCURSION_DISTANCE,
                                     EXCURSION_KMER_THRESHOLD,
                                     EXCURSION_KMER_COUNT_THRESHOLD)

    ht.save_stop_tags(fakelump_fa_foo)

    # Truncate the saved file to its first 10 bytes. Context managers make
    # sure both handles are closed even if the read or write fails.
    with open(fakelump_fa_foo, 'rb') as fp:
        data = fp.read()
    with open(fakelump_fa_foo, 'wb') as fp:
        fp.write(data[:10])

    # ok, now try loading these stop tags; should fail.
    ht = khmer.Nodegraph(32, 1, 1, primes=[5, 7, 11, 13])
    ht.consume_seqfile_and_tag(fakelump_fa)

    try:
        ht.load_stop_tags(fakelump_fa_foo)
        assert 0, "this test should fail"
    except OSError:
        pass
def test_complete_2_collision():
    """After counting one k-mer per table slot, the first 128 4-mers all read non-zero."""
    graph = khmer.Countgraph(4, 7, 1)

    table_size = graph.hashsizes()[0]
    for hashval in range(table_size):
        graph.count(khmer.reverse_hash(hashval, 4))

    # Only lookups by string are checked here; string hashing is rc aware.
    n_rc_filled = sum(
        1 for hashval in range(128)
        if graph.get(khmer.reverse_hash(hashval, 4))
    )

    assert n_rc_filled == 128, n_rc_filled
def test_partition_overlap_1():
    """Two identical abundance partitionings compare as (0, 0, 14)."""
    graph = khmer.Countgraph(20, 1e3, 4)

    # Tag 'a' ten times, then 'b' ten times.
    for sequence in (a, b):
        for _ in range(10):
            graph.consume_and_tag(sequence)

    # this will get paths only in 'a'
    first_subset = graph.do_subset_partition_with_abundance(10, 50)

    # this will get paths only in 'a', again -- should be the same!
    second_subset = graph.do_subset_partition_with_abundance(10, 50)

    comparison = first_subset.compare_partitions(3, second_subset, 3)
    assert comparison == (0, 0, 14), comparison
def test_maxcount_with_bigcount_save():
    """A bigcount count of 1000 survives save() and khmer.load_countgraph()."""
    # hashtable should not saturate, if use_bigcount is set.
    graph = khmer.Countgraph(4, 4**4, 4)
    graph.set_use_bigcount(True)

    for _ in range(1000):
        graph.count('AAAA')
        graph.get('AAAA')

    save_path = utils.get_temp_filename('tempcountingsave.ht')
    graph.save(save_path)

    try:
        graph = khmer.load_countgraph(save_path)
    except OSError as err:
        assert 0, "Should not produce an OSError: " + str(err)

    c = graph.get('AAAA')
    assert c == 1000, "should be able to count to 1000: %d" % c
    assert c != MAX_COUNT, c
def get_composition(ksize, seq, kmers, norm):
    """Return the k-mer composition profile of ``seq``.

    Args:
        ksize: k-mer length used to build the khmer Countgraph.
        seq: sequence whose k-mers are counted.
        kmers: iterable of k-mers to report, in output order.
        norm: if truthy, divide each count by the sum of the reported counts.

    Returns:
        A list with one entry per element of ``kmers``: raw counts, or
        float fractions when ``norm`` is set. Returns ``None`` if the
        computation fails; the failure is logged with its traceback.
    """
    try:
        # One table sized slightly above the 4**ksize possible k-mers.
        nkmers = 4**ksize
        tablesize = nkmers + 100
        counting_hash = khmer.Countgraph(ksize, tablesize, 1)
        counting_hash.consume(seq)
        composition = [counting_hash.get(kmer) for kmer in kmers]
        if norm:
            total = sum(composition)
            # A non-zero item implies total > 0, so the division is only
            # reached when it is safe.
            composition = [
                0.0 if item == 0 else float(item) / float(total)
                for item in composition
            ]
        return composition
    except Exception:
        # Best-effort: log and report failure to the caller, but let
        # KeyboardInterrupt and SystemExit propagate.
        logging.exception("Could not calculate composition using khmer")
        return None
def test_two_components(self, random_sequence, K):
    """Each sequence becomes a component only on its sixth consume (coverage_cutoff=5)."""
    first_seq = Sequence(name='Comp1', sequence=random_sequence())
    second_seq = Sequence(name='Comp2',
                          sequence=random_sequence(exclude=first_seq))

    graph = khmer.Countgraph(K, 1e5, 4)
    partitioner = ConditionalPartitioner(graph)
    coverage_func = PartitionCoverage(coverage_cutoff=5,
                                      graph=graph,
                                      partitioner=partitioner)

    # The first five passes over Comp1 create no component.
    for _ in range(5):
        print(partitioner.consume(first_seq, func=coverage_func))
        assert partitioner.n_components == 0
    print(partitioner.consume(first_seq, func=coverage_func))
    assert partitioner.n_components == 1

    # Same pattern for Comp2, starting from the one existing component.
    for _ in range(5):
        partitioner.consume(second_seq, func=coverage_func)
        assert partitioner.n_components == 1
    partitioner.consume(second_seq, func=coverage_func)
    assert partitioner.n_components == 2
def count_query_kmers(read_file, kmer_hash, tmpdir, final_outfile):
    """Count query k-mers in ``read_file`` and write them to ``final_outfile``.

    For each k in ``kmer_hash``, an external indexing command (``khmer_path``)
    builds a count graph of ``read_file`` under ``tmpdir``. The graph is then
    loaded and one tab-separated ``<k-mer>\\t<count>`` line is written per
    query k-mer.

    Args:
        read_file: path to the reads passed to the indexing command.
        kmer_hash: mapping of k (int) to an iterable of query k-mers.
        tmpdir: directory that receives the ``<k>mers.graph`` files.
        final_outfile: path of the output table; overwritten if it exists.
    """
    # NOTE(review): the file is opened in binary mode but str lines are
    # written; on Python 3 that raises TypeError -- confirm the target
    # interpreter before changing the mode.
    # The context manager closes the output even if indexing or loading
    # fails part-way through.
    with open(final_outfile, 'wb') as outfh:
        for k, current_kmer in kmer_hash.items():
            echo("k = %d" % (k))

            # indexing CMD: python khmerEnv/bin/load-into-counting.py
            #               -k 15 -M 1e10 -T 1 -q kmers.graph readfile
            kmer_graph = tmpdir + "/" + str(k) + "mers.graph"
            os.system("%s -k %d -M 16G -T 1 -q %s %s" % (khmer_path, k, kmer_graph, read_file))

            counts = khmer.Countgraph(k, 100000000000, 1)
            counts.load(kmer_graph)

            for mer in current_kmer:
                mer_count = counts.get(mer)
                outfh.write("%s\t%d\n" % (mer, mer_count))