def test_concat_2_fail():
    hs = khmer.HashSet(5, [10, 12])
    hs2 = khmer.HashSet(4, [10, 13])

    try:
        hs += hs2
        assert 0, "inplace concat should fail - different ksize"
    except ValueError:
        pass

def test_bad_construct():
    try:
        hs = khmer.HashSet()
        assert 0, "HashSet constructor should fail w/o argument"
    except TypeError:
        pass

    try:
        hs = khmer.HashSet(5, [{}])
        assert 0, "HashSet constructor should fail w/o list of k-mers"
    except ValueError:
        pass

def test_update_bad():
    hs = khmer.HashSet(5)
    x = [5, 10, 15, 2**35, {}]

    try:
        hs.update(x)
        assert 0, "cannot add dict to a HashSet"
    except ValueError:
        pass

def test_remove_2():
    hs = khmer.HashSet(5, [8, 10])
    assert len(hs) == 2

    try:
        hs.remove(15)
        assert 0, "hs.remove should raise an Exception"
    except ValueError:
        pass

    assert len(hs) == 2
    assert list(sorted(hs)) == [8, 10]

def test_traverse_linear_path():
    contigfile = utils.get_test_data('simple-genome.fa')
    contig = list(screed.open(contigfile))[0].sequence

    K = 21
    nodegraph = khmer.Nodegraph(K, 1e5, 4)
    stopgraph = khmer.Nodegraph(K, 1e5, 4)

    nodegraph.consume(contig)

    degree_nodes = khmer.HashSet(K)
    size, conns, visited = nodegraph.traverse_linear_path(contig[:K],
                                                          degree_nodes,
                                                          stopgraph)
    assert size == 980
    assert len(conns) == 0
    assert len(visited) == 980

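# A minimal sketch of the same traversal call on a synthetic contig, so
# the API can be tried without test data files. The random sequence and
# the expected counts are assumptions, not from the source; nothing is
# asserted because the exact numbers depend on the sequence used.
def example_traverse_synthetic():
    import random
    random.seed(42)
    K = 21
    contig = ''.join(random.choice('ACGT') for _ in range(200))

    nodegraph = khmer.Nodegraph(K, 1e5, 4)
    stopgraph = khmer.Nodegraph(K, 1e5, 4)
    nodegraph.consume(contig)

    degree_nodes = khmer.HashSet(K)
    size, conns, visited = nodegraph.traverse_linear_path(contig[:K],
                                                          degree_nodes,
                                                          stopgraph)
    # on a purely linear contig we expect no connections to high-degree
    # nodes, and one visited node per k-mer (len(contig) - K + 1)
    print(size, len(conns), len(visited))
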
def main():
    p = argparse.ArgumentParser()
    p.add_argument('fastq_files', nargs='+')
    args = p.parse_args()

    cg = khmer.Countgraph(K, 1e8, 4)
    kept = 0
    hdn = khmer.HashSet(K)
    lh = khmer._GraphLabels(cg)
    next_label = 1
    next_orf = 1
    output = set()

    for filename in args.fastq_files:
        for n, record in enumerate(screed.open(filename)):
            if n and n % 10000 == 0:
                print('...', n, file=sys.stderr)

            if len(record.sequence) < K:
                continue

            cov, _, _ = cg.get_median_count(record.sequence)
            if cov < 20:
                kept += 1
                cg.consume(record.sequence)
            elif cov < 30:
                # print('intermediate', next_label, file=sys.stderr)
                seq, pos = cg.trim_on_abundance(record.sequence, 3)
                if len(seq) < K:
                    continue

                cg.consume(seq)
                hdn = cg.find_high_degree_nodes(seq)
                lh.label_across_high_degree_nodes(seq, hdn, next_label)
                next_label += 1
            elif cov == 30:
                contigs = lh.assemble_labeled_path(record.sequence[:K])
                for contig in contigs:
                    for t in translate(contig):
                        for o in extract_orfs(t):
                            if hash(o) not in output:
                                output.add(hash(o))
                                print('>orf%d\n%s' % (next_orf, o))
                                next_orf += 1

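# A sketch (hypothetical helper, not from the source) isolating the two
# Countgraph calls that gate the stream above: reads below median k-mer
# coverage 20 are consumed as-is; reads in [20, 30) are abundance-trimmed
# and labeled; assembly is only attempted when coverage is exactly 30,
# i.e. just as a region saturates.
def example_coverage_gate(cg, read):
    cov, _, _ = cg.get_median_count(read)         # median k-mer count
    trimmed, pos = cg.trim_on_abundance(read, 3)  # cut where count < 3
    return cov, trimmed
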
def main():
    p = argparse.ArgumentParser()
    p.add_argument('contig_files', nargs='+')
    args = p.parse_args()

    ng = khmer.Nodegraph(K, 1e8, 4)

    starts = []
    for filename in args.contig_files:
        for n, record in enumerate(screed.open(filename)):
            if n and n % 10000 == 0:
                print('...', n)
            ng.consume(record.sequence)
            starts.append(record.sequence[:K])

    hdn = khmer.HashSet(K)
    for filename in args.contig_files:
        for n, record in enumerate(screed.open(filename)):
            if n and n % 10000 == 0:
                print('...', n)
            hdn += ng.find_high_degree_nodes(record.sequence)

    lh = khmer._GraphLabels(ng)
    for filename in args.contig_files:
        for n, record in enumerate(screed.open(filename)):
            if n and n % 10000 == 0:
                print('...', n)
            lh.label_across_high_degree_nodes(record.sequence, hdn, n)

    counter = 0
    for k in starts:
        contigs = lh.assemble_labeled_path(k)
        if not contigs:
            print('nada...')
        for c in contigs:
            print('>%d\n%s' % (counter, c))
            counter += 1

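# Summary of the three passes above (inferred from the code): (1) load
# every contig into the graph and remember each contig's first k-mer as
# an assembly start; (2) collect all high-degree nodes across the same
# contigs; (3) label sequences across those nodes, so that labeled paths
# can be re-assembled from each start ('nada...' marks starts that yield
# no contigs).
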
def test_update():
    hs = khmer.HashSet(5)
    x = [5, 10, 15, 2**35]
    hs.update(x)

    assert list(sorted(hs)) == [5, 10, 15, 2**35]

def test_add():
    hs = khmer.HashSet(5)
    hs.add(7)
    hs.add(4)

    assert list(sorted(hs)) == [4, 7]

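# A consolidated sketch of the khmer.HashSet API exercised throughout
# these tests: construction with a ksize and an optional list of k-mer
# hashes, add/update/remove, in-place concatenation (same ksize only),
# containment, and iteration. The helper name is made up; every call in
# it appears in the tests in this file.
def example_hashset_api():
    hs = khmer.HashSet(5, [8, 10])    # ksize 5, two k-mer hashes
    hs.add(4)
    hs.update([15, 2**35])
    hs += khmer.HashSet(5, [13])      # += requires a matching ksize
    hs.remove(8)
    assert 10 in hs
    return sorted(hs)                 # [4, 10, 13, 15, 2**35]
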
def test_iter_identity():
    # iter() on a HashSet iterator returns the iterator itself
    hs = khmer.HashSet(5, [6])
    k = iter(hs)
    k2 = iter(k)
    assert k == k2

def run(args):
    # @CTB this is kind of a hack - nothing tricky going on, just want to
    # specify memory on the command line rather than graph size...
    graph_tablesize = int(args.memory * 8.0 / 4.0)

    assert args.ksize % 2, "ksize must be odd"

    if args.label:
        label_list = []

    output_dir = args.output
    if not output_dir:
        if len(args.seqfiles) > 1:
            print('** please specify an output directory with -o',
                  file=sys.stderr)
            sys.exit(-1)

        output_dir = os.path.basename(args.seqfiles[0])
        if output_dir.endswith('.fa'):
            output_dir = output_dir[:-3]
        elif output_dir.endswith('.fa.gz'):
            output_dir = output_dir[:-6]

    # set this so we can read it for logging
    args.output = output_dir

    # gxtfile = os.path.basename(output_dir) + '.gxt'
    gxtfile = os.path.join(output_dir, "cdbg.gxt")
    contigfile = os.path.join(output_dir, "contigs.fa.gz")

    print('')
    print('placing output in directory:', output_dir)
    print('gxt will be:', gxtfile)
    try:
        os.mkdir(output_dir)
    except FileExistsError:
        print('(note: directory already exists)')
    print('')

    if args.loadgraph:
        print('loading nodegraph from:', args.loadgraph)
        graph = khmer.Nodegraph.load(args.loadgraph)
        print('creating accompanying stopgraph')
        ksize = graph.ksize()
        hashsizes = graph.hashsizes()
        stop_bf = khmer.Nodegraph(ksize, 1, 1, primes=hashsizes)
    else:
        print('building graphs and loading files')

        # Create graph and a stop bloom filter - one for loading, one for
        # traversing. Create them all here so that we can error out quickly
        # if memory is a problem.
        # @CTB note that hardcoding '2' here is not necessarily a great idea.
        graph = khmer.Nodegraph(args.ksize, graph_tablesize, 2)
        stop_bf = khmer.Nodegraph(args.ksize, graph_tablesize, 2)

        n = 0

        # load in all of the input sequences, one file at a time.
        for seqfile in args.seqfiles:
            fp = screed.open(seqfile)
            for record in khmer.utils.clean_input_reads(fp):
                if len(record.cleaned_seq) < graph.ksize():
                    continue
                n += 1
                if n % 100000 == 0:
                    print('...', seqfile, n)
                graph.consume(record.cleaned_seq)
            fp.close()

        # complain if the graphs are too small for the data (i.e. the
        # false positive rate is too high).
        fp_rate = khmer.calc_expected_collisions(graph, args.force,
                                                 max_false_pos=.05)

    ksize = graph.ksize()

    # initialize the object that will track information for us.
    pathy = Pathfinder(ksize, gxtfile, contigfile, not args.no_assemble)

    print('finding high degree nodes')
    if args.label:
        print('(and labeling them, per request)')

    degree_nodes = khmer.HashSet(ksize)
    linear_starts = khmer.HashSet(ksize)
    n = 0
    skipped = 0
    for seqfile in args.seqfiles:
        fp = screed.open(seqfile)
        for record in khmer.utils.clean_input_reads(fp):
            if len(record.cleaned_seq) < ksize:
                skipped += 1
                continue
            n += 1
            if n % 100000 == 0:
                print('...2', seqfile, n)

            # walk across sequences, find all high degree nodes,
            # name them and cherish them.
            these_hdn = graph.find_high_degree_nodes(record.cleaned_seq)
            if these_hdn:
                degree_nodes += these_hdn
            else:
                # possible linear node? check first and last k-mer.
                # (the logic here is that every purely linear node must
                # start or end in *some* record.sequence - so where we
                # have record sequences that have only 1 neighbor, those
                # will be all possible linear nodes.)
                first_kmer = record.sequence[:ksize]
                last_kmer = record.sequence[-ksize:]
                assert len(last_kmer) == ksize

                if len(graph.neighbors(first_kmer)) == 1:
                    linear_starts.add(graph.hash(first_kmer))
                if len(graph.neighbors(last_kmer)) == 1:
                    linear_starts.add(graph.hash(last_kmer))

            if args.label:
                label_list.append(record.name)
                for kmer in these_hdn:
                    pathy.add_label(kmer, n)
        fp.close()

    print('read {}, skipped {} for being too short'.format(n, skipped))

    # get all of the degree > 2 kmers and give them IDs.
    for kmer in degree_nodes:
        pathy.new_hdn(kmer)
        stop_bf.add(kmer)

    print('traversing linear segments from', len(degree_nodes), 'nodes')

    # now traverse from each high degree node into all neighboring nodes,
    # seeking adjacencies. if a neighbor is a high degree node, add it to
    # adjacencies; if not, then traverse the linear path & assemble if
    # desired.
    for n, k in enumerate(degree_nodes):
        if n % 10000 == 0:
            print('...', n, 'of', len(degree_nodes))

        # retrieve the node ID of the primary segment.
        k_id = pathy.kmers_to_nodes[k]

        # here is where we would output this k-mer to the contig file if
        # we wanted to.
        nk_id = pathy.kmers_to_nodes[k]
        k_str = khmer.reverse_hash(k, ksize)
        pathy.add_assembly(nk_id, k_str)

        # find all the neighbors of this high-degree node.
        nbh = graph.neighbors(k)

        for nk in nbh:
            # neighbor is high degree? fine, mark its adjacencies.
            if nk in degree_nodes:
                nk_id = pathy.kmers_to_nodes[nk.kmer_u]
                pathy.add_adjacency(k_id, nk_id)
            else:
                # linear! walk it.
                traverse_and_mark_linear_paths(graph, nk, stop_bf, pathy,
                                               degree_nodes)

    # now, clean up at the end -- make sure we've hit all the possible
    # linear nodes.
    print('traversing from {} potential linear starts'.format(
        len(linear_starts)))
    for n, k in enumerate(linear_starts):
        traverse_and_mark_linear_paths(graph, k, stop_bf, pathy,
                                       degree_nodes)

    print('{} linear segments and {} high-degree nodes'.format(
        pathy.node_counter, len(pathy.nodes)))

    del graph
    del stop_bf

    # save to GXT.
    print('saving gxtfile', gxtfile)

    all_labels = set()
    label_counts = {}

    pathy.adjfp.close()
    adj_fp = open(gxtfile + '.adj', 'rt')

    # this uniqifies the edges...
    for line in adj_fp:
        a, b = line.split(',')
        a = int(a)
        b = int(b)
        pathy.adjacencies[a].add(b)
    adj_fp.close()

    try:
        os.unlink(gxtfile + '.adj')
    except OSError:
        print('cannot remove', gxtfile + '.adj')

    # ...and now print them out.
    edges = []
    for k, v in pathy.adjacencies.items():
        for dest in v:
            # don't add loops
            if k != dest:
                edges.append((k, dest))

    with open(gxtfile, 'wt') as fp:
        write(fp, pathy.node_counter, edges)

    if not args.no_assemble:
        pathy.assemblyfp.close()

    if args.label:
        print('note: used/assigned %d labels total' % (len(all_labels),))
        print('counts:', label_counts)

        assert label_list
        print('dumping label list now.')
        label_file = os.path.basename(output_dir) + '.labels.txt'
        label_file = os.path.join(output_dir, label_file)

        with open(label_file, "wt") as fp:
            for n, label in enumerate(label_list):
                fp.write("{} {}\n".format(n, label))

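# A toy illustration (synthetic sequence, not from the source) of the
# linear-start test in run() above: in a graph holding one purely linear
# sequence, the first k-mer has a single neighbor while interior k-mers
# have two. Printed rather than asserted, since a random sequence could
# in principle contain coincidental overlaps.
def example_linear_ends():
    import random
    random.seed(1)
    K = 21
    seq = ''.join(random.choice('ACGT') for _ in range(100))

    graph = khmer.Nodegraph(K, 1e5, 4)
    graph.consume(seq)

    print(len(graph.neighbors(seq[:K])))       # expect 1 at the path end
    print(len(graph.neighbors(seq[K:2 * K])))  # expect 2 in the interior
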
def test_concat_2():
    hs = khmer.HashSet(5, [10, 12])
    hs2 = khmer.HashSet(5, [10, 13])

    hs += hs2
    assert list(sorted(hs)) == [10, 12, 13]

def test_contains_1():
    hs = khmer.HashSet(5, [8, 10])
    assert 8 in hs
    assert 10 in hs
    assert 2**35 not in hs

def test_remove():
    hs = khmer.HashSet(5, [8, 10])
    assert len(hs) == 2

    hs.remove(8)
    assert len(hs) == 1
    assert list(hs) == [10]

def test_iter_single():
    hs = khmer.HashSet(5, [6])
    for k in hs:
        assert k == 6
        print(k)

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('seqfiles', nargs='+')
    parser.add_argument('-o', '--output', default=None)
    parser.add_argument('-k', '--ksize', default=DEFAULT_KSIZE, type=int)
    parser.add_argument('-x', '--tablesize', default=NODEGRAPH_SIZE,
                        type=float)
    parser.add_argument('--force', action='store_true')
    args = parser.parse_args()

    assert args.ksize % 2, "ksize must be odd"
    assert args.output, "you probably want an output file"

    print('building graphs and loading files')

    # Create graph, and two stop bloom filters - one for loading, one for
    # traversing. Create them all here so that we can error out quickly
    # if memory is a problem.
    graph = khmer.Nodegraph(args.ksize, args.tablesize, 2)
    print(graph.ksize(), graph.hashsizes())
    stop_bf = khmer.Nodegraph(args.ksize, args.tablesize, 2)
    stop_bf2 = khmer.Nodegraph(args.ksize, args.tablesize, 2)
    n = 0

    # load in all of the input sequences, one file at a time.
    for seqfile in args.seqfiles:
        for record in screed.open(seqfile):
            n += 1
            if n % 10000 == 0:
                print('...', seqfile, n)
            graph.consume(record.sequence)

    # complain if the graphs are too small for the data (i.e. the
    # false positive rate is too high).
    fp_rate = khmer.calc_expected_collisions(graph, args.force,
                                             max_false_pos=.05)

    # initialize the object that will track information for us.
    pathy = Pathfinder(args.ksize)

    print('finding high degree nodes')
    degree_nodes = khmer.HashSet(args.ksize)
    n = 0
    for seqfile in args.seqfiles:
        for record in screed.open(seqfile):
            n += 1
            if n % 10000 == 0:
                print('...2', seqfile, n)

            # walk across sequences, find all high degree nodes,
            # name them and cherish them. Don't do this on identical
            # sequences.
            if min(stop_bf2.get_kmer_counts(record.sequence)) == 0:
                stop_bf2.consume(record.sequence)
                degree_nodes += graph.find_high_degree_nodes(record.sequence)

    del stop_bf2

    if not len(degree_nodes):
        print('no high degree nodes; exiting.')
        sys.exit(0)

    # get all of the degree > 2 nodes and give them IDs.
    for node in degree_nodes:
        pathy.new_segment(node)

    print('traversing linear segments from', len(degree_nodes), 'nodes')

    # now traverse from each high degree node into all neighboring nodes,
    # seeking adjacencies. if a neighbor is a high degree node, add it to
    # adjacencies; if not, then traverse the linear path. also track
    # minhashes while we're at it.
    for n, k in enumerate(degree_nodes):
        if n % 10000 == 0:
            print('...', n, 'of', len(degree_nodes))

        # retrieve the segment ID of the primary node.
        k_id = pathy.segments_r[k]

        # find all the neighbors of this high-degree node.
        nbh = graph.neighbors(k)

        for nk in nbh:
            # neighbor is high degree? fine, mark its adjacencies.
            if nk in degree_nodes:
                nk_id = pathy.segments_r[nk]
                pathy.add_adjacency(k_id, nk_id)
            else:
                # linear! walk it.
                traverse_and_mark_linear_paths(graph, nk, stop_bf, pathy,
                                               degree_nodes)

    print(len(pathy.segments), 'segments, containing',
          sum(pathy.segments.values()), 'nodes')

    # save to GML.
    if args.output:
        print('saving to', args.output)
        fp = open(args.output, 'w')
        w = GmlWriter(fp, [], [])
        for k, v in pathy.segments.items():
            w.add_vertex(k, v, [])
        for k, v in pathy.adjacencies.items():
            for edge in v:
                w.add_edge(k, edge, [])

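# Sketch of the duplicate-sequence filter used in main() above: a
# sequence counts as unseen iff at least one of its k-mers is absent
# from the bloom filter, i.e. min(get_kmer_counts(...)) == 0. The
# sequence literal is made up.
def example_seen_filter():
    bf = khmer.Nodegraph(5, 1e4, 2)
    seq = 'ATGGACCAGATATAGGGAGAGCCAGG'
    assert min(bf.get_kmer_counts(seq)) == 0   # nothing consumed yet
    bf.consume(seq)
    assert min(bf.get_kmer_counts(seq)) > 0    # every k-mer now present
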
def test_iter_double():
    x = [6, 9, 20]
    hs = khmer.HashSet(5, x)
    for i, k in enumerate(hs):
        assert k == x[i], (k, x[i])

def main():
    p = build_counting_args(descr='Streaming assembly with tracking info')
    p.add_argument('fastq_files', nargs='+')
    p.add_argument('-o', type=argparse.FileType('w'),
                   default='assembly-stats.csv')
    args = p.parse_args()

    cg = create_countgraph(args)
    kept = 0
    hdn = khmer.HashSet(args.ksize)
    lh = khmer._GraphLabels(cg)
    next_label = 1
    next_orf = 1
    output = set()

    statswriter = csv.DictWriter(args.o, delimiter=',',
                                 fieldnames=['read_n', 'action', 'cov',
                                             'n_hdn', 'contig_n', 'orf_n',
                                             'new'])

    for filename in args.fastq_files:
        for n, record in enumerate(screed.open(filename)):
            if n and n % 10000 == 0:
                print('...', n, file=sys.stderr)

            if len(record.sequence) < args.ksize:
                continue

            cov, _, _ = cg.get_median_count(record.sequence)
            if cov < 20:
                kept += 1
                cg.consume(record.sequence)
                statswriter.writerow({'read_n': n, 'action': 'c',
                                      'cov': cov, 'n_hdn': None,
                                      'contig_n': None, 'orf_n': None,
                                      'new': None})
            elif cov < 30:
                # print('intermediate', next_label, file=sys.stderr)
                seq, pos = cg.trim_on_abundance(record.sequence, 3)
                if len(seq) < args.ksize:
                    continue

                cg.consume(seq)
                hdn = cg.find_high_degree_nodes(seq)
                lh.label_across_high_degree_nodes(seq, hdn, next_label)
                next_label += 1
                statswriter.writerow({'read_n': n, 'action': 'l',
                                      'cov': cov, 'n_hdn': len(hdn),
                                      'contig_n': None, 'orf_n': None,
                                      'new': None})
            elif cov == 30:
                contigs = lh.assemble_labeled_path(
                    record.sequence[:args.ksize])
                for contig_n, contig in enumerate(contigs):
                    statswriter.writerow({'read_n': n, 'action': 'a',
                                          'cov': cov, 'n_hdn': None,
                                          'contig_n': contig_n,
                                          'orf_n': None, 'new': None})
                    for t in translate(contig):
                        for orf_n, o in enumerate(extract_orfs(t)):
                            if hash(o) not in output:
                                new = True
                                output.add(hash(o))
                                print('>orf%d\n%s' % (next_orf, o))
                                next_orf += 1
                            else:
                                new = False
                            statswriter.writerow({'read_n': n,
                                                  'action': 'a',
                                                  'cov': cov,
                                                  'n_hdn': None,
                                                  'contig_n': contig_n,
                                                  'orf_n': orf_n,
                                                  'new': new})

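# Key to the 'action' codes written to the CSV above (inferred from the
# branches): 'c' -- read consumed at low coverage (< 20); 'l' -- read
# trimmed and labeled at intermediate coverage (20-29); 'a' -- assembly
# attempted at coverage == 30, with one row per contig and one per ORF,
# where 'new' marks ORFs not emitted before.
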
def test_contains_2():
    hs = khmer.HashSet(5, [8, 10])
    assert khmer.reverse_hash(8, 5) in hs
    assert khmer.reverse_hash(10, 5) in hs
    assert khmer.reverse_hash(2**35, 5) not in hs

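# A complementary round-trip sketch (an assumption about the reversible
# two-bit DNA hashing these tests rely on for small odd k):
# Nodegraph.hash() maps the string from khmer.reverse_hash() back to the
# original integer, which is why the string containment checks above
# agree with the integer ones.
def example_hash_roundtrip():
    K = 5
    graph = khmer.Nodegraph(K, 1e4, 2)
    for h in (8, 10):
        kmer = khmer.reverse_hash(h, K)   # integer hash -> k-mer string
        assert graph.hash(kmer) == h      # k-mer string -> same integer
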