def test_complete_no_collision():
    """Fill a 4-mer countgraph sized 4 ** 4 and check how it reads back.

    Every hash value below the first table size is decoded to a k-mer and
    counted once.  The table is then queried both by k-mer string and by raw
    integer, and the three tallies are compared against the expected totals.
    """
    ksize = 4
    kh = khmer._Countgraph(ksize, [4 ** 4])
    n_entries = kh.hashsizes()[0]

    for hashval in range(n_entries):
        kh.count(khmer.reverse_hash(hashval, ksize))

    n_palindromes = 0
    n_rc_filled = 0
    n_fwd_filled = 0
    for hashval in range(n_entries):
        kmer = khmer.reverse_hash(hashval, ksize)
        # string hashing is rc aware
        if kh.get(kmer):
            n_rc_filled += 1
        # palindromes are singular
        if kh.get(kmer) == 1:
            n_palindromes += 1
        # int hashing is not rc aware
        if kh.get(hashval):
            n_fwd_filled += 1

    assert n_rc_filled == n_entries, n_rc_filled
    assert n_palindromes == 16, n_palindromes
    assert n_fwd_filled == n_entries // 2 + n_palindromes // 2, \
        (n_fwd_filled, n_entries // 2 + n_palindromes // 2)
def explore(ht, start_kmer, K):
    """Walk the k-mer graph reachable from ``start_kmer`` and record its edges.

    Starting from the hash of ``start_kmer``, repeatedly pop a discovered
    k-mer, look at the neighbours returned by ``get_neighbors`` and keep those
    that ``ht`` reports as present.  Every kept k-mer hash is given a
    sequential integer id (starting at 1) the first time it is seen.

    Args:
        ht: table queried with ``ht.get(kmer_string)``; a truthy result means
            the k-mer is present.
        start_kmer: k-mer string to start from.
        K: k-mer size passed to ``khmer.forward_hash`` / ``reverse_hash``.

    Returns:
        ``(hash_ids, edges)`` where ``hash_ids`` maps k-mer hash -> id and
        ``edges`` is a set of sorted ``(id, id)`` tuples.  Both are empty when
        the start k-mer is not present in ``ht``.
    """
    edges = set()
    discovered = set()
    explored = set()
    hash_ids = {}

    start_kmer_hash = khmer.forward_hash(start_kmer, K)
    if not ht.get(khmer.reverse_hash(start_kmer_hash, K)):
        return hash_ids, edges

    discovered.add(start_kmer_hash)
    hash_ids[start_kmer_hash] = len(hash_ids) + 1

    # The walk stops once the frontier is empty or 2,000,000 k-mers have
    # been explored, whichever comes first.
    while discovered and len(explored) < 2000000:
        kmer_hash = discovered.pop()
        kmer_neighbors = get_neighbors(kmer_hash, K)
        explored.add(kmer_hash)

        for neigh_hash in kmer_neighbors:
            # Query the table once per neighbour; absent k-mers contribute
            # neither a node nor an edge.
            if not ht.get(khmer.reverse_hash(neigh_hash, K)):
                continue

            if neigh_hash not in explored and neigh_hash not in discovered:
                discovered.add(neigh_hash)
                hash_ids[neigh_hash] = len(hash_ids) + 1

            # Present neighbours always yield an edge, whether they are new
            # or already known (this includes self-loops).
            edges.add(
                tuple(sorted([hash_ids[neigh_hash], hash_ids[kmer_hash]])))

    return hash_ids, edges
def explore(ht, start_kmer, K):
    """Traverse the k-mers connected to ``start_kmer`` and collect id'd edges.

    A k-mer counts as part of the graph when ``ht.get`` returns a truthy
    value for its string form.  Each such k-mer hash receives a 1-based
    integer id in order of discovery, and every pair of adjacent present
    k-mers is recorded as a sorted id tuple.

    Args:
        ht: table supporting ``get(kmer_string)``.
        start_kmer: string of the k-mer where traversal begins.
        K: k-mer size used for hashing and un-hashing.

    Returns:
        Tuple ``(hash_ids, edges)``: a dict from k-mer hash to id, and a set
        of ``(smaller_id, larger_id)`` tuples.  Both are empty if the start
        k-mer is absent from ``ht``.
    """
    edges = set()
    discovered = set()
    explored = set()
    hash_ids = {}

    start_kmer_hash = khmer.forward_hash(start_kmer, K)
    if not ht.get(khmer.reverse_hash(start_kmer_hash, K)):
        return hash_ids, edges

    discovered.add(start_kmer_hash)
    hash_ids[start_kmer_hash] = len(hash_ids) + 1

    # Traversal ends when nothing is left to visit or after 2,000,000
    # k-mers have been explored.
    while discovered and len(explored) < 2000000:
        kmer_hash = discovered.pop()
        kmer_neighbors = get_neighbors(kmer_hash, K)
        explored.add(kmer_hash)
        current_id = hash_ids[kmer_hash]

        for neigh_hash in kmer_neighbors:
            present = ht.get(khmer.reverse_hash(neigh_hash, K))
            if not present:
                continue

            # Every hash placed in `discovered` gets an id at the same time
            # and only moves on to `explored`, so "has an id" is the same as
            # "already discovered or explored".
            if neigh_hash not in hash_ids:
                discovered.add(neigh_hash)
                hash_ids[neigh_hash] = len(hash_ids) + 1

            edges.add(tuple(sorted([hash_ids[neigh_hash], current_id])))

    return hash_ids, edges
def test_complete_no_collision():
    """Check occupancy of a 4-mer countgraph with a single table of 4**4.

    Each index below the first table size is turned into a k-mer string and
    counted; the table is then read back by string and by integer index.
    """
    kh = khmer._Countgraph(4, [4**4])
    n_entries = kh.hashsizes()[0]

    all_indices = range(0, n_entries)
    for idx in all_indices:
        decoded = khmer.reverse_hash(idx, 4)
        kh.count(decoded)

    n_palindromes = n_rc_filled = n_fwd_filled = 0
    for idx in all_indices:
        decoded = khmer.reverse_hash(idx, 4)

        if kh.get(decoded):        # string hashing is rc aware
            n_rc_filled += 1
        if kh.get(decoded) == 1:   # palindromes are singular
            n_palindromes += 1
        if kh.get(idx):            # int hashing is not rc aware
            n_fwd_filled += 1

    assert n_rc_filled == n_entries, n_rc_filled
    assert n_palindromes == 16, n_palindromes
    assert n_fwd_filled == n_entries // 2 + n_palindromes // 2, \
        (n_fwd_filled, n_entries // 2 + n_palindromes // 2)
def test_reverse_hash():
    """Check that selected 4-mer hash values decode to the expected strings."""
    expected_kmers = (
        (0, "AAAA"),
        (85, "TTTT"),
        (170, "CCCC"),
        (255, "GGGG"),
    )
    for hashval, kmer in expected_kmers:
        s = khmer.reverse_hash(hashval, 4)
        assert s == kmer
def test_complete_4_collision():
    """Count into a ``_CountingHash(4, [3])`` and read back 64 hash values.

    Every index below ``kh.n_entries()`` is decoded to a 4-mer and counted;
    afterwards the k-mers for hash values 0..63 must all have a non-zero
    count when looked up by string.
    """
    kh = khmer._CountingHash(4, [3])

    for idx in range(0, kh.n_entries()):
        kh.count(khmer.reverse_hash(idx, 4))

    # Only the string lookup is checked here (string hashing is rc aware);
    # the integer lookup is not exercised by this test.
    n_rc_filled = 0
    for idx in range(0, 64):
        kmer = khmer.reverse_hash(idx, 4)
        if kh.get(kmer):
            n_rc_filled += 1

    assert n_rc_filled == 64, n_rc_filled
def test_complete_2_collision():
    """Count into ``khmer.new_hashtable(4, 4)`` and read back 128 hash values.

    Each index below ``kh.n_entries()`` is decoded to a 4-mer and counted;
    the k-mers for hash values 0..127 must then all report a non-zero count
    when queried by string.
    """
    kh = khmer.new_hashtable(4, 4)

    for idx in range(0, kh.n_entries()):
        kh.count(khmer.reverse_hash(idx, 4))

    # Only the string lookup is checked (string hashing is rc aware); the
    # integer lookup is not exercised by this test.
    n_rc_filled = 0
    for idx in range(0, 128):
        kmer = khmer.reverse_hash(idx, 4)
        if kh.get(kmer):
            n_rc_filled += 1

    assert n_rc_filled == 128, n_rc_filled
def test_reverse_hash_longs():
    """Check ``reverse_hash`` when the hash value is passed through ``long``.

    Only relevant on Python 2, where ``long`` is a distinct builtin; on
    Python 3 the module-level name ``long`` is bound to ``int`` first.
    """
    # the builtin `long` exists in the global scope only
    global long  # pylint: disable=global-variable-undefined
    if sys.version_info > (3,):
        long = int

    expected_kmers = (
        (0, "AAAA"),
        (85, "TTTT"),
        (170, "CCCC"),
        (255, "GGGG"),
    )
    for hashval, kmer in expected_kmers:
        s = khmer.reverse_hash(long(hashval), 4)
        assert s == kmer
def test_complete_2_collision():
    """Count into a ``_Countgraph(4, [5])`` and read back 128 hash values.

    Every index below the first table size is decoded to a 4-mer and
    counted; the k-mers for hash values 0..127 must then all report a
    non-zero count when queried by string.
    """
    kh = khmer._Countgraph(4, [5])
    n_entries = kh.hashsizes()[0]

    for idx in range(0, n_entries):
        kh.count(khmer.reverse_hash(idx, 4))

    # Only the string lookup is checked (string hashing is rc aware); the
    # integer lookup is not exercised by this test.
    n_rc_filled = 0
    for idx in range(0, 128):
        kmer = khmer.reverse_hash(idx, 4)
        if kh.get(kmer):
            n_rc_filled += 1

    assert n_rc_filled == 128, n_rc_filled
def get_neighbors(kmer_hash, K):
    """Return the set of hashes of k-mers adjacent to ``kmer_hash``.

    The k-mer is decoded to a string; for every entry of the module-level
    ``bases`` two candidates are hashed: the base prepended to the k-mer
    minus its last character, and the base appended to the k-mer minus its
    first character.
    """
    kmer = khmer.reverse_hash(kmer_hash, K)
    prefix = kmer[:-1]   # everything but the last character
    suffix = kmer[1:]    # everything but the first character

    neighbor_hashes = set()
    for base in bases:
        neighbor_hashes.add(khmer.forward_hash(base + prefix, K))
        neighbor_hashes.add(khmer.forward_hash(suffix + base, K))
    return neighbor_hashes
def get_neighbors(kmer_hash, K):
    """Compute the hashes of all one-base shifts of the given k-mer.

    For each entry in the module-level ``bases`` this hashes the k-mer
    shifted right (base + first K-1 characters) and shifted left (last K-1
    characters + base), returning the distinct hash values as a set.
    """
    kmer = khmer.reverse_hash(kmer_hash, K)
    last = len(kmer) - 1
    head = kmer[0:last]
    tail = kmer[1:]

    hashes = []
    for base in bases:
        shifted_right = base + head
        hashes.append(khmer.forward_hash(shifted_right, K))
        shifted_left = tail + base
        hashes.append(khmer.forward_hash(shifted_left, K))

    return set(hashes)
def test_complete_no_collision():
    """Fill ``khmer.new_hashtable(4, 4 ** 2)`` and check how it reads back.

    Every index below ``kh.n_entries()`` is decoded to a 4-mer and counted
    once; the table is then queried by string and by integer index.
    """
    kh = khmer.new_hashtable(4, 4 ** 2)

    for idx in range(0, kh.n_entries()):
        kh.count(khmer.reverse_hash(idx, 4))

    n_palindromes = 0
    n_rc_filled = 0
    n_fwd_filled = 0
    for idx in range(0, kh.n_entries()):
        kmer = khmer.reverse_hash(idx, 4)
        # string hashing is rc aware
        if kh.get(kmer):
            n_rc_filled += 1
        # palindromes are singular
        if kh.get(kmer) == 1:
            n_palindromes += 1
        # int hashing is not rc aware
        if kh.get(idx):
            n_fwd_filled += 1

    assert n_rc_filled == kh.n_entries(), n_rc_filled
    assert n_palindromes == 16, n_palindromes  # @CTB check this
    assert n_fwd_filled == kh.n_entries() // 2 + n_palindromes // 2, \
        n_fwd_filled
def test_complete_no_collision():
    """Fill ``khmer.new_hashtable(4, 4**2)`` and check how it reads back.

    Every index below ``kh.n_entries()`` is decoded to a 4-mer and counted
    once.  The table is then queried by k-mer string and by integer index,
    and the tallies are compared against the expected totals.
    """
    kh = khmer.new_hashtable(4, 4**2)

    for i in range(0, kh.n_entries()):
        s = khmer.reverse_hash(i, 4)
        kh.count(s)

    n_palindromes = 0
    n_rc_filled = 0
    n_fwd_filled = 0

    for i in range(0, kh.n_entries()):
        s = khmer.reverse_hash(i, 4)
        if kh.get(s):        # string hashing is rc aware
            n_rc_filled += 1
        if kh.get(s) == 1:   # palindromes are singular
            n_palindromes += 1
        if kh.get(i):        # int hashing is not rc aware
            n_fwd_filled += 1

    assert n_rc_filled == kh.n_entries(), n_rc_filled
    assert n_palindromes == 16, n_palindromes  # @CTB check this
    # Floor division keeps the expectation an integer on Python 3, where
    # `/` would produce a float.
    assert n_fwd_filled == kh.n_entries() // 2 + n_palindromes // 2, \
        n_fwd_filled
def explore(ht, start_kmer, K):
    """Count the k-mers reachable from ``start_kmer`` that are present in ``ht``.

    Starting from the hash of ``start_kmer``, repeatedly pop a discovered
    k-mer, mark it explored, and add those of its neighbours (from
    ``get_neighbors``) for which ``ht.get`` returns a truthy value and which
    have not been seen yet.

    Args:
        ht: table queried with ``ht.get(kmer_string)``.
        start_kmer: k-mer string to start from.
        K: k-mer size passed to ``khmer.forward_hash`` / ``reverse_hash``.

    Returns:
        The number of explored k-mers, or 0 when the start k-mer is not
        present in ``ht``.
    """
    discovered = set()
    explored = set()

    start_kmer_hash = khmer.forward_hash(start_kmer, K)
    # Look the start k-mer up the same way neighbours are looked up below.
    # (The previous code queried an undefined name and raised NameError.)
    if not ht.get(khmer.reverse_hash(start_kmer_hash, K)):
        return 0
    discovered.add(start_kmer_hash)

    # Stop when the frontier is empty or 2,000,000 k-mers were explored.
    while discovered and len(explored) < 2000000:
        kmer_hash = discovered.pop()
        kmer_neighbors = get_neighbors(kmer_hash, K)
        explored.add(kmer_hash)

        for neigh_hash in kmer_neighbors:
            # Cheap set checks first; only query the table for unseen hashes.
            if neigh_hash in explored or neigh_hash in discovered:
                continue
            if ht.get(khmer.reverse_hash(neigh_hash, K)):
                discovered.add(neigh_hash)

    return len(explored)
def _visit_kmer(kmer, ht, K, ht2, degs, pending):
    """Record the degree of ``kmer`` and queue its not-yet-counted neighbours.

    Returns the updated ``degs`` as produced by ``add_deg``.
    """
    neighs = find_neighbors(kmer, ht)
    degs = add_deg(degs, len(neighs))
    for neigh in neighs:
        neigh_hash = khmer.forward_hash(neigh, K)
        if not ht2.get(neigh):
            pending.append(neigh_hash)
            # Count immediately so the same neighbour is not queued twice.
            ht2.count(neigh)
    return degs


def get_all_kmers(ht, start_kmer, K, ht2, degs):
    """Visit every k-mer connected to ``start_kmer`` and tally node degrees.

    ``ht2`` acts as the "already visited" table: a k-mer is counted in it the
    moment it is queued.  For each visited k-mer, the number of neighbours
    reported by ``find_neighbors(kmer, ht)`` is folded into ``degs`` through
    ``add_deg``.

    Args:
        ht: table passed to ``find_neighbors`` to look up neighbours.
        start_kmer: k-mer string to start from.
        K: k-mer size used for hashing and un-hashing.
        ht2: table used to mark visited k-mers (``get`` / ``count``).
        degs: degree accumulator updated through ``add_deg``.

    Returns:
        ``(ht2, degs)``.  If the start k-mer was already counted in ``ht2``
        both are returned unchanged.
    """
    start_kmer_hash = khmer.forward_hash(start_kmer, K)
    if ht2.get(start_kmer_hash):
        return ht2, degs
    ht2.count(start_kmer)

    # Used as a stack: the most recently queued hash is visited next.
    pending = []
    degs = _visit_kmer(start_kmer, ht, K, ht2, degs, pending)

    while pending:
        kmer = khmer.reverse_hash(pending.pop(), K)
        degs = _visit_kmer(kmer, ht, K, ht2, degs, pending)

    # Return the same pair as the early exit above, so callers get a
    # consistent result on both paths.
    return ht2, degs
def test_contains_2():
    """Check string membership in a ``HashSet`` built from hash values 8 and 10."""
    ksize = 5
    hs = khmer.HashSet(ksize, [8, 10])

    for member in (8, 10):
        assert khmer.reverse_hash(member, ksize) in hs

    assert khmer.reverse_hash(2**35, ksize) not in hs
def run(args):
    """Build a compact graph from sequence files and write it to disk.

    Steps, in order:

    1. Pick the output directory (``args.output`` or a name derived from the
       single input file) and create it.
    2. Load a nodegraph from ``args.loadgraph`` or build one by consuming
       every record of ``args.seqfiles``; create a matching stop filter.
    3. Re-read the inputs to collect high-degree nodes and candidate starts
       of linear paths (optionally labelling high-degree nodes per record).
    4. Traverse linear segments from the high-degree nodes and from the
       candidate linear starts via ``traverse_and_mark_linear_paths``.
    5. De-duplicate the adjacency list written by the ``Pathfinder``, write
       the graph file with ``write`` and, if requested, the label list.

    Args:
        args: parsed command-line namespace.  Attributes read here:
            ``memory``, ``ksize``, ``label``, ``output``, ``seqfiles``,
            ``loadgraph``, ``force`` and ``no_assemble``.  ``args.output`` is
            overwritten with the directory actually used.

    Raises:
        AssertionError: if ``args.ksize`` is even, or if labelling was
            requested but no labels were collected.
        SystemExit: if several input files are given without ``args.output``.
    """
    # @CTB this is kind of a hack - nothing tricky going on, just want to
    # specify memory on the command line rather than graph size...
    graph_tablesize = int(args.memory * 8.0 / 4.0)

    assert args.ksize % 2, "ksize must be odd"

    if args.label:
        label_list = []

    output_dir = args.output
    if not output_dir:
        if len(args.seqfiles) > 1:
            print('** please specify an output directory with -o',
                  file=sys.stderr)
            sys.exit(-1)

        output_dir = os.path.basename(args.seqfiles[0])
        if output_dir.endswith('.fa'):
            output_dir = output_dir[:-3]
        elif output_dir.endswith('.fa.gz'):
            output_dir = output_dir[:-6]

    # set this so we can read it for logging
    args.output = output_dir

    gxtfile = os.path.join(output_dir, "cdbg.gxt")
    contigfile = os.path.join(output_dir, "contigs.fa.gz")

    print('')
    print('placing output in directory:', output_dir)
    print('gxt will be:', gxtfile)
    try:
        os.mkdir(output_dir)
    except FileExistsError:
        print('(note: directory already exists)')

    print('')
    if args.loadgraph:
        print('loading nodegraph from:', args.loadgraph)
        graph = khmer.Nodegraph.load(args.loadgraph)
        print('creating accompanying stopgraph')
        ksize = graph.ksize()
        hashsizes = graph.hashsizes()
        stop_bf = khmer.Nodegraph(ksize, 1, 1, primes=hashsizes)
    else:
        print('building graphs and loading files')

        # Create graph and a stop bloom filter - one for loading, one for
        # traversing. Create them all here so that we can error out quickly
        # if memory is a problem.

        # @CTB note that hardcoding '2' here is not nec a great idea.
        graph = khmer.Nodegraph(args.ksize, graph_tablesize, 2)
        stop_bf = khmer.Nodegraph(args.ksize, graph_tablesize, 2)
        n = 0

        # load in all of the input sequences, one file at a time.
        for seqfile in args.seqfiles:
            fp = screed.open(seqfile)
            try:
                for record in khmer.utils.clean_input_reads(fp):
                    if len(record.cleaned_seq) < graph.ksize():
                        continue
                    n += 1
                    if n % 100000 == 0:
                        print('...', seqfile, n)
                    graph.consume(record.cleaned_seq)
            finally:
                # close the reader even if a record fails to load.
                fp.close()

        # complain if too small set of graphs was used.
        # (the returned rate itself is not used here.)
        khmer.calc_expected_collisions(graph, args.force, max_false_pos=.05)

    ksize = graph.ksize()

    # initialize the object that will track information for us.
    pathy = Pathfinder(ksize, gxtfile, contigfile, not args.no_assemble)

    print('finding high degree nodes')
    if args.label:
        print('(and labeling them, per request)')
    degree_nodes = khmer.HashSet(ksize)
    linear_starts = khmer.HashSet(ksize)
    n = 0
    skipped = 0
    for seqfile in args.seqfiles:
        fp = screed.open(seqfile)
        try:
            for record in khmer.utils.clean_input_reads(fp):
                if len(record.cleaned_seq) < ksize:
                    skipped += 1
                    continue
                n += 1
                if n % 100000 == 0:
                    print('...2', seqfile, n)

                # walk across sequences, find all high degree nodes,
                # name them and cherish them.
                these_hdn = graph.find_high_degree_nodes(record.cleaned_seq)
                if these_hdn:
                    degree_nodes += these_hdn
                else:
                    # possible linear node? check first and last k-mer.
                    # (the logic here is that every purely linear node must
                    # start or end in *some* record.sequence - so where we
                    # have record sequences that have only 1 neighbor, those
                    # will be all possible linear nodes.)
                    # NOTE(review): these slices use record.sequence while
                    # the length check above uses record.cleaned_seq -
                    # confirm this is intended.
                    first_kmer = record.sequence[:ksize]
                    last_kmer = record.sequence[-ksize:]
                    assert len(last_kmer) == ksize

                    if len(graph.neighbors(first_kmer)) == 1:
                        linear_starts.add(graph.hash(first_kmer))
                    if len(graph.neighbors(last_kmer)) == 1:
                        linear_starts.add(graph.hash(last_kmer))

                if args.label:
                    label_list.append(record.name)
                    for kmer in these_hdn:
                        pathy.add_label(kmer, n)
        finally:
            # close the reader even if processing a record fails.
            fp.close()

    print('read {}, skipped {} for being too short'.format(n, skipped))

    # get all of the degree > 2 kmers and give them IDs.
    for kmer in degree_nodes:
        pathy.new_hdn(kmer)
        stop_bf.add(kmer)

    print('traversing linear segments from', len(degree_nodes), 'nodes')

    # now traverse from each high degree node into all neighboring nodes,
    # seeking adjacencies.  if neighbor is high degree node, add it to
    # adjacencies; if neighbor is not, then traverse the linear path &
    # assemble if desired.
    for n, k in enumerate(degree_nodes):
        if n % 10000 == 0:
            print('...', n, 'of', len(degree_nodes))

        # retrieve the node ID of the primary segment.
        k_id = pathy.kmers_to_nodes[k]

        # here is where we would output this k-mer to the contig file if we
        # wanted to.
        nk_id = pathy.kmers_to_nodes[k]
        k_str = khmer.reverse_hash(k, ksize)
        pathy.add_assembly(nk_id, k_str)

        # find all the neighbors of this high-degree node.
        nbh = graph.neighbors(k)
        for nk in nbh:
            # neighbor is high degree? fine, mark its adjacencies.
            if nk in degree_nodes:
                # NOTE(review): relies on neighbors exposing `.kmer_u` -
                # confirm against the khmer version in use.
                nk_id = pathy.kmers_to_nodes[nk.kmer_u]
                pathy.add_adjacency(k_id, nk_id)
            else:
                # linear! walk it.
                traverse_and_mark_linear_paths(graph, nk, stop_bf, pathy,
                                               degree_nodes)

    # now, clean up at the end -- make sure we've hit all the possible
    # linear nodes.
    print('traversing from {} potential linear starts'.format(
        len(linear_starts)))
    for n, k in enumerate(linear_starts):
        traverse_and_mark_linear_paths(graph, k, stop_bf, pathy, degree_nodes)

    print('{} linear segments and {} high-degree nodes'.format(
        pathy.node_counter, len(pathy.nodes)))

    # release the graphs before the output phase.
    del graph
    del stop_bf

    # save to GXT.
    print('saving gxtfile', gxtfile)

    # NOTE(review): these two are never filled in within this function, so
    # the label summary printed below always reports empty values.
    all_labels = set()
    label_counts = {}

    pathy.adjfp.close()

    # this uniqifies the edges...
    with open(gxtfile + '.adj', 'rt') as adj_fp:
        for line in adj_fp:
            a, b = line.split(',')
            a = int(a)
            b = int(b)
            pathy.adjacencies[a].add(b)

    # best-effort cleanup: only filesystem errors are tolerated here.
    try:
        os.unlink(gxtfile + '.adj')
    except OSError:
        print('cannot remove', gxtfile + '.adj')

    # ...and now print them out.
    edges = []
    for k, v in pathy.adjacencies.items():
        for dest in v:
            # don't add loops
            if k != dest:
                edges.append((k, dest))

    with open(gxtfile, 'wt') as fp:
        write(fp, pathy.node_counter, edges)

    if not args.no_assemble:
        pathy.assemblyfp.close()

    if args.label:
        print('note: used/assigned %d labels total' % (len(set(all_labels)), ))
        print('counts:', label_counts)

        assert label_list
        print('dumping label list now.')
        label_file = os.path.basename(output_dir) + '.labels.txt'
        label_file = os.path.join(output_dir, label_file)

        with open(label_file, "wt") as fp:
            for n, label in enumerate(label_list):
                fp.write("{} {}\n".format(n, label))
def test_reverse_hash_raises():
    """Passing a string as the hash value must raise a ``TypeError`` mentioning 'int'."""
    with pytest.raises(TypeError) as raised:
        khmer.reverse_hash('2345', 4)

    message = str(raised.value)
    assert 'int' in message