def main():
    p = argparse.ArgumentParser()
    p.add_argument('A')
    p.add_argument('B')
    #p.add_argument('C')
    p.add_argument('output')
    p.add_argument('-k', '--ksize', default=31, type=int)
    args = p.parse_args()

    # for potato, would suggest changing 1e6 to 4e9. Only matters for A-B hash.
    a_table = khmer.Nodetable(args.ksize, 10e9, 4)
    b_table = khmer.Nodetable(args.ksize, 10e9, 4)

    b_table.consume_seqfile(args.B)

    n_hashes_loaded = 0
    for record in khmer.utils.clean_input_reads(screed.open(args.A)):
        for hash in a_table.get_kmer_hashes(record.cleaned_seq):
            if not b_table.get(hash):
                a_table.add(hash)
                n_hashes_loaded += 1

    print('loaded {} k-mers in A but not in B'.format(n_hashes_loaded))

    # at this point, a_table contains A - B

    out_table = open(args.output, 'w')
    head = 'sample\ttotal-read\thap-kmer-read\ttotal-kmer\tkmer-hits\n'
    out_line = ''               # one string to rule them all

    for fastq in fastqGrab():   # fastqGrab() is defined elsewhere in the original script
        print('Now querying {}'.format(fastq))
        hitcount = 0            # initialize hitcount
        hashcount = 0           # initialize hashcount
        out_fp = open(fastq + '.subset', 'w')
        n_read = 0
        n_written = 0

        for record in khmer.utils.clean_input_reads(screed.open(fastq)):
            n_read += 1
            match = 0           # adopt 0 state for no hits in read, change to 1 when match in read
            for hash in a_table.get_kmer_hashes(record.cleaned_seq):
                hashcount += 1
                if a_table.get(hash):
                    match = 1           # switch to state "match"
                    hitcount += 1       # tally k-mer hit
            if match == 1:              # write to file if switch tripped
                khmer.utils.write_record(record, out_fp)
                n_written += 1
                #break

        print('wrote {} of {} records for {}'.format(n_written, n_read, fastq))
        out_line += fastq + '\t' + str(n_read) + '\t' + str(n_written) + \
            '\t' + str(hashcount) + '\t' + str(hitcount) + '\n'

    out_table.write(head + out_line)
def __init__(self, query_file, ksize, scaled, catlas_name, debug=True):
    self.filename = query_file
    self.ksize = ksize
    self.kmers = set()
    self.name = None
    mh = MinHash(0, ksize, scaled=scaled)
    self.mh = mh
    self.catlas_name = catlas_name
    self.debug = debug

    notify('----')
    notify('QUERY FILE: {}', self.filename)

    # build hashes for all the query k-mers & create signature
    notify('loading query kmers...', end=' ')
    bf = khmer.Nodetable(ksize, 1, 1)

    for record in screed.open(self.filename):
        if self.name is None:
            self.name = record.name
        if len(record.sequence) >= int(ksize):
            self.kmers.update(bf.get_kmer_hashes(record.sequence))
            mh.add_sequence(record.sequence, True)

    self.sig = sourmash.SourmashSignature(mh, name=self.name,
                                          filename=self.filename)

    notify('got {} k-mers from query', len(self.kmers))

    self.cdbg_match_counts = {}
    self.catlas_match_counts = {}
def load_mask(maskfiles, ksize, memory, maxfpr=0.001, savefile=None,
              logfile=sys.stderr):
    """Load reference genome and/or contaminant database from a file."""
    if len(maskfiles) == 1 and maskfiles[0].endswith(('.nt', '.nodetable')):
        mask = kevlar.sketch.load(maskfiles[0])
        message = ' nodetable loaded'
    else:
        buckets = memory * khmer._buckets_per_byte['nodegraph'] / 4
        mask = khmer.Nodetable(ksize, buckets, 4)
        nr, nk = 0, 0
        for maskfile in maskfiles:
            numreads, numkmers = mask.consume_seqfile(maskfile)
            nr += numreads
            nk += numkmers
        message = ' {:d} sequences and {:d} k-mers consumed'.format(nr, nk)
    fpr = kevlar.sketch.estimate_fpr(mask)
    message += '; estimated false positive rate is {:1.3f}'.format(fpr)
    print(message, file=logfile)
    if fpr > maxfpr:
        print('[kevlar::filter] FPR too high, bailing out', file=logfile)
        sys.exit(1)
    if savefile:
        mask.save(savefile)
        message = ' nodetable saved to "{:s}"'.format(savefile)
        print(message, file=logfile)
    return mask
def count_second_pass(infiles, counts, nthreads=1):
    kevlar.plog('[kevlar::dist] Second pass over the data')
    tracking = khmer.Nodetable(counts.ksize(), 1, 1, primes=counts.hashsizes())
    abund_lists = list()

    def __do_abund_dist(parser):
        abund = counts.abundance_distribution(parser, tracking)
        abund_lists.append(abund)

    for filename in infiles:
        kevlar.plog(' -', filename)
        parser = khmer.ReadParser(filename)
        threads = list()
        for _ in range(nthreads):
            thread = threading.Thread(target=__do_abund_dist, args=(parser, ))
            threads.append(thread)
            thread.start()
        for thread in threads:
            thread.join()

    assert len(abund_lists) == len(infiles) * nthreads
    abundance = defaultdict(int)
    for abund in abund_lists:
        for i, count in enumerate(abund):
            if i > 0 and count > 0:
                abundance[i] += count

    kevlar.plog('[kevlar::dist] Done second pass over input!')
    return abundance
def main(argv):
    p = argparse.ArgumentParser()
    p.add_argument('catlas_prefix')
    p.add_argument('-k', '--ksize', default=31, type=int)
    a = p.parse_args(argv)

    kh = khmer.Nodetable(a.ksize, 1, 1)

    contigs_filename = os.path.join(a.catlas_prefix, 'contigs.fa.gz')
    mphf_filename = os.path.join(a.catlas_prefix, 'contigs.fa.gz.mphf')
    array_filename = os.path.join(a.catlas_prefix, 'contigs.fa.gz.indices')

    def create_records_iter():
        print('reading cDBG nodes from {}'.format(contigs_filename))
        return screed.open(contigs_filename)

    x, mphf_to_kmer, mphf_to_cdbg, sizes = build_mphf(kh, create_records_iter)

    print('done! saving to {} and {}'.format(mphf_filename, array_filename))

    x.save(mphf_filename)
    with open(array_filename, 'wb') as fp:
        numpy.savez_compressed(fp,
                               mphf_to_kmer=mphf_to_kmer,
                               kmer_to_cdbg=mphf_to_cdbg,
                               sizes=sizes)
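# A hedged companion sketch (not from the original script): once the MPHF and
# index arrays are saved as above, a query-side loader can map a sequence's
# k-mer hashes back to cDBG node IDs. The bbhash.load_mphf() call and the
# exactness check against mphf_to_kmer mirror the pattern used in other
# snippets in this collection; the file layout is assumed to match what
# numpy.savez_compressed wrote above, and load_and_query() is an illustrative
# helper name, not part of the original code.
import bbhash
import khmer
import numpy


def load_and_query(catlas_prefix, sequence, ksize=31):
    mphf = bbhash.load_mphf(catlas_prefix + '/contigs.fa.gz.mphf')
    with open(catlas_prefix + '/contigs.fa.gz.indices', 'rb') as fp:
        arrays = numpy.load(fp)
        mphf_to_kmer = arrays['mphf_to_kmer']
        kmer_to_cdbg = arrays['kmer_to_cdbg']

    kh = khmer.Nodetable(ksize, 1, 1)
    cdbg_ids = set()
    for hashval in kh.get_kmer_hashes(sequence):
        idx = mphf.lookup(hashval)
        if idx is None:                   # k-mer hash not in the MPHF at all
            continue
        if mphf_to_kmer[idx] != hashval:  # MPHF is minimal/perfect only for
            continue                      # indexed k-mers, so verify exactness
        cdbg_ids.add(int(kmer_to_cdbg[idx]))
    return cdbg_ids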
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('database')
    parser.add_argument('reads')
    parser.add_argument('-k', '--ksize', type=int, default=31)
    args = parser.parse_args()

    if args.reads == '-':
        args.reads = sys.stdin

    kh = khmer.Nodetable(args.ksize, 1, 1)

    print('loading database {}'.format(args.database))
    with open(args.database, 'rb') as fp:
        family_ids, kmer_to_family_id = pickle.load(fp)
    print('done!')

    n_same = 0
    n_different = 0

    n = 0
    for record in screed.open(args.reads):
        n += 1
        if n % 1000 == 0:
            print('...', n)
        if n > 5000:
            break

        hashvals = kh.get_kmer_hashes(record.sequence)
        if len(hashvals) <= 1:
            continue

        first = hashvals[0]
        last = hashvals[-1]

        # find the first unambiguously assigned k-mer
        first_ids = kmer_to_family_id.get(first, set())
        idx = 1
        while idx < len(hashvals) / 2 and len(first_ids) != 1:
            first = hashvals[idx]
            first_ids = kmer_to_family_id.get(first, set())
            idx += 1

        # find the last unambiguously assigned k-mer
        last_ids = kmer_to_family_id.get(last, set())
        idx = len(hashvals) - 2
        while idx > len(hashvals) / 2 and len(last_ids) != 1:
            last = hashvals[idx]
            last_ids = kmer_to_family_id.get(last, set())
            idx -= 1

        if len(first_ids) == 1 and len(last_ids) == 1 and \
           first_ids == last_ids:
            n_same += 1
        else:
            print('different {} {}'.format(first_ids, last_ids))
            n_different += 1

    print('same:', n_same)
    print('different:', n_different)
def test_nodegraph_vs_table():
    x = khmer.Nodetable(4, 21, 3)
    y = khmer.Nodegraph(4, 21, 3)

    assert hasattr(x, 'add')
    assert hasattr(y, 'add')

    assert not hasattr(x, 'consume_and_tag')
    assert hasattr(y, 'consume_and_tag')
def main(): parser = argparse.ArgumentParser() parser.add_argument("project", help="Project directory", type=str) parser.add_argument("-o", "--output", type=str) parser.add_argument('-k', '--ksize', default=31, type=int) args = parser.parse_args() # figure out catlas and domfile information. catlas_file = os.path.join(args.project, 'catlas.csv') domfile = os.path.join(args.project, 'first_doms.txt') # grab contigs print('loading contigs...') contigs_filename = os.path.join(args.project, 'contigs.fa.gz') contigs = {} for record in screed.open(contigs_filename): contigs[record.name] = record print('done. {} contigs.'.format(len(contigs))) # load catlas DAG catlas = CAtlas(catlas_file, domfile=domfile) print('loaded {} nodes from catlas {}'.format(len(catlas), catlas_file)) print('loaded {} layer 1 catlas nodes'.format(len(catlas.layer1_to_cdbg))) # create k-mer hashing machinery kh = khmer.Nodetable(args.ksize, 1, 1) # function to yield records as if reading from a file def create_records_iter(): return contigs.values() # BENCHMARK: # build the two indices (kmer to cdbg node, cdbg node ID to pieces) start = time.time() print('building MPHF index.') x, mphf_to_kmer, mphf_to_cdbg, sizes = build_mphf(kh, create_records_iter) print('done! {:.1f}s.'.format(time.time() - start)) print('building index 2 (cdbg node ID to pieces)') cdbg_to_pieces = defaultdict(set) for node_id in catlas: level = catlas.levels[node_id] if level == 1: pieces = catlas.layer1_to_cdbg.get(node_id) for cdbg_node in pieces: cdbg_to_pieces[cdbg_node] = set(pieces) end = time.time() # done. output. outfp = sys.stdout if args.output: outfp = open(args.output, 'at') print("{},{},{:.1f},{},indexPieces".format(len(mphf_to_kmer), len(catlas), end - start, args.project), file=outfp)
def test_validate_with_mask():
    kmer = 'AGGGGCGTGACTTAATAAG'
    mask = khmer.Nodetable(19, 1e3, 2)
    mask.add(kmer)

    filelist = kevlar.tests.data_glob('collect.beta.?.txt')
    readset, countgraph = kevlar.filter.load_input(filelist, 19, 5e3)
    kevlar.filter.validate_and_print(readset, countgraph, mask)
    assert readset.valid == (3, 24)
    for record in readset:
        for ikmer in record.ikmers:
            assert ikmer.sequence != kmer
            assert kevlar.revcom(ikmer.sequence) != kmer
def main():
    p = argparse.ArgumentParser()
    p.add_argument('A')
    p.add_argument('B')
    p.add_argument('C')
    p.add_argument('output')
    p.add_argument('-k', '--ksize', default=31, type=int)
    args = p.parse_args()

    # for potato, would suggest changing 1e6 to 4e9
    a_table = khmer.Nodetable(args.ksize, 1e6, 4)   # 1e6 needs to be changed
    b_table = khmer.Nodetable(args.ksize, 1e6, 4)

    b_table.consume_seqfile(args.B)

    n_hashes_loaded = 0
    for record in khmer.utils.clean_input_reads(screed.open(args.A)):
        for hash in a_table.get_kmer_hashes(record.cleaned_seq):
            if not b_table.get(hash):
                a_table.add(hash)
                n_hashes_loaded += 1

    print('loaded {} k-mers in A but not in B'.format(n_hashes_loaded))

    # at this point, a_table contains A - B

    out_fp = open(args.output, 'w')
    n_read = 0
    n_written = 0
    for record in khmer.utils.clean_input_reads(screed.open(args.C)):
        n_read += 1
        for hash in a_table.get_kmer_hashes(record.cleaned_seq):
            if a_table.get(hash):
                khmer.utils.write_record(record, out_fp)
                n_written += 1
                break

    print('wrote {} of {} records to {}'.format(n_written, n_read,
                                                 args.output))
def test_consume_with_mask_complement():
    mask = khmer.Nodetable(13, 1e3, 4)
    mask.consume('TGCTTGAAACAAGTG')

    infile = utils.get_test_data('seq-b.fa')
    ct = khmer.Counttable(13, 1e3, 4)
    nr, nk = ct.consume_seqfile_with_mask(infile, mask, threshold=1,
                                          consume_masked=True)

    assert ct.get_kmer_counts('TGCTTGAAACAAGTG') == [1, 1, 1]
    assert ct.get_kmer_counts('GAAACAAGTGGATTT') == [0, 0, 0]
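# A hedged companion sketch (not part of the original test suite): without
# consume_masked=True, consume_seqfile_with_mask() is assumed to count only
# k-mers *absent* from the mask, i.e. the complement of the behaviour tested
# above. The sequence and file name below are illustrative placeholders.
import khmer

mask = khmer.Nodetable(13, 1e3, 4)
mask.consume('TGCTTGAAACAAGTG')         # k-mers to be excluded from counting

ct = khmer.Counttable(13, 1e3, 4)
# default masking: k-mers present in the mask (count >= threshold) are skipped
nreads, nkmers = ct.consume_seqfile_with_mask('seq-b.fa', mask, threshold=1)

# k-mers covered by the mask should therefore report zero counts
print(ct.get_kmer_counts('TGCTTGAAACAAGTG'))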
def main():
    p = argparse.ArgumentParser()
    p.add_argument('readfilelist')
    args = p.parse_args()

    filelist = open(args.readfilelist).readlines()
    filelist = [x.strip() for x in filelist]

    inputfile = filelist.pop()
    while not os.path.exists(inputfile):
        inputfile = filelist.pop()

    print('starting with', inputfile)

    remove_queue = [None, None]

    for pos, filename in enumerate(filelist):
        if not os.path.exists(filename):
            print('skipping', filename)
            continue

        print('loading kh:', filename)
        kh = khmer.Nodetable(K, 2e8, 4)
        kh.consume_seqfile(filename)

        print('iterating over reads:', inputfile)
        outputfile = BASE + '.{}'.format(pos)
        fp = open(outputfile, 'w')
        n = 0
        m = 0
        for n, record in enumerate(clean_input_reads(screed.open(inputfile))):
            if len(record.sequence) < K:
                continue
            if kh.median_at_least(record.cleaned_seq, 1):
                khmer.utils.write_record(record, fp)
                m += 1
        fp.close()
        print('read {}, wrote {}'.format(n, m))

        if inputfile.startswith(BASE):
            remove_queue.append(inputfile)
            remove_name = remove_queue.pop(0)
            if remove_name:
                print('removing', remove_name)
                os.unlink(remove_name)

        inputfile = outputfile

    print('final file is:', inputfile)
def test_validate_with_mask():
    kmer = 'AGGGGCGTGACTTAATAAG'
    mask = khmer.Nodetable(19, 1e3, 2)
    mask.add(kmer)

    filelist = kevlar.tests.data_glob('collect.beta.?.txt')
    readset = ReadSet(19, 5e3)
    for record in kevlar.seqio.afxstream(filelist):
        readset.add(record)
    readset.validate(mask=mask)
    assert readset.valid == (3, 24)
    for record in readset:
        for ikmer in record.ikmers:
            assert ikmer.sequence != kmer
            assert kevlar.revcom(ikmer.sequence) != kmer
def main(): parser = argparse.ArgumentParser() parser.add_argument('extract_from') parser.add_argument('queries', nargs='+') parser.add_argument('-o', '--output') parser.add_argument('-k', '--ksize', default=31, type=int) args = parser.parse_args() assert args.output, "must provide an argument to -o" kh = khmer.Nodetable(args.ksize, 1, 1) query_kmers = set() for queryfile in args.queries: print('loading query', queryfile, file=sys.stderr) for record in screed.open(queryfile): query_kmers.update(kh.get_kmer_hashes(record.sequence)) print('loaded {} k-mers'.format(len(query_kmers)), file=sys.stderr) bp = 0 threshold = REPORTING_BP m = 0 n = 0 bp_out = 0 with open(args.output, 'wt') as fp: print('searching sequences in {}'.format(args.extract_from)) for record in screed.open(args.extract_from): n += 1 bp += len(record.sequence) if bp > threshold: threshold += REPORTING_BP print('... read {} bp in {} sequences'.format(bp, n), file=sys.stderr) print('==> found {} bp in {} sequences'.format(bp_out, m), file=sys.stderr) x = set(kh.get_kmer_hashes(record.sequence)) if x.intersection(query_kmers): fp.write('>{}\n{}\n'.format(record.name, record.sequence)) bp_out += len(record.sequence) m += 1 print('... read {} bp in {} sequences'.format(bp, n), file=sys.stderr) print('==> found {} bp in {} sequences'.format(bp_out, m), file=sys.stderr) return 0
def load_mask(maskfiles, ksize, memory, maxfpr=0.001, savefile=None, logstream=sys.stderr): """Load reference genome and/or contaminant database from a file.""" if maskfiles is None: return None timer = kevlar.Timer() timer.start('loadmask') print('[kevlar::filter] Loading mask from', maskfiles, file=logstream) if len(maskfiles) == 1 and maskfiles[0].endswith(('.nt', '.nodetable')): mask = kevlar.sketch.load(maskfiles[0]) message = ' nodetable loaded' else: buckets = memory * khmer._buckets_per_byte['nodegraph'] / 4 mask = khmer.Nodetable(ksize, buckets, 4) nr, nk = 0, 0 for maskfile in maskfiles: numreads, numkmers = mask.consume_seqfile(maskfile) nr += numreads nk += numkmers message = ' {:d} sequences and {:d} k-mers consumed'.format(nr, nk) fpr = kevlar.sketch.estimate_fpr(mask) message += '; estimated false positive rate is {:1.3f}'.format(fpr) print(message, file=logstream) if fpr > maxfpr: raise KevlarUnsuitableFPRError('FPR too high, bailing out!!!') if savefile: mask.save(savefile) message = ' nodetable saved to "{:s}"'.format(savefile) print(message, file=logstream) elapsed = timer.stop('loadmask') print('[kevlar::filter]', 'Mask loaded in {:.2f} sec'.format(elapsed), file=logstream) return mask
def __init__(self, query_file, ksize):
    self.filename = query_file
    self.ksize = ksize
    self.kmers = set()
    self.name = None

    print('----')
    print('QUERY FILE:', self.filename)

    # build hashes for all the query k-mers
    print('loading query kmers...', end=' ')
    bf = khmer.Nodetable(ksize, 1, 1)

    for record in screed.open(self.filename):
        if self.name is None:
            self.name = record.name
        self.kmers.update(bf.get_kmer_hashes(record.sequence))

    print('got {}'.format(len(self.kmers)))

    self.cdbg_match_counts = {}
    self.catlas_match_counts = {}
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('transcriptomes', nargs='+')
    parser.add_argument('-k', '--ksize', type=int, default=31)
    parser.add_argument('-o', '--output')
    args = parser.parse_args()

    assert args.output

    kh = khmer.Nodetable(args.ksize, 1, 1)

    family_ids = {}
    family_counter = 0
    kmer_to_family_id = defaultdict(set)

    n = 0
    for tr_filename in args.transcriptomes:
        for record in screed.open(tr_filename):
            n += 1
            if n % 1000 == 0:
                print('...', n)

            family_name = record.name.split('|')[1]

            family_id = family_ids.get(family_name)
            if family_id is None:
                family_id = family_counter
                family_counter += 1
                family_ids[family_name] = family_id

            hashvals = kh.get_kmer_hashes(record.sequence)
            for hashval in hashvals:
                kmer_to_family_id[hashval].add(family_id)

    with open(args.output, 'wb') as fp:
        pickle.dump((family_ids, kmer_to_family_id), fp)
def main(argv): p = argparse.ArgumentParser() p.add_argument('catlas_prefix') p.add_argument('-k', '--ksize', default=31, type=int) a = p.parse_args(argv) kh = khmer.Nodetable(a.ksize, 1, 1) contigs_filename = os.path.join(a.catlas_prefix, 'contigs.fa.gz') mphf_filename = os.path.join(a.catlas_prefix, 'contigs.fa.gz.mphf') array_filename = os.path.join(a.catlas_prefix, 'contigs.fa.gz.indices') # build a list of all k-mers in the cDBG all_kmers = list() print('reading cDBG nodes from {}'.format(contigs_filename)) for n, record in enumerate(screed.open(contigs_filename)): if n % 50000 == 0 and n: print('... contig', n, end='\r') kmers = kh.get_kmer_hashes(record.sequence) all_kmers.extend(list(kmers)) n_contigs = n + 1 print('loaded {} contigs.\n'.format(n_contigs)) # build MPHF (this is the CPU intensive bit) print('building MPHF for {} k-mers in {} nodes.'.format( len(all_kmers), n_contigs)) x = bbhash.PyMPHF(all_kmers, len(all_kmers), 4, 1.0) # build tables linking: # * mphf hash to k-mer hash (for checking exactness) # * mphf hash to cDBG ID # * cDBG ID to node size (in k-mers) mphf_to_kmer = numpy.zeros(len(all_kmers), numpy.uint64) mphf_to_cdbg = numpy.zeros(len(all_kmers), numpy.uint32) sizes = numpy.zeros(n_contigs, numpy.uint32) print('second pass; reading cDBG nodes from {}'.format(contigs_filename)) for n, record in enumerate(screed.open(contigs_filename)): if n % 50000 == 0 and n: print('... contig {} of {}'.format(n, n_contigs), end='\r') # node ID is record name, must go from 0 to total-1 cdbg_id = int(record.name) # get 64-bit numbers for each k-mer (doesn't really matter what hash) kmers = kh.get_kmer_hashes(record.sequence) # for each k-mer, find its MPHF hashval, & link to info. for kmer in kmers: mphf = x.lookup(kmer) mphf_to_kmer[mphf] = kmer mphf_to_cdbg[mphf] = cdbg_id # record each node size, while we're here. sizes[cdbg_id] = len(kmers) print('loaded {} contigs in pass2.\n'.format(n_contigs)) assert n == max(mphf_to_cdbg), (n, max(mphf_to_cdbg)) print('done! saving to {} and {}'.format(mphf_filename, array_filename)) x.save(mphf_filename) with open(array_filename, 'wb') as fp: numpy.savez_compressed(fp, mphf_to_kmer=mphf_to_kmer, kmer_to_cdbg=mphf_to_cdbg, sizes=sizes)
def main(args=sys.argv[1:]): p = argparse.ArgumentParser() p.add_argument('catlas_prefix', help='catlas prefix') p.add_argument('query') p.add_argument('output') p.add_argument('--minsize', type=float, default=100) p.add_argument('--maxsize', type=float, default=10000) p.add_argument('-k', '--ksize', default=31, type=int, help='k-mer size (default: 31)') args = p.parse_args(args) print('minsize: {:g}'.format(args.minsize)) print('maxsize: {:g}'.format(args.maxsize)) basename = os.path.basename(args.catlas_prefix) catlas = os.path.join(args.catlas_prefix, 'catlas.csv') domfile = os.path.join(args.catlas_prefix, 'first_doms.txt') # load catlas DAG top_node_id, dag, dag_up, dag_levels, cdbg_to_catlas = search_utils.load_dag(catlas) print('loaded {} nodes from catlas {}'.format(len(dag), catlas)) # load mapping between dom nodes and cDBG/graph nodes: layer1_to_cdbg = search_utils.load_layer1_to_cdbg(cdbg_to_catlas, domfile) print('loaded {} layer 1 catlas nodes'.format(len(layer1_to_cdbg))) # calculate the cDBG shadow sizes for each catlas node. print('decorating catlas with shadow size info.') node_shadow_sizes = search_utils.decorate_catlas_with_shadow_sizes(layer1_to_cdbg, dag, dag_levels) # ...and load cdbg node sizes print('loading contig size info') cdbg_kmer_sizes, cdbg_weighted_kmer_sizes = search_utils.load_cdbg_size_info(args.catlas_prefix) # decorate catlas with cdbg node sizes underneath them print('decorating catlas with contig size info.') node_kmer_sizes, node_weighted_kmer_sizes = search_utils.decorate_catlas_with_kmer_sizes(layer1_to_cdbg, dag, dag_levels, cdbg_kmer_sizes, cdbg_weighted_kmer_sizes) # load k-mer index, query, etc. etc. kmer_idx = search_utils.load_kmer_index(args.catlas_prefix) bf = khmer.Nodetable(args.ksize, 1, 1) query_kmers = set() for record in screed.open(args.query): query_kmers.update(bf.get_kmer_hashes(record.sequence)) print('got {}'.format(len(query_kmers))) # construct dict cdbg_id -> # of query k-mers cdbg_match_counts = kmer_idx.get_match_counts(query_kmers) total_match_kmers = sum(cdbg_match_counts.values()) f_found = total_match_kmers / len(query_kmers) print('=> containment: {:.1f}%'.format(f_found * 100)) print('done loading & counting query k-mers in cDBG.') total_kmers_in_cdbg_matches = 0 for cdbg_id in set(cdbg_match_counts.keys()): total_kmers_in_cdbg_matches += kmer_idx.get_cdbg_size(cdbg_id) cdbg_sim = total_match_kmers / total_kmers_in_cdbg_matches print('cdbg match node similarity: {:.1f}%'.format(cdbg_sim * 100)) cdbg_min_overhead = (total_kmers_in_cdbg_matches - total_match_kmers) / total_match_kmers print('min cdbg overhead: {}'.format(cdbg_min_overhead)) # calculate the cDBG matching k-mers sizes for each catlas node. catlas_match_counts = kmer_idx.build_catlas_match_counts(cdbg_match_counts, dag, dag_levels, layer1_to_cdbg) ### ok, the real work: look at articulation of cDBG graph. 
# find highest nodes with kmer size less than given max_size def find_terminal_nodes(node_id, max_size): node_list = set() for sub_id in dag[node_id]: size = node_kmer_sizes[sub_id] if size < max_size: node_list.add(sub_id) else: children = find_terminal_nodes(sub_id, max_size) node_list.update(children) return node_list print('finding terminal nodes for {}.'.format(args.maxsize)) terminal = find_terminal_nodes(top_node_id, args.maxsize) print('...got {}'.format(len(terminal))) terminal = { n for n in terminal if node_kmer_sizes[n] > args.minsize } print('...down to {} between {} and {} in size.'.format(len(terminal), args.minsize, args.maxsize)) # now, go through all nodes and print out characteristics with open(args.output, 'wt') as fp: w = csv.writer(fp) w.writerow(['node_id', 'contained', 'n_kmers', 'n_weighted_kmers', 'shadow_size']) for n in terminal: f_contained = catlas_match_counts.get(n, 0) / node_kmer_sizes[n] w.writerow([str(n), str(f_contained), str(node_kmer_sizes[n]), str(node_weighted_kmer_sizes[n]), str(node_shadow_sizes[n])])
def main(): parser = argparse.ArgumentParser() parser.add_argument('database') parser.add_argument('input_filenames', metavar='input_sequence_filename', help='Input FAST[AQ] sequence filename.', nargs='+') parser.add_argument('-k', '--ksize', type=int, default=31) parser.add_argument('-p', '--paired', action='store_true', help='require that all sequences be properly paired') parser.add_argument('--force_single', dest='force_single', action='store_true', help='treat all sequences as single-ended/unpaired') parser.add_argument('-u', '--unpaired-reads', metavar="unpaired_reads_filename", help='include a file of unpaired reads to which ' '-p/--paired does not apply.') parser.add_argument('-f', '--force', dest='force', help='continue past file reading errors', action='store_true') args = parser.parse_args() force_single = args.force_single #if args.reads == '-': # args.reads = sys.stdin # check that input files exist check_valid_file_exists(args.input_filenames) filenames = [] for pathfilename in args.input_filenames: filenames.append(pathfilename) # make a list of all filenames and if they're paired or not; # if we don't know if they're paired, default to allowing but not # forcing pairing. files = [] for element in filenames: files.append([element, args.paired]) if args.unpaired_reads: files.append([args.unpaired_reads, False]) # create object of Nodetable in Khmer to use its kh = khmer.Nodetable(args.ksize, 1, 1) # load database mphf_filename = args.database + '.mphf' array_filename = args.database + '.arr' print('loading database {}'.format(args.database)) with open(array_filename, 'rb') as fp: mphf_to_kmer, mphf_to_cdbg, family_ids, cdbg_to_family_id = pickle.load( fp) mphf = bbhash.load_mphf(mphf_filename) print('done!') def get_kmer_to_family_ids(hashval): mphf_hash = mphf.lookup(hashval) if mphf_hash is None: return set() kmer_hash = mphf_to_kmer[mphf_hash] if kmer_hash != hashval: return set() cdbg_id = mphf_to_cdbg[mphf_hash] id_list = cdbg_to_family_id[cdbg_id] return id_list def readFusion(read): global n_unmatched, n_same, n_amb_same, n_clear_fusion, n_ambig_fusion, n_mutli_fusion flag = None lf_ids = set() rt_ids = set() families = [] shared_kmers = [] gaps = [] hashvals = kh.get_kmer_hashes(read.sequence) # find a matching k-mer at the beginning of the read lf = hashvals[0] lf_ids = get_kmer_to_family_ids(lf) idx = 1 while idx < len(hashvals) and len(lf_ids) == 0: lf = hashvals[idx] lf_ids = get_kmer_to_family_ids(lf) idx += 1 if len(lf_ids) == 0: #print('no single match') n_unmatched += 1 flag = "unmatched" elif idx == len(hashvals): #print('same, only last kmer matched') families.append(lf_ids) if len(lf_ids) == 1: n_same += 1 flag = "unique" else: n_amb_same += 1 flag = "ambiguous" else: # len(lf_ids) > 0 & idx < len(hashvals) # find a matching k-mer at the end of the read rt = hashvals[-1] rt_ids = get_kmer_to_family_ids(rt) idy = len(hashvals) - 2 while idy >= idx and len(rt_ids) == 0: rt = hashvals[idy] rt_ids = get_kmer_to_family_ids(rt) idy -= 1 if len(rt_ids) == 0: #print('same, only one non-last kmer matched ') families.append(lf_ids) if len(lf_ids) == 1: n_same += 1 flag = "unique" else: n_amb_same += 1 flag = "ambiguous" else: intersect_ids = lf_ids.intersection(rt_ids) if len(intersect_ids) > 0: families.append(intersect_ids) if len(intersect_ids) == 1: n_same += 1 flag = "unique" else: n_amb_same += 1 flag = "ambiguous" else: # fusion to be resolved shared_kmer = 1 gap_size = 0 gap = False while idx <= idy + 1: temp = hashvals[idx] temp_ids = 
get_kmer_to_family_ids(temp) if len(temp_ids) > 0: intersect_ids = lf_ids.intersection(temp_ids) if len(intersect_ids) > 0: lf_ids = intersect_ids shared_kmer += 1 gap_size = 0 else: # len(intersect_ids) == 0 families.append(lf_ids) shared_kmers.append(shared_kmer) lf_ids = temp_ids shared_kmer = 1 gaps.append(gap_size) gap_size = 0 else: gap_size += 1 idx += 1 families.append(lf_ids) shared_kmers.append(shared_kmer) assert len(families) > 1 if len(families) == 2: if len(families[0]) == 1 and len(families[1]) == 1: n_clear_fusion += 1 flag = "clear_fusion" else: n_ambig_fusion += 1 flag = "ambig_fusion" else: # len(families) > 2 n_mutli_fusion += 1 flag = "multi_fusion" #if len(families) == 0: # families = "-" #if len(shared_kmers) == 0: # shared_kmers = "-" return flag, families, shared_kmers, gaps fusion_filename = args.database + '_fusion.fa' fusion_fp = open(fusion_filename, 'w') fusionInfo_filename = args.database + '_fusion.info' fusionInfo_fp = open(fusionInfo_filename, 'w') print("fileName", "recordIndex", "whichInPair", "align_class", "gene_families", "shared_kmers", "gaps", file=fusionInfo_fp, sep='\t') fusionCalc_filename = args.database + '_fusion.calc' fusionCalc_fp = open(fusionCalc_filename, 'w') print("fileName", "recordIndex", "whichInPair", "align_class", "familiy_A", "familiy_B", "no_families", "len_families", "shared_kmers", "gaps", "sorted_keys", file=fusionCalc_fp, sep='\t') fusionPairs_filename = args.database + '_fusionPairs.fa' fusPair_fp = open(fusionPairs_filename, 'w') fusionPairsInfo_filename = args.database + '_fusionPairs.info' fusPairInfo_fp = open(fusionPairsInfo_filename, 'w') print("fileName", "recordIndex", "fusion_class", "R1_family", "R2_family", file=fusPairInfo_fp, sep='\t') fusionPairsCalc_filename = args.database + '_fusionPairs.calc' fusPairCalc_fp = open(fusionPairsCalc_filename, 'w') print("fileName", "recordIndex", "fusion_class", "familiy_A", "familiy_B", "len_families", "sorted_keys", file=fusPairCalc_fp, sep='\t') corrupt_files = [] family_names = dict(zip(family_ids.values(), family_ids.keys())) n = 0 n_paired_fusion = 0 sameRef = ("unique", "ambiguous") fusion = ("clear_fusion", "ambig_fusion", "multi_fusion") for filename, require_paired in files: with catch_io_errors(filename, fusion_fp, fusionInfo_fp, fusionCalc_fp, fusPair_fp, fusPairInfo_fp, fusPairCalc_fp, args.force, corrupt_files): screed_iter = clean_input_reads(screed.open(filename)) reader = broken_paired_reader(screed_iter, min_length=args.ksize, force_single=force_single, require_paired=require_paired) for r_index, is_paired, read0, read1 in reader: n += 1 if n % 10000 == 0: print('...', n) #if n > 5000: # break flag0, families0, shared_kmers0, gaps0 = readFusion(read0) if not is_paired and flag0 in fusion: #families_names0 = [] #for gp in families0: # gp_names = [] # for family_id in gp: # family_name = family_names[family_id] # gp_names.append(family_name) # families_names0.append(gp_names) print(filename, r_index, "single", flag0, families0, shared_kmers0, gaps0, file=fusionInfo_fp, sep='\t') write_record(read0, fusion_fp) #i = 1 #while i < len(families0): # for g1 in families0[i-1]: # for g2 in families0[i]: # print(filename, r_index, "single", flag0, sorted([g1,g2]), len(families0), len(families0[i-1]), len(families0[i]), # shared_kmers0, gaps0, file=fusionCalc_fp, sep='\t') # i += 1 i = len(families0) - 1 for g1 in families0[0]: g1_name = family_names[g1] for g2 in families0[i]: g2_name = family_names[g2] print(filename, r_index, "single", flag0, '{}:{}'.format(g1, 
g1_name), '{}:{}'.format(g2, g2_name), len(families0), [len(f) for f in families0], shared_kmers0, gaps0, sorted([g1, g2]), file=fusionCalc_fp, sep='\t') if is_paired: flag1, families1, shared_kmers1, gaps1 = readFusion(read1) if flag0 in fusion or flag1 in fusion: print(filename, r_index, "Read_1", flag0, families0, shared_kmers0, gaps0, file=fusionInfo_fp, sep='\t') write_record(read0, fusion_fp) print(filename, r_index, "Read_2", flag1, families1, shared_kmers1, gaps1, file=fusionInfo_fp, sep='\t') write_record(read1, fusion_fp) if flag0 in fusion: i = len(families0) - 1 for g1 in families0[0]: g1_name = family_names[g1] for g2 in families0[i]: g2_name = family_names[g2] print(filename, r_index, "Read_1", flag0, '{}:{}'.format(g1, g1_name), '{}:{}'.format(g2, g2_name), len(families0), [len(f) for f in families0], shared_kmers0, gaps0, sorted([g1, g2]), file=fusionCalc_fp, sep='\t') if flag1 in fusion: i = len(families1) - 1 for g1 in families1[0]: g1_name = family_names[g1] for g2 in families1[i]: g2_name = family_names[g2] print(filename, r_index, "Read_2", flag1, '{}:{}'.format(g1, g1_name), '{}:{}'.format(g2, g2_name), len(families1), [len(f) for f in families1], shared_kmers1, gaps1, sorted([g1, g2]), file=fusionCalc_fp, sep='\t') elif flag0 in sameRef and flag1 in sameRef: if len(families0[0].intersection(families1[0])) == 0: n_paired_fusion += 1 if flag0 == "unique" and flag1 == "unique": fusion_class = "clear_fusion" else: fusion_class = "ambig_fusion" print(filename, r_index, fusion_class, families0, families1, file=fusPairInfo_fp, sep='\t') write_record(read0, fusPair_fp) write_record(read1, fusPair_fp) for g1 in families0[0]: g1_name = family_names[g1] for g2 in families1[0]: g2_name = family_names[g2] print(filename, r_index, fusion_class, '{}:{}'.format(g1, g1_name), '{}:{}'.format(g2, g2_name), [ len(f) for f in (families0[0], families1[0]) ], sorted([g1, g2]), file=fusPairCalc_fp, sep='\t') print('No of input fragments: ', n) print('unmatched:', n_unmatched) print('Unique:', n_same) print('Ambiguous:', n_amb_same) print('Single read clear fusion:', n_clear_fusion) print('Single read ambiguous fusion:', n_ambig_fusion) print('Single read multi fusion:', n_mutli_fusion) print('paired read fusion:', n_paired_fusion)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('database')
    parser.add_argument('reads')
    parser.add_argument('-k', '--ksize', type=int, default=31)
    args = parser.parse_args()

    if args.reads == '-':
        args.reads = sys.stdin

    kh = khmer.Nodetable(args.ksize, 1, 1)

    mphf_filename = args.database + '.mphf'
    array_filename = args.database + '.arr'
    print('loading database {}'.format(args.database))
    with open(array_filename, 'rb') as fp:
        mphf_to_kmer, mphf_to_cdbg, family_ids, cdbg_to_family_id = \
            pickle.load(fp)
    mphf = bbhash.load_mphf(mphf_filename)
    print('done!')

    def get_kmer_to_family_ids(hashval):
        mphf_hash = mphf.lookup(hashval)
        if mphf_hash is None:
            return set()
        kmer_hash = mphf_to_kmer[mphf_hash]
        if kmer_hash != hashval:
            return set()
        cdbg_id = mphf_to_cdbg[mphf_hash]
        id_list = cdbg_to_family_id[cdbg_id]
        return id_list

    n_same = 0
    n_different = 0

    n = 0
    for record in screed.open(args.reads):
        n += 1
        if n % 1000 == 0:
            print('...', n)
        if n > 5000:
            break

        hashvals = kh.get_kmer_hashes(record.sequence)
        if len(hashvals) <= 1:
            continue

        first = hashvals[0]
        last = hashvals[-1]

        # find the first unambiguously assigned k-mer
        first_ids = get_kmer_to_family_ids(first)
        idx = 1
        while idx < len(hashvals) / 2 and len(first_ids) != 1:
            first = hashvals[idx]
            first_ids = get_kmer_to_family_ids(first)
            idx += 1

        # find the last unambiguously assigned k-mer
        last_ids = get_kmer_to_family_ids(last)
        idx = len(hashvals) - 2
        while idx > len(hashvals) / 2 and len(last_ids) != 1:
            last = hashvals[idx]
            last_ids = get_kmer_to_family_ids(last)
            idx -= 1

        if len(first_ids) == 1 and len(last_ids) == 1 and \
           first_ids == last_ids:
            n_same += 1
        else:
            print('different {} {}'.format(first_ids, last_ids))
            n_different += 1

    print('same:', n_same)
    print('different:', n_different)
def main(argv=sys.argv[1:]): p = argparse.ArgumentParser() p.add_argument('catlas_prefix', help='catlas prefix') p.add_argument('queries', nargs='+') p.add_argument('-k', '--ksize', default=31, type=int, help='k-mer size (default: 31)') p.add_argument('-o', '--output', type=argparse.FileType('wt')) args = p.parse_args(argv) assert args.output, "must supply -o" contigs = os.path.join(args.catlas_prefix, 'contigs.fa.gz') x = search_utils.load_cdbg_size_info(args.catlas_prefix) cdbg_kmer_sizes, cdbg_weighted_kmer_sizes = x # load k-mer MPHF index kmer_idx = search_utils.load_kmer_index(args.catlas_prefix) # build hashes for all the query k-mers print('loading query kmers...') bf = khmer.Nodetable(args.ksize, 1, 1) print('queryfile,containment,mean_abundance', file=args.output) for query in args.queries: print('loading', query) query_kmers = set() for record in screed.open(query): query_kmers.update(bf.get_kmer_hashes(record.sequence)) # find the list of cDBG nodes that contain at least one query k-mer cdbg_match_counts = kmer_idx.get_match_counts(query_kmers) # calculate number of nodes found - cdbg_shadow = set(cdbg_match_counts.keys()) # calculate the sum total k-mers across all of the matching nodes cdbg_node_sizes = {} cdbg_total_weighted = 0. for cdbg_id in cdbg_shadow: cdbg_node_sizes[cdbg_id] = kmer_idx.get_cdbg_size(cdbg_id) cdbg_total_weighted += cdbg_weighted_kmer_sizes[cdbg_id] # output some stats total_found = sum(cdbg_match_counts.values()) f_found = total_found / len(query_kmers) print('...done loading & counting query k-mers in cDBG.') print('containment: {:.1f}%'.format(f_found * 100)) weight = cdbg_total_weighted / total_found print('weight:', weight) if f_found < 0.5: print('skipping output for {}; low containment.'.format(query)) continue print('{},{},{}'.format(query, f_found, weight), file=args.output) return 0
#!/usr/bin/env python
# A demonstration of using khmer to query a dataset for a k-mer. Typically
# khmer accrues a small false positive rate in order to save substantially on
# memory requirements.

import khmer

ksize = 21
target_table_size = 5e8
num_tables = 4

bloomfilter = khmer.Nodetable(ksize, target_table_size, num_tables)
bloomfilter.consume('GCTGCACCGATGTACGCAAAGCTATTTAAAACCATAACTATTCTCACTTA')

print('count for "GCTGCACCGATGTACGCAAAG" is',
      bloomfilter.get('GCTGCACCGATGTACGCAAAG'))

bloomfilter.count('GCTGCACCGATGTACGCAAAG')

print('count for "GCTGCACCGATGTACGCAAAG" is',
      bloomfilter.get('GCTGCACCGATGTACGCAAAG'))

print('count for "GATTACAGATTACAGATTACA" is',
      bloomfilter.get('GATTACAGATTACAGATTACA'))
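# A hedged follow-on sketch (not part of the original demo): because a
# Nodetable is a Bloom-filter-style presence table, it can report false
# positives when it is undersized for the number of distinct k-mers loaded.
# khmer.calc_expected_collisions() (also used by the kevlar snippet at the end
# of this collection) estimates that rate; the 0.001 cutoff below is an
# illustrative choice, not a khmer default.
import khmer

table = khmer.Nodetable(21, 5e8, 4)
table.consume('GCTGCACCGATGTACGCAAAGCTATTTAAAACCATAACTATTCTCACTTA')

# max_false_pos=1.0 disables the library's own abort threshold so we can
# inspect the estimate ourselves
fpr = khmer.calc_expected_collisions(table, max_false_pos=1.0)
print('estimated false positive rate: {:.4f}'.format(fpr))
if fpr > 0.001:
    print('warning: table is too small for this many k-mers')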
def main(): p = argparse.ArgumentParser() p.add_argument('catlas_prefix', help='catlas prefix') p.add_argument('query') p.add_argument('-k', '--ksize', default=31, type=int, help='k-mer size (default: 31)') p.add_argument('-o', '--output', type=argparse.FileType('wt')) p.add_argument('-v', '--verbose', action='store_true') args = p.parse_args() contigs = os.path.join(args.catlas_prefix, 'contigs.fa.gz') # load k-mer MPHF index kmer_idx = search_utils.load_kmer_index(args.catlas_prefix) # build hashes for all the query k-mers print('loading query kmers...') bf = khmer.Nodetable(args.ksize, 1, 1) x = set() n = 0 query_kmers = set() for record in screed.open(args.query): query_kmers.update(bf.get_kmer_hashes(record.sequence)) # find the list of cDBG nodes that contain at least one query k-mer cdbg_match_counts = kmer_idx.get_match_counts(query_kmers) # calculate number of nodes found - cdbg_shadow = set(cdbg_match_counts.keys()) # calculate the sum total k-mers across all of the matching nodes cdbg_node_sizes = {} for cdbg_id in cdbg_shadow: cdbg_node_sizes[cdbg_id] = kmer_idx.get_cdbg_size(cdbg_id) # output some stats total_found = sum(cdbg_match_counts.values()) f_found = total_found / len(query_kmers) print('...done loading & counting query k-mers in cDBG.') print('containment: {:.1f}%'.format(f_found * 100)) total_kmers_in_cdbg_nodes = sum(cdbg_node_sizes.values()) sim = total_found / total_kmers_in_cdbg_nodes print('similarity: {:.1f}%'.format(sim * 100)) if not args.output: sys.exit(0) # if output requested, extract unitigs. outfp = args.output outname = args.output.name total_bp = 0 total_seqs = 0 print('extracting contigs to {}.'.format(outname)) for n, record in enumerate(screed.open(contigs)): if n % 10000 == 0: offset_f = total_seqs / len(cdbg_shadow) print('...at n {} ({:.1f}% of shadow)'.format( total_seqs, offset_f * 100), end='\r') contig_id = int(record.name) if contig_id not in cdbg_shadow: continue outfp.write('>{}\n{}\n'.format(record.name, record.sequence)) total_bp += len(record.sequence) total_seqs += 1 print('') print('fetched {} contigs, {} bp matching node list.'.format( total_seqs, total_bp)) sys.exit(0)
def main(argv): p = argparse.ArgumentParser(description=__doc__) p.add_argument('catlas_prefix') p.add_argument('query') p.add_argument('cdbg_nodefile') p.add_argument('-o', '--output', type=argparse.FileType('wt')) p.add_argument('-k', '--ksize', default=31, type=int, help='k-mer size (default: 31)') p.add_argument('-v', '--verbose', action='store_true') args = p.parse_args(argv) contigs = os.path.join(args.catlas_prefix, 'contigs.fa.gz') assert args.output, 'must specify -o' outfp = args.output outname = args.output.name print('loading bf...', end=' ') bf = khmer.Nodetable(args.ksize, 3e8, 2) bf.consume_seqfile(args.query) print('done.') print('loading catlas...', end=' ') catlas = CAtlas(args.catlas_prefix) layer1_to_cdbg = catlas.layer1_to_cdbg print('done.') print('loading nodefile {}'.format(args.cdbg_nodefile)) cdbg_nodes = set() with gzip.open(args.cdbg_nodefile, 'r') as fp: for line in fp: cdbg_nodes.add(int(line.strip())) print('loading contigs') total_bp = 0 total_seqs = 0 n_homogeneous = 0 n_missing = 0 bp_missing = 0 for n, record in enumerate(screed.open(contigs)): if n % 10000 == 0: offset_f = total_seqs / len(cdbg_nodes) print('...at n {} ({:.1f}% of shadow)'.format( total_seqs, offset_f * 100), end='\r') contig_id = int(record.name) if contig_id not in cdbg_nodes: continue counts = bf.get_kmer_counts(record.sequence) if min(counts) == max(counts): n_homogeneous += 1 if max(counts) == 0: n_missing += 1 bp_missing += len(record.sequence) outfp.write('{}\n'.format(len(record.sequence))) total_bp += len(record.sequence) total_seqs += 1 print('') print('fetched {} contigs, {} bp matching node list.'.format( total_seqs, total_bp)) print('n_homogeneous: {}'.format(n_homogeneous)) print('pure overhead count: {} seqs / {} bp'.format(n_missing, bp_missing)) return 0
def main(): parser = argparse.ArgumentParser() parser.add_argument('unitigs') parser.add_argument('transcriptomes', nargs='+') parser.add_argument('-k', '--ksize', type=int, default=31) parser.add_argument('-o', '--output') args = parser.parse_args() assert args.output kh = khmer.Nodetable(args.ksize, 1, 1) all_kmers = [] for n, record in enumerate(screed.open(args.unitigs)): if n % 10000 == 0: print('... cdbg', n) if n > 20000 and 0: break all_kmers.extend(kh.get_kmer_hashes(record.sequence)) print('building MPHF for {} k-mers in {} nodes.'.format(len(all_kmers), n)) x = bbhash.PyMPHF(all_kmers, len(all_kmers), 4, 1.0) ### mphf_to_kmer = numpy.zeros(len(all_kmers), numpy.uint64) mphf_to_cdbg = numpy.zeros(len(all_kmers), numpy.uint32) for n, record in enumerate(screed.open(args.unitigs)): if n % 10000 == 0: print('... cdbg', n) if n > 20000 and 0: break cdbg_id = int(record.name.split(' ')[0]) kmers = kh.get_kmer_hashes(record.sequence) for kmer in kmers: mphf = x.lookup(kmer) mphf_to_kmer[mphf] = kmer mphf_to_cdbg[mphf] = cdbg_id ### print('walking the transcriptome') family_ids = {} family_counter = 0 cdbg_to_family_id = defaultdict(set) n = 0 for tr_filename in args.transcriptomes: for record in screed.open(tr_filename): n += 1 if n % 1000 == 0: print('...', tr_filename, n) if n > 5000 and 0: break # get the family name family_name = record.name.split('|')[1] # convert to family ID, generating a new one if we need one family_id = family_ids.get(family_name) if family_id is None: family_id = family_counter family_counter += 1 family_ids[family_name] = family_id # for all k-mers, hashvals = kh.get_kmer_hashes(record.sequence) for hashval in hashvals: # find cDBG ID mphf = x.lookup(hashval) if mphf is None: continue assert mphf is not None cdbg_id = mphf_to_cdbg[mphf] # link cDBG ID to family ID cdbg_to_family_id[cdbg_id].add(family_id) mphf_filename = args.output + '.mphf' array_filename = args.output + '.arr' x.save(mphf_filename) with open(array_filename, 'wb') as fp: pickle.dump( (mphf_to_kmer, mphf_to_cdbg, family_ids, cdbg_to_family_id), fp)
def main(argv=sys.argv[1:]): p = argparse.ArgumentParser() p.add_argument('--query', nargs='+', action='append') p.add_argument('--subtract', nargs='+', action='append') p.add_argument('-o', '--output-suffix') p.add_argument('--threshold', type=float, default=DEFAULT_THRESHOLD) p.add_argument('-k', '--ksize', type=int, default=31) args = p.parse_args(argv) if not args.query: print('error, must specify at least one query with --query') sys.exit(-1) if not args.subtract: print('error, must specify at least one subtract with --subtract') sys.exit(-1) args.query = [item for sublist in args.query for item in sublist] args.subtract = [item for sublist in args.subtract for item in sublist] # construct output filename as {query}.suffix output_suffix = args.output_suffix if not output_suffix: output_suffix = '.donut.fa' # load k-mers to subtract all_kmers = list() kh = khmer.Nodetable(args.ksize, 1, 1) for subtract_fn in args.subtract: print('loading:', subtract_fn) for record in screed.open(subtract_fn): all_kmers.extend(kh.get_kmer_hashes(record.sequence)) # now build a minimal perfect hash function for all those k-mers print('building bbhash table') table = BBHashTable(all_kmers, fill=1) del all_kmers # next, iterate over each input and do subtract for queryfile in args.query: output = os.path.basename(queryfile) + output_suffix print('subtracting from {} -> {}'.format(queryfile, output)) outfp = open(output, 'wt') n = 0 bp = 0 n_kept = 0 bp_kept = 0 for n, record in enumerate(screed.open(queryfile)): if n % 100000 == 0: print('...', queryfile, n, n_kept) bp += len(record.sequence) if len(record.sequence) < args.ksize: continue kmers = kh.get_kmer_hashes(record.sequence) present = 0 for k in kmers: if table[k]: present += 1 f = present / len(kmers) if f < args.threshold: # keep? outfp.write('>{}\n{}\n'.format(record.name, record.sequence)) n_kept += 1 bp_kept += len(record.sequence) print('kept {} ({:.1g} Mbp) of {} ({:.1g} Mbp)'.format( n_kept, bp_kept / 1e6, n, bp / 1e6)) return 0
def main(argv): """\ Query a catlas with a sequence (read, contig, or genome), and retrieve cDBG node IDs and MinHash signatures for the matching unitigs in the graph. """ p = argparse.ArgumentParser(description=main.__doc__) p.add_argument('catlas_prefix', help='catlas prefix') p.add_argument('output') p.add_argument('--query', help='query sequences', nargs='+') p.add_argument('-k', '--ksize', default=31, type=int, help='k-mer size (default: 31)') p.add_argument('--scaled', default=1000, type=float, help="scaled value for contigs minhash output") p.add_argument('-v', '--verbose', action='store_true') args = p.parse_args(argv) outfile = args.output if not args.query: print('must specify at least one query file using --query.') sys.exit(-1) # make sure all of the query sequences exist. for filename in args.query: if not os.path.exists(filename): error('query seq file {} does not exist.', filename) sys.exit(-1) # load catlas DAG catlas = CAtlas(args.catlas_prefix) notify('loaded {} nodes from catlas {}', len(catlas), args.catlas_prefix) notify('loaded {} layer 1 catlas nodes', len(catlas.layer1_to_cdbg)) # find the contigs filename contigs = os.path.join(args.catlas_prefix, 'contigs.fa.gz') # ...and kmer index. ki_start = time.time() kmer_idx = MPHF_KmerIndex.from_catlas_directory(args.catlas_prefix) notify('loaded {} k-mers in index ({:.1f}s)', len(kmer_idx.mphf_to_kmer), time.time() - ki_start) # calculate the k-mer sizes for each catlas node. catlas.decorate_with_index_sizes(kmer_idx) # get a single ksize & scaled ksize = int(args.ksize) scaled = int(args.scaled) records_to_cdbg = {} cdbg_to_records = defaultdict(set) for filename in args.query: print(f"Reading from '{filename}'") for record in screed.open(filename): bf = khmer.Nodetable(ksize, 1, 1) if len(record.sequence) < int(ksize): continue kmers = bf.get_kmer_hashes(record.sequence) cdbg_match_counts = kmer_idx.get_match_counts(kmers) print(f"got {len(cdbg_match_counts)} cdbg nodes for {record.name[:15]} ({len(kmers)} kmers)") dominators = set() for cdbg_node in cdbg_match_counts: dominators.add(catlas.cdbg_to_layer1[cdbg_node]) print(f"got {len(dominators)} dominators for {record.name[:15]}") shadow = catlas.shadow(dominators) print(f"got {len(shadow)} cdbg_nodes under {len(dominators)} dominators") records_to_cdbg[(filename, record.name)] = shadow for cdbg_node in shadow: cdbg_to_records[cdbg_node].add((filename,record.name)) with open(outfile, 'wb') as fp: print(f"saving pickled index to '{outfile}'") pickle.dump((args.catlas_prefix, records_to_cdbg, cdbg_to_records), fp) return 0
def main(): p = argparse.ArgumentParser() p.add_argument('readfilelist') args = p.parse_args() filelist = open(args.readfilelist).readlines() filelist = [x.strip() for x in filelist] inputfile = filelist.pop() while not os.path.exists(inputfile): inputfile = filelist.pop() print('starting with', inputfile) collected = 0 for pos, filename in enumerate(filelist): if not os.path.exists(filename): print('skipping', filename) continue print('loading kh:', filename) kh = khmer.Nodetable(K, 2e8, 4) kh.consume_seqfile(filename) print('iterating over reads:', inputfile) outputfile = BASE + '.{}'.format(pos) fp = open(outputfile, 'w') m = 0 for n, record in enumerate(clean_input_reads(screed.open(inputfile))): if len(record.sequence) < K: continue if kh.median_at_least(record.cleaned_seq, 1): khmer.utils.write_record(record, fp) m += 1 fp.close() print('read {}, wrote {}'.format(n, m)) inputfile = outputfile collected += 1 if collected > 5: break # second round: load results of first round into bloom filter, # use that to sweep reads out of all the files. kh = khmer.Nodetable(K, 2e7, 4) kh.consume_seqfile(inputfile) filelist = open(args.readfilelist).readlines() filelist = [x.strip() for x in filelist] total_read = 0 total_written = 0 for n, filename in enumerate(filelist): print('reading', n, filename) if not os.path.exists(filename): continue fp = open(os.path.basename(filename) + '.collected', 'w') m = 0 for n, record in enumerate(clean_input_reads(screed.open(filename))): if len(record.sequence) < K: continue if kh.median_at_least(record.cleaned_seq, 1): khmer.utils.write_record(record, fp) m += 1 fp.close() print('read {}, wrote {}'.format(n, m)) total_read += n total_written += m print('total so far:', total_read, total_written) print('Results are in *.collected.')
def main(args): # Input and output files outstream = kevlar.open(args.out, 'w') writer = kevlar.vcf.VCFWriter( outstream, source='kevlar::call', refr=args.refr, ) writer.write_header() # Contigs = query sequences contigstream = kevlar.parse_partitioned_reads( kevlar.parse_augmented_fastx(kevlar.open(args.queryseq, 'r'))) contigs_by_partition = load_contigs(contigstream) gdnastream = kevlar.parse_partitioned_reads( kevlar.reference.load_refr_cutouts(kevlar.open(args.targetseq, 'r'))) mask = None if args.gen_mask: message = 'generating mask of variant-spanning k-mers' kevlar.plog('[kevlar::call]', message) ntables = 4 buckets = args.mask_mem * _buckets_per_byte['nodegraph'] / ntables mask = khmer.Nodetable(args.ksize, buckets, ntables) progress_indicator = kevlar.ProgressIndicator( '[kevlar::call] processed contigs/gDNAs for {counter} partitions', interval=10, breaks=[100, 1000, 10000], ) for partid, gdnas in gdnastream: progress_indicator.update() if partid not in contigs_by_partition: continue contigs = contigs_by_partition[partid] caller = call( gdnas, contigs, partid, match=args.match, mismatch=args.mismatch, gapopen=args.open, gapextend=args.extend, ksize=args.ksize, refrfile=args.refr, debug=args.debug, mindist=5, homopolyfilt=not args.no_homopoly_filter, maxtargetlen=args.max_target_length, ) for varcall in caller: if args.gen_mask: window = varcall.attribute('ALTWINDOW') if window is not None and len(window) >= args.ksize: mask.consume(window) writer.write(varcall) if args.gen_mask: fpr = khmer.calc_expected_collisions(mask, max_false_pos=1.0) if fpr > args.mask_max_fpr: message = 'WARNING: mask FPR is {:.4f}'.format(fpr) message += '; exceeds user-specified limit' message += ' of {:.4f}'.format(args.mask_max_fpr) kevlar.plog('[kevlar::call]', message) mask.save(args.gen_mask)