        ref_file_names_to_loc[base_name] = line
fid.close()

# get the locations of the reference sequences to build
out_fid = open(out_file, 'w')
ref_loc_to_build = []
for name in names_passed_thresh:
    ref_loc_to_build.append(ref_file_names_to_loc[name])
    # if using the bz2 extension
    #ref_loc_to_build.append(ref_file_names_to_loc[os.path.splitext(name)[0]])  # since bbmap stuff changed the file extension

# This uses khmer to merge the contigs and put them in one fasta file
for loc in ref_loc_to_build:
    print(os.path.basename(loc))
    fid = khmer.ReadParser(loc)
    seq = ""
    i = 0
    for record in fid:
        if i == 0:
            header = record.name
        seq += "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN"
        seq += record.sequence
        i += 1
    print("there")
    record = Record(name=header, sequence=seq)
    write_record(record, out_fid)
    fid.close()
out_fid.close()

# This relies on using bbmap to do the contig merging, and then will use cat to concatenate them
def test_read_bundler_empty_file():
    infile = utils.get_test_data('empty-file')
    with pytest.raises(OSError):
        records = [r for r in khmer.ReadParser(infile)]
def main():
    args = sanitize_help(get_parser()).parse_args()
    configure_logging(args.quiet)

    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)
    if args.savegraph:
        tablesize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, tablesize, args.force)
    report_on_config(args)

    log_info('making countgraph')
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    log_info('consuming input, round 1 -- {datafile}', datafile=args.datafile)
    for _ in range(args.threads):
        cur_thread = threading.Thread(
            target=graph.consume_fasta_with_reads_parser,
            args=(rparser, )
        )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    log_info('Total number of unique k-mers: {nk}', nk=graph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    # the filtering loop
    log_info('filtering {datafile}', datafile=args.datafile)
    if args.outfile is None:
        outfile = os.path.basename(args.datafile) + '.abundfilt'
    else:
        outfile = args.outfile
    outfp = open(outfile, 'wb')
    outfp = get_file_writer(outfp, args.gzip, args.bzip)

    paired_iter = broken_paired_reader(ReadParser(args.datafile),
                                       min_length=graph.ksize(),
                                       force_single=True)

    for n, is_pair, read1, read2 in paired_iter:
        assert not is_pair
        assert read2 is None

        trimmed_record, _ = trim_record(graph, read1, args.cutoff,
                                        args.variable_coverage,
                                        args.normalize_to)
        if trimmed_record:
            print((trimmed_record,))
            write_record(trimmed_record, outfp)

    log_info('output in {outfile}', outfile=outfile)

    if args.savegraph:
        log_info('Saving k-mer countgraph filename {graph}',
                 graph=args.savegraph)
        graph.save(args.savegraph)
def main():
    info('filter-abund-single.py', ['counting', 'SeqAn'])
    args = get_parser().parse_args()
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize,
                                  args.force)
    report_on_config(args)

    print >> sys.stderr, 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables)

    # first, load reads into hash table
    rparser = khmer.ReadParser(args.datafile)
    threads = []

    print >> sys.stderr, 'consuming input, round 1 --', args.datafile
    for _ in xrange(args.threads):
        cur_thread = threading.Thread(
            target=htable.consume_fasta_with_reads_parser,
            args=(rparser, )
        )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable, args.force)
    print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    print >> sys.stderr, 'filtering', args.datafile
    outfile = os.path.basename(args.datafile) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print >> sys.stderr, 'output in', outfile

    if args.savetable:
        print >> sys.stderr, 'Saving k-mer counting table filename', \
            args.savetable
        print >> sys.stderr, '...saving to', args.savetable
        htable.save(args.savetable)
    print >> sys.stderr, 'wrote to: ', outfile
def main():
    info('filter-abund-single.py', ['counting', 'SeqAn'])
    args = get_parser().parse_args()
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)
    if args.savetable:
        check_space_for_hashtable(args, 'countgraph', args.force)
    report_on_config(args)

    print('making countgraph', file=sys.stderr)
    htable = khmer_args.create_countgraph(args)

    # first, load reads into hash table
    rparser = khmer.ReadParser(args.datafile)
    threads = []

    print('consuming input, round 1 --', args.datafile, file=sys.stderr)
    for _ in range(args.threads):
        cur_thread = threading.Thread(
            target=htable.consume_fasta_with_reads_parser,
            args=(rparser, )
        )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    if args.report_total_kmers:
        print('Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers()), file=sys.stderr)

    fp_rate = khmer.calc_expected_collisions(htable, args.force)
    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        _, trim_at = htable.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= args.ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    # the filtering loop
    print('filtering', args.datafile, file=sys.stderr)
    outfile = os.path.basename(args.datafile) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print('output in', outfile, file=sys.stderr)

    if args.savetable:
        print('Saving k-mer counting table filename', args.savetable,
              file=sys.stderr)
        print('...saving to', args.savetable, file=sys.stderr)
        htable.save(args.savetable)
    print('wrote to: ', outfile, file=sys.stderr)
def main():
    info('load-into-counting.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_file_status(name)

    check_space(args.input_sequence_filename)
    check_space_for_hashtable(args.n_tables * args.min_tablesize)

    print 'Saving k-mer counting table to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    print 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables, args.n_threads)
    htable.set_use_bigcount(args.bigcount)

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.n_threads * 64 * 1024)

    for index, filename in enumerate(filenames):
        rparser = khmer.ReadParser(filename, args.n_threads)
        threads = []
        print 'consuming input', filename
        for _ in xrange(args.n_threads):
            cur_thrd = threading.Thread(
                target=htable.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
            threads.append(cur_thrd)
            cur_thrd.start()

        for _ in threads:
            _.join()

        if index > 0 and index % 10 == 0:
            check_space_for_hashtable(args.n_tables * args.min_tablesize)
            print 'mid-save', base
            htable.save(base)
            open(base + '.info', 'w').write('through %s' % filename)

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of k-mers: {0}'.format(
            htable.n_occupied())

    print 'saving', base
    htable.save(base)

    info_fp = open(base + '.info', 'w')
    info_fp.write('through end: %s\n' % filename)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be %1.3f' % fp_rate
    print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the k-mer counting table is too small"
                              " for this data set. Increase tablesize/# tables.")
        print >> sys.stderr, "**"
        sys.exit(1)

    print 'DONE.'
def main():  # pylint: disable=too-many-locals,too-many-branches
    info('abundance-dist-single.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args)

    check_file_status(args.input_sequence_filename)
    check_space([args.input_sequence_filename])
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize)

    if (not args.squash_output and
            os.path.exists(args.output_histogram_filename)):
        print >> sys.stderr, 'ERROR: %s exists; not squashing.' % \
            args.output_histogram_filename
        sys.exit(1)
    else:
        hist_fp = open(args.output_histogram_filename, 'w')

    print 'making k-mer counting table'
    counting_hash = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                            args.n_tables, args.threads)
    counting_hash.set_use_bigcount(args.bigcount)

    print 'building k-mer tracking table'
    tracking = khmer.new_hashbits(counting_hash.ksize(), args.min_tablesize,
                                  args.n_tables)

    print 'kmer_size:', counting_hash.ksize()
    print 'k-mer counting table sizes:', counting_hash.hashsizes()
    print 'outputting to', args.output_histogram_filename

    khmer.get_config().set_reads_input_buffer_size(args.threads * 64 * 1024)

    # start loading
    rparser = khmer.ReadParser(args.input_sequence_filename, args.threads)
    threads = []
    print 'consuming input, round 1 --', args.input_sequence_filename
    for _ in xrange(args.threads):
        thread = threading.Thread(
            target=counting_hash.consume_fasta_with_reads_parser,
            args=(rparser, )
        )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    abundance_lists = []

    def __do_abundance_dist__(read_parser):
        abundances = counting_hash.abundance_distribution_with_reads_parser(
            read_parser, tracking)
        abundance_lists.append(abundances)

    print 'preparing hist from %s...' % args.input_sequence_filename
    rparser = khmer.ReadParser(args.input_sequence_filename, args.threads)
    threads = []
    print 'consuming input, round 2 --', args.input_sequence_filename
    for _ in xrange(args.threads):
        thread = threading.Thread(
            target=__do_abundance_dist__,
            args=(rparser, )
        )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    assert len(abundance_lists) == args.threads, len(abundance_lists)
    abundance = {}
    for abundance_list in abundance_lists:
        for i, count in enumerate(abundance_list):
            abundance[i] = abundance.get(i, 0) + count

    total = sum(abundance.values())

    if 0 == total:
        print >> sys.stderr, \
            "ERROR: abundance distribution is uniformly zero; " \
            "nothing to report."
        print >> sys.stderr, "\tPlease verify that the input files are valid."
        sys.exit(1)

    sofar = 0
    for _, i in sorted(abundance.items()):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        print >> hist_fp, _, i, sofar, round(frac, 3)

        if sofar == total:
            break

    if args.savetable:
        print 'Saving k-mer counting table ', args.savetable
        print '...saving to', args.savetable
        counting_hash.save(args.savetable)
def main():
    parser = build_construct_args()
    add_threading_args(parser)
    parser.add_argument('output_filename')
    parser.add_argument('input_filenames', nargs='+')
    parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    base = args.output_filename
    filenames = args.input_filenames
    n_threads = int(args.n_threads)

    print 'Saving hashtable to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    # print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)
    ht.set_use_bigcount(args.bigcount)

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    for n, filename in enumerate(filenames):
        rparser = khmer.ReadParser(filename, n_threads)
        threads = []
        print 'consuming input', filename
        for tnum in xrange(n_threads):
            t = threading.Thread(
                target=ht.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
            threads.append(t)
            t.start()

        for t in threads:
            t.join()

        if n > 0 and n % 10 == 0:
            print 'mid-save', base
            ht.save(base)
            open(base + '.info', 'w').write('through %s' % filename)

    print 'saving', base
    ht.save(base)

    info_fp = open(base + '.info', 'w')
    info_fp.write('through end: %s\n' % filename)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate
    print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the counting hash is too small for"
        print >> sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >> sys.stderr, "**"
        sys.exit(-1)

    print 'DONE.'
def verbose_loader(filename):
    """Read iterator that additionally prints progress info to stderr."""
    for num, record in enumerate(khmer.ReadParser(filename)):
        if num % 100000 == 0:
            log_info('... filtering {num}', num=num)
        yield record
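For context, a hypothetical usage sketch of the loader above; the file name and the per-record handler are placeholders, not from the original source:

# Stream records while progress is logged every 100000 reads.
for record in verbose_loader('reads.fq'):
    handle_record(record)  # stand-in for the caller's real per-read work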
def main():
    parser = build_construct_args()
    add_threading_args(parser)
    parser.add_argument('--no-build-tagset', '-n', default=False,
                        action='store_true', dest='no_build_tagset',
                        help='Do NOT construct tagset while loading sequences')
    parser.add_argument('output_filename')
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    base = args.output_filename
    filenames = args.input_filenames
    n_threads = int(args.n_threads)

    print 'Saving hashtable to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    if args.no_build_tagset:
        print 'We WILL NOT build the tagset.'
    else:
        print 'We WILL build the tagset (for partitioning/traversal).'

    # print 'making hashtable'
    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    if args.no_build_tagset:
        target_method = ht.consume_fasta_with_reads_parser
    else:
        target_method = ht.consume_fasta_and_tag_with_reads_parser

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    for n, filename in enumerate(filenames):
        rparser = khmer.ReadParser(filename, n_threads)
        threads = []
        print 'consuming input', filename
        for tnum in xrange(n_threads):
            t = threading.Thread(target=target_method, args=(rparser, ))
            threads.append(t)
            t.start()

        for t in threads:
            t.join()

    print 'saving hashtable in', base + '.ht'
    ht.save(base + '.ht')

    if not args.no_build_tagset:
        print 'saving tagset in', base + '.tagset'
        ht.save_tagset(base + '.tagset')

    info_fp = open(base + '.info', 'w')
    info_fp.write('%d unique k-mers' % ht.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate
    if fp_rate > 0.15:          # 0.18 is ACTUAL MAX. Do not change.
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the graph structure is too small for"
        print >> sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >> sys.stderr, "**"
        sys.exit(-1)
def main():
    parser = build_construct_args(
        "Filter k-mers at the given abundance (inmem version).")
    add_threading_args(parser)
    parser.add_argument('--cutoff', '-C', dest='cutoff',
                        default=DEFAULT_CUTOFF, type=int,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('--savehash', dest='savehash', default='')
    parser.add_argument('datafile')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    n_threads = int(args.n_threads)

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)

    filename = args.datafile

    # first, load reads into hash table
    rparser = khmer.ReadParser(filename, n_threads)
    threads = []
    print 'consuming input, round 1 --', filename
    for tnum in xrange(n_threads):
        t = threading.Thread(
            target=ht.consume_fasta_with_reads_parser,
            args=(rparser, )
        )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff)

        if trim_at >= K:
            return name, trim_seq

        return None, None

    # the filtering loop
    print 'filtering', filename
    outfile = os.path.basename(filename) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(filename), outfp)

    print 'output in', outfile

    if args.savehash:
        print 'Saving hashfile', args.savehash
        print '...saving to', args.savehash
        ht.save(args.savehash)
def main():
    parser = argparse.ArgumentParser(
        description="This script will create node graph for a given k-mer size and query file "
                    "(can be used as input to QueryDNADatabase.py)",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-fp', '--fp_rate', type=restricted_float,
                        help="False positive rate.", default=0.0001)
    parser.add_argument('-i', '--intersect_nodegraph',
                        help="Location of Node Graph. Will only insert query k-mers in bloom filter "
                             "if they appear anywhere in the training database. Note that the Jaccard "
                             "estimates will now be J(query intersect union_i training_i, training_i) "
                             "instead of J(query, training_i), but will use significantly less space "
                             "(unfortunately will also disable threading).")
    parser.add_argument('-k', '--k_size', type=int, help="K-mer size",
                        default=21)
    parser.add_argument('-t', '--threads', type=int,
                        help="Number of threads to use",
                        default=multiprocessing.cpu_count())
    parser.add_argument('in_file',
                        help="Input file: FASTQ/A file (can be gzipped).")
    parser.add_argument('out_dir', help='Output directory')

    # Parse and check args
    args = parser.parse_args()
    query_file = os.path.abspath(args.in_file)
    ksize = args.k_size
    num_threads = args.threads
    node_graph_out = os.path.join(
        os.path.abspath(args.out_dir),
        os.path.basename(query_file) + ".NodeGraph.K" + str(ksize))
    if args.intersect_nodegraph is not None:
        intersect_nodegraph_file = args.intersect_nodegraph
    else:
        intersect_nodegraph_file = None
    intersect_nodegraph = None
    if intersect_nodegraph_file is not None:
        if not os.path.exists(intersect_nodegraph_file):
            raise Exception(
                "Intersection nodegraph does not exist. "
                "Please re-run MakeDNADatabase.py with the -i flag.")
        try:
            intersect_nodegraph = khmer.load_nodegraph(intersect_nodegraph_file)
            if intersect_nodegraph.ksize() != ksize:
                raise Exception(
                    "Given intersect nodegraph %s has K-mer size %d while the database K-mer size is %d"
                    % (intersect_nodegraph_file, intersect_nodegraph.ksize(),
                       ksize))
        except:
            raise Exception("Could not load given intersect nodegraph %s" %
                            intersect_nodegraph_file)
    fprate = args.fp_rate
    hll = khmer.HLLCounter(0.01, ksize)
    hll.consume_seqfile(query_file)
    full_kmer_count_estimate = hll.estimate_cardinality()
    res = optimal_size(full_kmer_count_estimate, fp_rate=fprate)
    if intersect_nodegraph is None:  # If no intersect list was given, just populate the bloom filter
        sample_kmers = khmer.Nodegraph(ksize, res.htable_size, res.num_htables)
        #sample_kmers.consume_seqfile(query_file)
        rparser = khmer.ReadParser(query_file)
        threads = []
        for _ in range(num_threads):
            cur_thrd = threading.Thread(
                target=sample_kmers.consume_seqfile_with_reads_parser,
                args=(rparser, ))
            threads.append(cur_thrd)
            cur_thrd.start()
        for thread in threads:
            thread.join()
    else:
        # Otherwise, only put a k-mer in the bloom filter if it's in the intersect list
        # (WARNING: this will cause the Jaccard index to be calculated in terms of
        # J(query intersect hash_list, training) instead of J(query, training))
        # (TODO: fix this after khmer is updated)
        #intersect_nodegraph_kmer_count = intersect_nodegraph.n_unique_kmers()  # Doesn't work due to khmer bug
        intersect_nodegraph_kmer_count = intersect_nodegraph.n_occupied()  # Doesn't work due to khmer bug
        if intersect_nodegraph_kmer_count < full_kmer_count_estimate:
            # At most, we have as many k-mers as in the union of the training
            # database (but this makes the check always return 0)
            res = optimal_size(intersect_nodegraph_kmer_count, fp_rate=fprate)
            sample_kmers = khmer.Nodegraph(ksize, res.htable_size,
                                           res.num_htables)
        else:
            sample_kmers = khmer.Nodegraph(ksize, res.htable_size,
                                           res.num_htables)
        for record in screed.open(query_file):
            seq = record.sequence
            for i in range(len(seq) - ksize + 1):
                kmer = seq[i:i + ksize]
                if intersect_nodegraph.get(kmer) > 0:
                    sample_kmers.add(kmer)
    # Save the sample_kmers
    sample_kmers.save(node_graph_out)
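A small hypothetical follow-up, not part of the original script, that reloads the saved nodegraph and sanity-checks its k-mer size using the node_graph_out and ksize variables defined above:

# Reload the freshly saved nodegraph and confirm the k-mer size round-trips.
reloaded = khmer.load_nodegraph(node_graph_out)
assert reloaded.ksize() == ksize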
def main():
    info('load-graph.py', ['graph'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='hashbits')

    base = args.output_filename
    filenames = args.input_filenames

    for _ in args.input_filenames:
        check_file_status(_)

    check_space(args.input_filenames)
    check_space_for_hashtable(float(args.n_tables * args.min_tablesize) / 8.)

    print 'Saving k-mer presence table to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    if args.no_build_tagset:
        print 'We WILL NOT build the tagset.'
    else:
        print 'We WILL build the tagset (for partitioning/traversal).'

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.n_threads * 64 * 1024)

    print 'making k-mer presence table'
    htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables)

    if args.no_build_tagset:
        target_method = htable.consume_fasta_with_reads_parser
    else:
        target_method = htable.consume_fasta_and_tag_with_reads_parser

    for _, filename in enumerate(filenames):
        rparser = khmer.ReadParser(filename, 1)
        print 'consuming input', filename
        target_method(rparser)

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of k-mers: {0}'.format(
            htable.n_occupied())

    print 'saving k-mer presence table in', base + '.pt'
    htable.save(base + '.pt')

    if not args.no_build_tagset:
        print 'saving tagset in', base + '.tagset'
        htable.save_tagset(base + '.tagset')

    info_fp = open(base + '.info', 'w')
    info_fp.write('%d unique k-mers' % htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be %1.3f' % fp_rate

    if args.write_fp_rate:
        print >> info_fp, \
            '\nfalse positive rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.15:          # 0.18 is ACTUAL MAX. Do not change.
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the graph structure is too small for "
                              "this data set. Increase table size/# tables.")
        print >> sys.stderr, "**"
        sys.exit(1)
def load_sample_seqfile(seqfiles, ksize, memory, maxfpr=0.2, count=True,
                        smallcount=False, mask=None, maskmaxabund=0,
                        consume_masked=False, numbands=None, band=None,
                        outfile=None, numthreads=1):
    """Compute k-mer abundances for the specified sequence input.

    Expected input is a list of one or more FASTA/FASTQ files corresponding
    to a single sample. A sketch is created and populated with abundances of
    all k-mers observed in the input. If `mask` is provided, only k-mers not
    present in the mask will be loaded.
    """
    numtables = 4
    sketchtype = 'nodegraph'
    if count:
        sketchtype = 'smallcountgraph' if smallcount else 'countgraph'
    tablesize = memory / numtables * khmer._buckets_per_byte[sketchtype]
    sketch = allocate(ksize, tablesize, num_tables=numtables, count=count,
                      smallcount=smallcount)
    numreads = 0
    for seqfile in seqfiles:
        message = '- processing "{}"'.format(seqfile)
        kevlar.plog('[kevlar::count]', message)
        parser = khmer.ReadParser(seqfile)
        threads = list()
        for _ in range(numthreads):
            if mask:
                threshold = 1 if consume_masked else maskmaxabund
                kwargs = {
                    'consume_masked': consume_masked,
                    'threshold': threshold
                }
                if numbands:
                    thread = threading.Thread(
                        target=sketch.consume_seqfile_banding_with_mask,
                        args=(parser, numbands, band, mask, ),
                        kwargs=kwargs,
                    )
                else:
                    thread = threading.Thread(
                        target=sketch.consume_seqfile_with_mask,
                        args=(parser, mask, ),
                        kwargs=kwargs,
                    )
            else:
                if numbands:
                    thread = threading.Thread(
                        target=sketch.consume_seqfile_banding,
                        args=(parser, numbands, band, ),
                    )
                else:
                    thread = threading.Thread(
                        target=sketch.consume_seqfile,
                        args=(parser, ),
                    )
            threads.append(thread)
            thread.start()
        for thread in threads:
            thread.join()
        numreads += parser.num_reads

    message = 'Done loading k-mers'
    if numbands:
        message += ' (band {:d}/{:d})'.format(band + 1, numbands)
    fpr = kevlar.sketch.estimate_fpr(sketch)
    message += ';\n    {:d} reads processed'.format(numreads)
    message += ', {:d} distinct k-mers stored'.format(sketch.n_unique_kmers())
    message += ';\n    estimated false positive rate is {:1.3f}'.format(fpr)
    if fpr > maxfpr:
        message += ' (FPR too high, bailing out!!!)'
        message = '[kevlar::count] ' + message
        raise kevlar.sketch.KevlarUnsuitableFPRError(message)
    if outfile:
        extensions = get_extension(count=count, smallcount=smallcount)
        if not outfile.endswith(extensions):
            outfile += extensions[1]
        sketch.save(outfile)
        message += ';\n    saved to "{:s}"'.format(outfile)
    kevlar.plog('[kevlar::count]', message)
    return sketch
def main():  # pylint: disable=too-many-locals,too-many-branches
    args = sanitize_help(get_parser()).parse_args()

    graph_type = 'smallcountgraph' if args.small_count else 'countgraph'

    configure_logging(args.quiet)
    report_on_config(args, graph_type)

    check_input_files(args.input_sequence_filename, args.force)
    if args.savegraph is not None:
        graphsize = calculate_graphsize(args, graph_type)
        check_space_for_graph(args.savegraph, graphsize, args.force)
    if (not args.squash_output and
            os.path.exists(args.output_histogram_filename)):
        log_error('ERROR: {output} exists; not squashing.',
                  output=args.output_histogram_filename)
        sys.exit(1)
    else:
        hist_fp = open(args.output_histogram_filename, 'w')
        hist_fp_csv = csv.writer(hist_fp)
        # write headers:
        hist_fp_csv.writerow(
            ['abundance', 'count', 'cumulative', 'cumulative_fraction'])

    log_info('making countgraph')
    # In case the user specified a maximum memory usage, use 8/(9+eps) of that
    # for the countgraph and 1/(9+eps) for the tracking nodegraph
    # `eps` is used to account for the memory used by the python interpreter
    countgraph = khmer_args.create_countgraph(args, multiplier=8 / (9. + 0.3))

    log_info('building k-mer tracking graph')
    tracking = khmer_args.create_matching_nodegraph(countgraph)

    log_info('kmer_size: {ksize}', ksize=countgraph.ksize())
    log_info('k-mer countgraph sizes: {sizes}', sizes=countgraph.hashsizes())
    log_info('outputting to {output}', output=args.output_histogram_filename)

    # start loading
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    log_info('consuming input, round 1 -- {input}',
             input=args.input_sequence_filename)
    for _ in range(args.threads):
        thread = threading.Thread(
            target=countgraph.consume_seqfile_with_reads_parser,
            args=(rparser, )
        )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    log_info('Total number of unique k-mers: {nk}',
             nk=countgraph.n_unique_kmers())

    abundance_lists = []

    def __do_abundance_dist__(read_parser):
        abundances = countgraph.abundance_distribution_with_reads_parser(
            read_parser, tracking)
        abundance_lists.append(abundances)

    log_info('preparing hist from {seqfile}...',
             seqfile=args.input_sequence_filename)
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    log_info('consuming input, round 2 -- {filename}',
             filename=args.input_sequence_filename)
    for _ in range(args.threads):
        thread = threading.Thread(
            target=__do_abundance_dist__,
            args=(rparser, )
        )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    assert len(abundance_lists) == args.threads, len(abundance_lists)
    abundance = {}
    for abundance_list in abundance_lists:
        for i, count in enumerate(abundance_list):
            abundance[i] = abundance.get(i, 0) + count

    total = sum(abundance.values())

    if 0 == total:
        log_error("ERROR: abundance distribution is uniformly zero; "
                  "nothing to report.")
        log_error("\tPlease verify that the input files are valid.")
        sys.exit(1)

    sofar = 0
    for _, i in sorted(abundance.items()):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        hist_fp_csv.writerow([_, i, sofar, round(frac, 3)])

        if sofar == total:
            break

    if args.savegraph is not None:
        log_info('Saving k-mer countgraph to {savegraph}',
                 savegraph=args.savegraph)
        countgraph.save(args.savegraph)

    log_info('wrote to: {output}', output=args.output_histogram_filename)
def load_sample_seqfile(seqfiles, ksize, memory, maxfpr=0.2, mask=None,
                        maskmaxabund=1, numbands=None, band=None,
                        outfile=None, numthreads=1, logfile=sys.stderr):
    """
    Compute k-mer abundances for the specified sequence input.

    Expected input is a list of one or more FASTA/FASTQ files corresponding
    to a single sample. A counttable is created and populated with abundances
    of all k-mers observed in the input. If `mask` is provided, only k-mers
    not present in the mask will be loaded.
    """
    message = 'loading from ' + ','.join(seqfiles)
    print('[kevlar::count] ', message, file=logfile)

    sketch = khmer.Counttable(ksize, memory / 4, 4)
    n, nkmers = 0, 0
    for seqfile in seqfiles:
        parser = khmer.ReadParser(seqfile)
        threads = list()
        for _ in range(numthreads):
            if mask:
                if numbands:
                    thread = threading.Thread(
                        target=sketch.consume_seqfile_banding_with_mask,
                        args=(parser, numbands, band, mask, ),
                    )
                else:
                    thread = threading.Thread(
                        target=sketch.consume_seqfile_with_mask,
                        args=(parser, mask, ),
                    )
            else:
                if numbands:
                    thread = threading.Thread(
                        target=sketch.consume_seqfile_banding,
                        args=(parser, numbands, band, ),
                    )
                else:
                    thread = threading.Thread(
                        target=sketch.consume_seqfile,
                        args=(parser, ),
                    )
            threads.append(thread)
            thread.start()
        for thread in threads:
            thread.join()

    message = 'done loading reads'
    if numbands:
        message += ' (band {:d}/{:d})'.format(band + 1, numbands)
    fpr = kevlar.sketch.estimate_fpr(sketch)
    message += ';\n    {:d} reads processed'.format(parser.num_reads)
    message += ', {:d} distinct k-mers stored'.format(sketch.n_unique_kmers())
    message += ';\n    estimated false positive rate is {:1.3f}'.format(fpr)
    if fpr > maxfpr:
        message += ' (FPR too high, bailing out!!!)'
        message = '[kevlar::count] ' + message
        raise kevlar.sketch.KevlarUnsuitableFPRError(message)
    if outfile:
        if not outfile.endswith(('.ct', '.counttable')):
            outfile += '.counttable'
        sketch.save(outfile)
        message += ';\n    saved to "{:s}"'.format(outfile)
    print('[kevlar::count] ', message, file=logfile)
    return sketch
        return to_return

    # Initialize the counters
    # TODO: note, I could be doing a partial dedup here, just to reduce the memory usage...
    counter = Counters()

    def map_func(sequence):
        return counter.process_seq(sequence)

    pool = multiprocessing.Pool(processes=num_threads)

    if verbose:
        print("Start streaming")
        t0 = timeit.default_timer()
    # populate the queue
    fid = khmer.ReadParser(query_file)  # This is faster than screed
    match_tuples = []
    #num_reads_per_core = 100000
    num_reads_per_chunk = num_reads_per_core * num_threads
    to_proc = [record.sequence for record in islice(fid, num_reads_per_chunk)]
    i = 0
    while to_proc:
        i += len(to_proc)
        if verbose:
            print("Read in %d sequences" % i)
        res = pool.map(
            map_func, to_proc,
            chunksize=max(1, min(num_reads_per_core,
                                 len(to_proc) / num_threads)))
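A minimal, self-contained sketch of the same chunked-streaming idea, assuming only that khmer.ReadParser is iterable; the function name and default chunk size are illustrative, not from the original script:

import khmer
from itertools import islice

def iter_sequence_chunks(seqfile, chunk_size=100000):
    """Yield lists of sequences from seqfile, chunk_size records at a time."""
    parser = khmer.ReadParser(seqfile)
    while True:
        chunk = [record.sequence for record in islice(parser, chunk_size)]
        if not chunk:
            break
        yield chunk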
def sequenceToHistograma(nome, sequence):
    ksize = 3
    nkmers = 4**ksize
    tablesize = nkmers + 10

    # Initialize countgraph
    cg = khmer.Countgraph(ksize, tablesize, 1)
    # print('Created a countgraph with', cg.hashsizes(), 'buckets')

    # start loading
    # auxNome = "sequenciaAuxliar.fa"
    # aux = open(auxNome, 'w')
    # aux.write(nome)
    # aux.write(sequence+"\n")
    # aux.close()
    # fp = TemporaryFile('w+t')
    # fp = ff.TemporaryFile(mode='w+t', suffix=".fasta")
    with NamedTemporaryFile(prefix="lucas", suffix=".fasta", delete=False,
                            mode="w+t") as fp:
        fp.write(nome)
        fp.write(sequence)
        fp.seek(0)
        # print(fp.name)
        # print(fp.read())
    # print(fp.read())
    fp.close()
    rparser = khmer.ReadParser(fp.name)
    # fp.close()
    # os.remove(fp.name)
    # os.unlink(fp.name)
    # rparser2 = rparser
    # aux.closes
    # os.remove(auxNome)
    # rparser = khmer.ReadParser(sequence)
    threads = []
    for _ in range(1):
        thread = threading.Thread(
            target=cg.consume_seqfile_with_reads_parser,
            args=(rparser, )
        )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    # print('unique', cg.n_unique_kmers())
    h = Histograma(nome, cg.n_unique_kmers(), nkmers, tablesize, len(sequence))

    abundance_lists = []
    tracking = khmer_args.create_matching_nodegraph(cg)

    def __do_abundance_dist__(read_parser):
        abundances = cg.abundance_distribution_with_reads_parser(
            read_parser, tracking)
        abundance_lists.append(abundances)

    # with NamedTemporaryFile(prefix="lucas", suffix=".fasta", delete=False, mode="w+t") as fp:
    #     fp.write(nome)
    #     fp.write(sequence)
    #     fp.seek(0)
    # print(fp.name)
    rparser2 = khmer.ReadParser(fp.name)
    # fp.close()
    # os.remove(fp.name)
    # # os.unlink(fp.name)
    threads = []
    for _ in range(1):
        thread = threading.Thread(
            target=__do_abundance_dist__,
            args=(rparser2, )
        )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    assert len(abundance_lists) == 1, len(abundance_lists)
    abundance = {}
    for abundance_list in abundance_lists:
        for i, count in enumerate(abundance_list):
            abundance[i] = abundance.get(i, 0) + count

    total = sum(abundance.values())

    if 0 == total:
        print("ERROR: abundance distribution is uniformly zero; "
              "nothing to report.")
        print("\tPlease verify that the input files are valid.")
        return 0

    sofar = 0
    line = 0
    for _, i in sorted(abundance.items()):
        if i == 0 and line < h.tablesize:
            continue

        sofar += i
        frac = sofar / float(total)
        # hist_fp_csv.writerow([_, i, sofar, round(frac, 3)])
        # print(line, tablesize, [_, i, sofar, round(frac, 3)])
        h.histo[line][0] = _
        h.histo[line][1] = i
        h.histo[line][2] = sofar
        h.histo[line][3] = round(frac, 3)
        line = line + 1

        if sofar == total:
            break

    return h
def main():
    info('load-into-counting.py', ['counting', 'SeqAn'])

    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_input_files(name, args.force)

    check_space(args.input_sequence_filename, args.force)
    check_space_for_hashtable(args.n_tables * args.min_tablesize, args.force)

    check_file_writable(base)
    check_file_writable(base + ".info")

    print('Saving k-mer counting table to %s' % base, file=sys.stderr)
    print('Loading kmers from sequences in %s' % repr(filenames),
          file=sys.stderr)

    # clobber the '.info' file now, as we always open in append mode below
    if os.path.exists(base + '.info'):
        os.remove(base + '.info')

    print('making k-mer counting table', file=sys.stderr)
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables)
    htable.set_use_bigcount(args.bigcount)

    filename = None

    total_num_reads = 0

    for index, filename in enumerate(filenames):
        rparser = khmer.ReadParser(filename)
        threads = []
        print('consuming input', filename, file=sys.stderr)
        for _ in range(args.threads):
            cur_thrd = threading.Thread(
                target=htable.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
            threads.append(cur_thrd)
            cur_thrd.start()

        for thread in threads:
            thread.join()

        if index > 0 and index % 10 == 0:
            check_space_for_hashtable(args.n_tables * args.min_tablesize,
                                      args.force)
            print('mid-save', base, file=sys.stderr)
            htable.save(base)
            with open(base + '.info', 'a') as info_fh:
                print('through', filename, file=info_fh)
        total_num_reads += rparser.num_reads

    n_kmers = htable.n_unique_kmers()
    if args.report_total_kmers:
        print('Total number of unique k-mers:', n_kmers, file=sys.stderr)
        with open(base + '.info', 'a') as info_fp:
            print('Total number of unique k-mers:', n_kmers, file=info_fp)

    print('saving', base, file=sys.stderr)
    htable.save(base)

    # Change max_false_pos=0.2 only if you really grok it. HINT: You don't
    fp_rate = khmer.calc_expected_collisions(htable, args.force,
                                             max_false_pos=.2)

    with open(base + '.info', 'a') as info_fp:
        print('fp rate estimated to be %1.3f\n' % fp_rate, file=info_fp)

    if args.summary_info:
        mr_fmt = args.summary_info.lower()
        mr_file = base + '.info.' + mr_fmt
        print("Writing summary info to", mr_file, file=sys.stderr)
        with open(mr_file, 'w') as mr_fh:
            if mr_fmt == 'json':
                mr_data = {
                    "ht_name": os.path.basename(base),
                    "fpr": fp_rate,
                    "num_kmers": n_kmers,
                    "files": filenames,
                    "mrinfo_version": "0.2.0",
                    "num_reads": total_num_reads,
                }
                json.dump(mr_data, mr_fh)
                mr_fh.write('\n')
            elif mr_fmt == 'tsv':
                mr_fh.write("ht_name\tfpr\tnum_kmers\tnum_reads\tfiles\n")
                vals = [
                    os.path.basename(base),
                    "{:1.3f}".format(fp_rate),
                    str(n_kmers),
                    str(total_num_reads),
                    ";".join(filenames),
                ]
                mr_fh.write("\t".join(vals) + "\n")

    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)

    print('DONE.', file=sys.stderr)
    print('wrote to:', base + '.info', file=sys.stderr)
def main():
    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('filter-abund-single.py', ['counting', 'SeqAn'])

    configure_logging(args.quiet)
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)
    if args.savegraph:
        tablesize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, tablesize, args.force)
    report_on_config(args)

    log_info('making countgraph')
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    log_info('consuming input, round 1 -- {datafile}', datafile=args.datafile)
    for _ in range(args.threads):
        cur_thread = threading.Thread(
            target=graph.consume_fasta_with_reads_parser,
            args=(rparser, )
        )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    log_info('Total number of unique k-mers: {nk}', nk=graph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        _, trim_at = graph.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= args.ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    # the filtering loop
    log_info('filtering {datafile}', datafile=args.datafile)
    if args.outfile is None:
        outfile = os.path.basename(args.datafile) + '.abundfilt'
    else:
        outfile = args.outfile
    outfp = open(outfile, 'wb')
    outfp = get_file_writer(outfp, args.gzip, args.bzip)

    tsp = ThreadedSequenceProcessor(process_fn, verbose=not args.quiet)
    tsp.start(verbose_loader(args.datafile), outfp)

    log_info('output in {outfile}', outfile=outfile)

    if args.savegraph:
        log_info('Saving k-mer countgraph filename {graph}',
                 graph=args.savegraph)
        graph.save(args.savegraph)
def main():  # pylint: disable=too-many-locals,too-many-branches
    info('abundance-dist-single.py', ['counting', 'SeqAn'])
    args = get_parser().parse_args()
    report_on_config(args)

    check_input_files(args.input_sequence_filename, args.force)
    check_space([args.input_sequence_filename], args.force)
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize,
                                  args.force)

    if (not args.squash_output and
            os.path.exists(args.output_histogram_filename)):
        print('ERROR: %s exists; not squashing.' %
              args.output_histogram_filename, file=sys.stderr)
        sys.exit(1)
    else:
        hist_fp = open(args.output_histogram_filename, 'w')
        if args.csv:
            hist_fp_csv = csv.writer(hist_fp)
            # write headers:
            hist_fp_csv.writerow(
                ['abundance', 'count', 'cumulative', 'cumulative_fraction'])

    print('making k-mer counting table', file=sys.stderr)
    counting_hash = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                            args.n_tables)
    counting_hash.set_use_bigcount(args.bigcount)

    print('building k-mer tracking table', file=sys.stderr)
    tracking = khmer.new_hashbits(counting_hash.ksize(), args.min_tablesize,
                                  args.n_tables)

    print('kmer_size:', counting_hash.ksize(), file=sys.stderr)
    print('k-mer counting table sizes:', counting_hash.hashsizes(),
          file=sys.stderr)
    print('outputting to', args.output_histogram_filename, file=sys.stderr)

    # start loading
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    print('consuming input, round 1 --', args.input_sequence_filename,
          file=sys.stderr)
    for _ in range(args.threads):
        thread = threading.Thread(
            target=counting_hash.consume_fasta_with_reads_parser,
            args=(rparser, )
        )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    if args.report_total_kmers:
        print('Total number of unique k-mers: {0}'.format(
            counting_hash.n_unique_kmers()), file=sys.stderr)

    abundance_lists = []

    def __do_abundance_dist__(read_parser):
        abundances = counting_hash.abundance_distribution_with_reads_parser(
            read_parser, tracking)
        abundance_lists.append(abundances)

    print('preparing hist from %s...' % args.input_sequence_filename,
          file=sys.stderr)
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    print('consuming input, round 2 --', args.input_sequence_filename,
          file=sys.stderr)
    for _ in range(args.threads):
        thread = threading.Thread(
            target=__do_abundance_dist__,
            args=(rparser, )
        )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    assert len(abundance_lists) == args.threads, len(abundance_lists)
    abundance = {}
    for abundance_list in abundance_lists:
        for i, count in enumerate(abundance_list):
            abundance[i] = abundance.get(i, 0) + count

    total = sum(abundance.values())

    if 0 == total:
        print("ERROR: abundance distribution is uniformly zero; "
              "nothing to report.", file=sys.stderr)
        print("\tPlease verify that the input files are valid.",
              file=sys.stderr)
        sys.exit(1)

    sofar = 0
    for _, i in sorted(abundance.items()):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        if args.csv:
            hist_fp_csv.writerow([_, i, sofar, round(frac, 3)])
        else:
            print(_, i, sofar, round(frac, 3), file=hist_fp)

        if sofar == total:
            break

    if args.savetable:
        print('Saving k-mer counting table ', args.savetable, file=sys.stderr)
        print('...saving to', args.savetable, file=sys.stderr)
        counting_hash.save(args.savetable)

    print('wrote to: ' + args.output_histogram_filename, file=sys.stderr)
def main():
    parser = build_nodegraph_args("find uniq kmer in query compared to refs")
    parser.add_argument('query',
                        help=('fasta readfile to query against '
                              'hashtable, use "-" if from stdin'))
    parser.add_argument('--x2', default='1e8',
                        help='max_table size for readfile2')
    parser.add_argument('--N2', default='4',
                        help='# of table (N) for readfile2')
    parser.add_argument('--bfout', help='output bloom filter of ref')

    group = parser.add_mutually_exclusive_group()
    group.add_argument('--shared', dest='output', action='store_const',
                       const='shared', help='output shared kmers')
    group.add_argument('--uniq', dest='output', action='store_const',
                       const='uniq', help='output uniq kmers in query')

    group2 = parser.add_mutually_exclusive_group(required=True)
    group2.add_argument('--ref', nargs='+',
                        help='fasta sequence file to be loaded in bloom filter')
    group2.add_argument('--load', help='load existing bloom filter')

    parser.set_defaults(output='uniq')

    args = parser.parse_args()
    #print(args, file=sys.stderr)

    K = args.ksize
    HT_SIZE = args.max_tablesize
    N_HT = args.n_tables
    HT_SIZE2 = int(float(args.x2))
    N_HT2 = int(args.N2)

    # positional
    query = args.query
    output = args.output

    start_time = time.time()
    # load from existing bloom filter
    if args.load:
        print('loading bloom filter from {}..'.format(args.load),
              file=sys.stderr)
        ht = khmer.load_nodetable(args.load)
        k = ht.ksize()
        mes = ('*** incompatible ksize ({}) in {} with parameters K on '
               'command line ({})')
        assert k == K, mes.format(k, args.load, K)

        end_time = time.time()
        secs = end_time - start_time
        mes = 'load bloom filter ({}) took {:.2f} hours..'
        print(mes.format(os.path.basename(args.load), secs / 3600.0),
              file=sys.stderr)
    # create a hashbits data structure
    else:
        refs = args.ref
        print('{} refs to be loaded'.format(len(refs)), file=sys.stderr)
        if query == '-' and refs == ['-']:
            print('*** query and ref can not both be "-" (read from stdin)',
                  file=sys.stderr)
        ht = khmer.Nodetable(K, HT_SIZE, N_HT)
        end_time = time.time()
        secs = end_time - start_time
        mes = 'initiation of bloom filter took {:.2f} hours..'
        print(mes.format(secs / 3600.0), file=sys.stderr)

        for index, filename in enumerate(refs):
            if index != 0 and index % 100 == 0:
                end_time = time.time()
                secs = end_time - start_time
                mes = '{} refs have been loaded with in {:.2f} hours ..'
                print(mes.format(index, secs / 3600.0), file=sys.stderr)
            try:
                ht.consume_seqfile(filename)
            except OSError as e:
                mes = ('*** Skipping due to OSError (machine or system problem):'
                       ' {}\n'
                       '*** Detailed error message:\n'
                       '*** {}')
                print(mes.format(os.path.basename(filename), str(e)),
                      file=sys.stderr)
                continue

        if args.bfout:
            if args.load:
                mes = '*** Bloom filter exists as {}, NOT saving again as {}..'
                print(mes.format(args.load, args.bfout), file=sys.stderr)
            else:
                print('*** Saving bloom filter to {}..'.format(args.bfout),
                      file=sys.stderr)
                ht.save(args.bfout)

    # Change 0.2 only if you really grok it. HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    mes = 'fp rate estimated to be {:1.3f}'
    print(mes.format(fp_rate), file=sys.stderr)

    if fp_rate > 0.01:
        mes = ('**\n'
               '** ERROR: the counting hash is too small for\n'
               '** refs. Increase hashsize/num ht.\n'
               '**\n'
               '** Do not use these results!!')
        sys.exit(-1)

    n_unique1 = ht.n_unique_kmers()

    # create a hashbits data structure
    ht2 = khmer.Nodetable(K, HT_SIZE2, N_HT2)

    n_unique2 = 0
    n_shared = 0

    if output == 'uniq':
        for n, record in enumerate(khmer.ReadParser(query)):
            #for n, record in enumerate(screed.open(query)):
            _l = record.name.split(None, 1)
            if len(_l) == 2:
                name, desc = _l
            else:
                name = _l[0]
                desc = ''
            sequence = record.sequence.replace('N', 'A')
            seq_len = len(sequence)
            if seq_len < K:
                continue
            for i in range(0, seq_len + 1 - K):
                kmer = sequence[i:i + K]
                if (not ht2.get(kmer)):
                    n_unique2 += 1
                    if ht.get(kmer):
                        n_shared += 1
                    else:
                        mes = '>{}__{} {}||length_{};k_{}\n{}'
                        print(mes.format(name, i, desc, seq_len, K, kmer))
                    ht2.count(kmer)

    elif output == 'shared':
        for n, record in enumerate(khmer.ReadParser(query)):
            #for n, record in enumerate(screed.open(query)):
            _l = record.name.split(None, 1)
            if len(_l) == 2:
                name, desc = _l
            else:
                name = _l[0]
                desc = ''
            sequence = record.sequence.replace('N', 'A')
            seq_len = len(sequence)
            if seq_len < K:
                continue
            for i in range(0, seq_len + 1 - K):
                kmer = sequence[i:i + K]
                if (not ht2.get(kmer)):
                    n_unique2 += 1
                    if ht.get(kmer):
                        n_shared += 1
                        mes = '>{}__{} {}||length_{};k_{}\n{}'
                        print(mes.format(name, i, desc, seq_len, K, kmer))
                    else:
                        pass
                    ht2.count(kmer)

    mes = ('Unique kmer in {} (query):\t{}\n'
           'Shared kmer:\t{}\n'
           'Unique kmer in {}:\t{}\n')
    print(mes.format(os.path.basename(query), n_unique2, n_shared,
                     'refs', n_unique1), file=sys.stderr)
def main():
    info('load-into-counting.py', ['counting'])

    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_file_status(name, args.force)

    check_space(args.input_sequence_filename, args.force)
    check_space_for_hashtable(args.n_tables * args.min_tablesize, args.force)

    print >> sys.stderr, 'Saving k-mer counting table to %s' % base
    print >> sys.stderr, 'Loading kmers from sequences in %s' % repr(filenames)

    # clobber the '.info' file now, as we always open in append mode below
    if os.path.exists(base + '.info'):
        os.remove(base + '.info')

    print >> sys.stderr, 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables)
    htable.set_use_bigcount(args.bigcount)

    filename = None

    for index, filename in enumerate(filenames):
        rparser = khmer.ReadParser(filename)
        threads = []
        print >> sys.stderr, 'consuming input', filename
        for _ in xrange(args.threads):
            cur_thrd = threading.Thread(
                target=htable.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
            threads.append(cur_thrd)
            cur_thrd.start()

        for thread in threads:
            thread.join()

        if index > 0 and index % 10 == 0:
            check_space_for_hashtable(args.n_tables * args.min_tablesize)
            print >> sys.stderr, 'mid-save', base
            htable.save(base)
            with open(base + '.info', 'a') as info_fh:
                print >> info_fh, 'through', filename

    n_kmers = htable.n_unique_kmers()
    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers:', n_kmers
        with open(base + '.info', 'a') as info_fp:
            print >> info_fp, 'Total number of unique k-mers:', n_kmers

    print >> sys.stderr, 'saving', base
    htable.save(base)

    fp_rate = khmer.calc_expected_collisions(htable)

    with open(base + '.info', 'a') as info_fp:
        print >> info_fp, 'fp rate estimated to be %1.3f\n' % fp_rate

    if args.summary_info:
        mr_fmt = args.summary_info.lower()
        mr_file = base + '.info.' + mr_fmt
        print >> sys.stderr, "Writing summary info to", mr_file
        with open(mr_file, 'w') as mr_fh:
            if mr_fmt == 'json':
                mr_data = {
                    "ht_name": os.path.basename(base),
                    "fpr": fp_rate,
                    "num_kmers": n_kmers,
                    "files": filenames,
                    "mrinfo_version": "0.1.0",
                }
                json.dump(mr_data, mr_fh)
                mr_fh.write('\n')
            elif mr_fmt == 'tsv':
                mr_fh.write("ht_name\tfpr\tnum_kmers\tfiles\n")
                mr_fh.write("{b:s}\t{fpr:1.3f}\t{k:d}\t{fls:s}\n".format(
                    b=os.path.basename(base), fpr=fp_rate, k=n_kmers,
                    fls=";".join(filenames)))

    print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

    # Change 0.2 only if you really grok it. HINT: You don't.
    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the k-mer counting table is too small",
        print >> sys.stderr, "for this data set. Increase tablesize/# tables."
        print >> sys.stderr, "**"
        sys.exit(1)

    print >> sys.stderr, 'DONE.'
    print >> sys.stderr, 'wrote to:', base + '.info'
def multi_file_iter_khmer(filenames):
    for filename in filenames:
        for record in khmer.ReadParser(filename):
            yield record
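A hypothetical usage sketch of the generator above (the file names are placeholders, not from the original source):

# Count the total number of records across several FASTA/FASTQ files.
total_records = sum(1 for _ in multi_file_iter_khmer(['reads_1.fq', 'reads_2.fq']))
print('total records:', total_records)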
def test_read_bundler_single_read():
    infile = utils.get_test_data('single-read.fq')
    records = [r for r in khmer.ReadParser(infile)]
    bundle = khmer.utils.ReadBundle(*records)
    assert bundle.num_reads == 1
    assert bundle.reads[0].sequence == bundle.reads[0].cleaned_seq
def main():
    parser = argparse.ArgumentParser(
        description="This script creates a CSV file of similarity indices between the"
                    " input file and each of the sketches in the training/reference file.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-t', '--threads', type=int,
                        help="Number of threads to use",
                        default=multiprocessing.cpu_count())
    parser.add_argument('-f', '--force', action="store_true",
                        help="Force creation of new NodeGraph.")
    parser.add_argument('-fp', '--fp_rate', type=restricted_float,
                        help="False positive rate.", default=0.0001)
    parser.add_argument('-ct', '--containment_threshold', type=restricted_float,
                        help="Only return results with containment index above this value",
                        default=0.02)
    parser.add_argument('-c', '--confidence', type=restricted_float,
                        help="Desired probability that all results were returned with "
                             "containment index above threshold [-ct]",
                        default=0.95)
    parser.add_argument('-ng', '--node_graph', default=None,
                        help="NodeGraph/bloom filter location. Used if it exists; if not, one "
                             "will be created and put in the same directory as the specified "
                             "output CSV file.")
    parser.add_argument('-b', '--base_name', action="store_true",
                        help="Flag to indicate that only the base names (not the full path) "
                             "should be saved in the output CSV file")
    parser.add_argument('-i', '--intersect_nodegraph', action="store_true",
                        help="Option to only insert query k-mers in bloom filter if they "
                             "appear anywhere in the training database. Note that the "
                             "Jaccard estimates will now be "
                             "J(query intersect union_i training_i, training_i) instead of "
                             "J(query, training_i), but will use significantly less space.")
    parser.add_argument('in_file',
                        help="Input file: FASTQ/A file (can be gzipped).")
    parser.add_argument('training_data',
                        help="Training/reference data (HDF5 file created by MakeTrainingDatabase.py)")
    parser.add_argument('out_csv', help='Output CSV file')

    # Parse and check args
    args = parser.parse_args()
    base_name = args.base_name
    training_data = os.path.abspath(args.training_data)
    if not os.path.exists(training_data):
        raise Exception("Training/reference file %s does not exist." % training_data)

    # Let's get the k-mer sizes in the training database
    ksizes = set()

    # Import all the training data
    sketches = MH.import_multiple_from_single_hdf5(training_data)

    # Check for issues with the sketches (could also check that all the k-mers make sense,
    # i.e. no '' or non-ACTG characters)
    if sketches[0]._kmers is None:
        raise Exception(
            "For some reason, the k-mers were not saved when the database was created. "
            "Try running MakeDNADatabase.py again.")
    num_hashes = len(sketches[0]._kmers)
    for i in range(len(sketches)):
        sketch = sketches[i]
        if sketch._kmers is None:
            raise Exception(
                "For some reason, the k-mers were not saved when the database was created. "
                "Try running MakeDNADatabase.py again.")
        if len(sketch._kmers) != num_hashes:
            raise Exception("Unequal number of hashes for sketch of %s"
                            % sketch.input_file_name)
        ksizes.add(sketch.ksize)
        if len(ksizes) > 1:
            raise Exception(
                "Training/reference data uses different k-mer sizes. Culprit was %s."
                % sketch.input_file_name)

    # Get the appropriate k-mer size
    ksize = ksizes.pop()

    # Get number of threads to use
    num_threads = args.threads

    # Check and parse the query file
    query_file = os.path.abspath(args.in_file)
    if not os.path.exists(query_file):
        raise Exception("Query file %s does not exist." % query_file)

    # Node graph is stored in the output folder with name <InputFASTQ/A>.NodeGraph.K<k_size>
    if args.node_graph is None:  # If no node graph is specified, create one
        node_graph_out = os.path.join(
            os.path.dirname(os.path.abspath(args.out_csv)),
            os.path.basename(query_file) + ".NodeGraph.K" + str(ksize))
        if not os.path.exists(node_graph_out):  # Don't complain if the default location works
            print("Node graph not provided (via -ng). Creating one at: %s" % node_graph_out)
    elif os.path.exists(args.node_graph):  # If one is specified and it exists, use it
        node_graph_out = args.node_graph
    else:  # Otherwise, the specified one doesn't exist
        raise Exception("Provided NodeGraph %s does not exist." % args.node_graph)

    # Import and check the intersect nodegraph
    if args.intersect_nodegraph is True:
        intersect_nodegraph_file = os.path.splitext(training_data)[0] + ".intersect.Nodegraph"
    else:
        intersect_nodegraph_file = None
    intersect_nodegraph = None
    if intersect_nodegraph_file is not None:
        if not os.path.exists(intersect_nodegraph_file):
            raise Exception(
                "Intersection nodegraph does not exist. "
                "Please re-run MakeDNADatabase.py with the -i flag.")
        try:
            intersect_nodegraph = khmer.load_nodegraph(intersect_nodegraph_file)
            if intersect_nodegraph.ksize() != ksize:
                raise Exception(
                    "Given intersect nodegraph %s has K-mer size %d while the database "
                    "K-mer size is %d"
                    % (intersect_nodegraph_file, intersect_nodegraph.ksize(), ksize))
        except:
            raise Exception("Could not load given intersect nodegraph %s"
                            % intersect_nodegraph_file)

    results_file = os.path.abspath(args.out_csv)
    force = args.force
    fprate = args.fp_rate
    coverage_threshold = args.containment_threshold  # desired coverage cutoff
    confidence = args.confidence  # desired confidence that you got all the organisms with coverage >= desired coverage

    # Get names of training files for use as rows in returned tabular data
    training_file_names = []
    for i in range(len(sketches)):
        training_file_names.append(sketches[i].input_file_name)

    # Only form the Nodegraph if we need to
    global sample_kmers
    if not os.path.exists(node_graph_out) or force is True:
        hll = khmer.HLLCounter(0.01, ksize)
        hll.consume_seqfile(query_file)
        full_kmer_count_estimate = hll.estimate_cardinality()
        res = optimal_size(full_kmer_count_estimate, fp_rate=fprate)
        if intersect_nodegraph is None:
            # If no intersect list was given, just populate the bloom filter
            sample_kmers = khmer.Nodegraph(ksize, res.htable_size, res.num_htables)
            #sample_kmers.consume_seqfile(query_file)
            rparser = khmer.ReadParser(query_file)
            threads = []
            for _ in range(num_threads):
                cur_thrd = threading.Thread(
                    target=sample_kmers.consume_seqfile_with_reads_parser,
                    args=(rparser, ))
                threads.append(cur_thrd)
                cur_thrd.start()
            for thread in threads:
                thread.join()
        else:
            # Otherwise, only put a k-mer in the bloom filter if it's in the intersect list.
            # (WARNING: this will cause the Jaccard index to be calculated in terms of
            # J(query intersect hash_list, training) instead of J(query, training))
            # (TODO: fix this after khmer is updated)
            #intersect_nodegraph_kmer_count = intersect_nodegraph.n_unique_kmers()  # Doesn't work due to khmer bug
            intersect_nodegraph_kmer_count = intersect_nodegraph.n_occupied()  # Not technically correct, but I need to wait until khmer is updated
            if intersect_nodegraph_kmer_count < full_kmer_count_estimate:
                # At most, we have as many k-mers as in the union of the training database
                # (but this makes it always return 0)
                res = optimal_size(intersect_nodegraph_kmer_count, fp_rate=fprate)
                sample_kmers = khmer.Nodegraph(ksize, res.htable_size, res.num_htables)
            else:
                sample_kmers = khmer.Nodegraph(ksize, res.htable_size, res.num_htables)
            for record in screed.open(query_file):
                seq = record.sequence
                for i in range(len(seq) - ksize + 1):
                    kmer = seq[i:i + ksize]
                    if intersect_nodegraph.get(kmer) > 0:
                        sample_kmers.add(kmer)
        # Save the sample_kmers
        sample_kmers.save(node_graph_out)
        true_fprate = khmer.calc_expected_collisions(sample_kmers, max_false_pos=0.99)
    else:
        sample_kmers = khmer.load_nodegraph(node_graph_out)
        node_ksize = sample_kmers.ksize()
        if node_ksize != ksize:
            raise Exception(
                "Node graph %s has wrong k-mer size of %d (input was %d). "
                "Try --force or change -k." % (node_graph_out, node_ksize, ksize))
        true_fprate = khmer.calc_expected_collisions(sample_kmers, max_false_pos=0.99)

    #num_sample_kmers = sample_kmers.n_unique_kmers()  # For some reason this only works when creating a new node graph, use the following instead
    num_sample_kmers = sample_kmers.n_occupied()

    # Compute all the indices for all the training data
    pool = Pool(processes=num_threads)
    res = pool.map(
        unwrap_compute_indicies,
        zip(sketches, repeat(num_sample_kmers), repeat(true_fprate)))

    # Gather up the results in a nice form
    intersection_cardinalities = np.zeros(len(sketches))
    containment_indexes = np.zeros(len(sketches))
    jaccard_indexes = np.zeros(len(sketches))
    for i in range(len(res)):
        (intersection_cardinality, containment_index, jaccard_index) = res[i]
        intersection_cardinalities[i] = intersection_cardinality
        containment_indexes[i] = containment_index
        jaccard_indexes[i] = jaccard_index

    d = {'intersection': intersection_cardinalities,
         'containment index': containment_indexes,
         'jaccard index': jaccard_indexes}

    # Use only the basenames to label the rows (if requested)
    if base_name is True:
        df = pd.DataFrame(d, map(os.path.basename, training_file_names))
    else:
        df = pd.DataFrame(d, training_file_names)

    # Only keep the rows above a certain threshold
    if coverage_threshold <= 0:
        est_threshold = 0
    else:
        est_threshold = threshold_calc(num_hashes, coverage_threshold, fprate, confidence)
    filtered_results = df[df['containment index'] > est_threshold].sort_values(
        'containment index', ascending=False)

    # Export the results
    filtered_results.to_csv(results_file, index=True, encoding='utf-8')
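
# The argparse options above use a `restricted_float` type and the `optimal_size` /
# `threshold_calc` / `unwrap_compute_indicies` helpers, none of which are defined in this
# excerpt. As a hedged illustration only, a minimal sketch of what such a `restricted_float`
# validator might look like (the actual helper in the original script may differ):
import argparse

def restricted_float(value):
    """Argparse `type=` helper: parse a float and require it to lie in [0, 1]."""
    try:
        value = float(value)
    except ValueError:
        raise argparse.ArgumentTypeError('%r is not a floating-point literal' % value)
    if value < 0.0 or value > 1.0:
        raise argparse.ArgumentTypeError('%r is not in the range [0.0, 1.0]' % value)
    return value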
def main():
    info('load-graph.py', ['graph'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='hashbits')

    base = args.output_filename
    filenames = args.input_filenames

    for _ in args.input_filenames:
        check_file_status(_, args.force)

    check_space(args.input_filenames, args.force)
    check_space_for_hashtable(
        (float(args.n_tables * args.min_tablesize) / 8.), args.force)

    print >>sys.stderr, 'Saving k-mer presence table to %s' % base
    print >>sys.stderr, 'Loading kmers from sequences in %s' % repr(filenames)

    if args.no_build_tagset:
        print >>sys.stderr, 'We WILL NOT build the tagset.'
    else:
        print >>sys.stderr, 'We WILL build the tagset', \
                            ' (for partitioning/traversal).'

    print >>sys.stderr, 'making k-mer presence table'
    htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables)

    if args.no_build_tagset:
        target_method = htable.consume_fasta_with_reads_parser
    else:
        target_method = htable.consume_fasta_and_tag_with_reads_parser

    for _, filename in enumerate(filenames):
        rparser = khmer.ReadParser(filename)
        threads = []
        print >>sys.stderr, 'consuming input', filename
        for num in xrange(args.threads):
            cur_thread = threading.Thread(target=target_method, args=(rparser, ))
            threads.append(cur_thread)
            cur_thread.start()

        for thread in threads:
            thread.join()

    if args.report_total_kmers:
        print >>sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    print >>sys.stderr, 'saving k-mer presence table in', base + '.pt'
    htable.save(base + '.pt')

    if not args.no_build_tagset:
        print >>sys.stderr, 'saving tagset in', base + '.tagset'
        htable.save_tagset(base + '.tagset')

    info_fp = open(base + '.info', 'w')
    info_fp.write('%d unique k-mers' % htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable)
    print >>sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate
    if args.write_fp_rate:
        print >>info_fp, \
            '\nfalse positive rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.15:  # 0.18 is ACTUAL MAX. Do not change.
        print >>sys.stderr, "**"
        print >>sys.stderr, ("** ERROR: the graph structure is too small for "
                             "this data set. Increase table size/# tables.")
        print >>sys.stderr, "**"
        if not args.force:
            sys.exit(1)

    print >>sys.stderr, 'wrote to', base + '.info and', base + '.pt'
    if not args.no_build_tagset:
        print >>sys.stderr, 'and ' + base + '.tagset'
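
# For context: the script above uses the legacy khmer 1.x "hashbits" API (predating the
# Nodegraph naming). As an illustration only, a minimal single-threaded sketch of the same
# core operation, assuming khmer 1.x and a hypothetical input file 'reads.fa':
import khmer

ksize, min_tablesize, n_tables = 20, int(1e8), 4
htable = khmer.new_hashbits(ksize, min_tablesize, n_tables)
htable.consume_fasta('reads.fa')   # load all k-mers as presence/absence
htable.save('reads.pt')            # same '.pt' presence-table format the script writes
print('unique k-mers:', htable.n_unique_kmers())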
def main():
    parser = build_construct_args(
        "Output k-mer abundance distribution (single file version).")
    add_threading_args(parser)

    parser.add_argument('datafile')
    parser.add_argument('histout')

    parser.add_argument('-z', '--no-zero', dest='output_zero', default=True,
                        action='store_false',
                        help='Do not output 0-count bins')
    parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')
    parser.add_argument('-s', '--squash', dest='squash_output', default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    parser.add_argument('--savehash', dest='savehash', default='')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    n_threads = int(args.n_threads)

    datafile = args.datafile
    histout = args.histout

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)
    ht.set_use_bigcount(args.bigcount)

    print 'building tracking ht'
    K = ht.ksize()
    sizes = ht.hashsizes()
    tracking = khmer._new_hashbits(K, sizes)

    print 'K:', K
    print 'HT sizes:', sizes
    print 'outputting to', histout

    config = khmer.get_config()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    # start loading
    rparser = khmer.ReadParser(datafile, n_threads)
    threads = []
    print 'consuming input, round 1 --', datafile
    for tnum in xrange(n_threads):
        t = \
            threading.Thread(
                target=ht.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    z_list = []

    def do_abundance_dist(r):
        z = ht.abundance_distribution_with_reads_parser(r, tracking)
        z_list.append(z)

    print 'preparing hist from %s...' % datafile
    rparser = khmer.ReadParser(datafile, n_threads)
    threads = []
    print 'consuming input, round 2 --', datafile
    for tnum in xrange(n_threads):
        t = \
            threading.Thread(
                target=do_abundance_dist,
                args=(rparser,)
            )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    assert len(z_list) == n_threads, len(z_list)

    z = {}
    for zz in z_list:
        for i, count in enumerate(zz):
            z[i] = z.get(i, 0) + count

    total = sum(z.values())

    if 0 == total:
        print >>sys.stderr, \
            "ERROR: abundance distribution is uniformly zero; " \
            "nothing to report."
        print >>sys.stderr, "\tPlease verify that the input files are valid."
        sys.exit(-1)

    fp = open(histout, 'w')

    sofar = 0
    for n, i in sorted(z.items()):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        print >>fp, n, i, sofar, round(frac, 3)

        if sofar == total:
            break

    if args.savehash:
        print 'Saving hashfile', args.savehash
        print '...saving to', args.savehash
        ht.save(args.savehash)
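
# Each line of the histogram written above has four whitespace-separated columns:
# abundance, number of distinct k-mers at that abundance, cumulative count, and
# cumulative fraction. A small sketch of reading it back (the file name is hypothetical):
rows = []
with open('reads.fa.hist') as fh:
    for line in fh:
        abundance, count, cumulative, fraction = line.split()
        rows.append((int(abundance), int(count), int(cumulative), float(fraction)))

# The final row's cumulative fraction should be (close to) 1.0, i.e. all k-mers accounted for.
print('abundance bins:', len(rows), 'final cumulative fraction:', rows[-1][3])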
def main():
    args = sanitize_help(get_parser()).parse_args()
    configure_logging(args.quiet)
    report_on_config(args)

    base = args.output_countgraph_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_input_files(name, args.force)

    tablesize = calculate_graphsize(args, 'countgraph')
    check_space_for_graph(args.output_countgraph_filename, tablesize, args.force)

    info_filename = base + ".info"
    check_file_writable(base)
    check_file_writable(info_filename)

    log_info('Saving k-mer countgraph to {base}', base=base)
    log_info('Loading kmers from sequences in {filenames}', filenames=repr(filenames))

    # clobber the '.info' file now, as we always open in append mode below
    with open(info_filename, 'w') as info_fp:
        print('khmer version:', khmer.__version__, file=info_fp)

    log_info('making countgraph')
    countgraph = khmer_args.create_countgraph(args)
    countgraph.set_use_bigcount(args.bigcount)

    filename = None

    total_num_reads = 0

    for index, filename in enumerate(filenames):
        rparser = khmer.ReadParser(filename)
        threads = []
        log_info('consuming input {input}', input=filename)
        for _ in range(args.threads):
            cur_thrd = \
                threading.Thread(
                    target=countgraph.consume_seqfile_with_reads_parser,
                    args=(rparser, )
                )
            threads.append(cur_thrd)
            cur_thrd.start()

        for thread in threads:
            thread.join()

        if index > 0 and index % 10 == 0:
            tablesize = calculate_graphsize(args, 'countgraph')
            check_space_for_graph(base, tablesize, args.force)
            log_info('mid-save {base}', base=base)

            countgraph.save(base)
        with open(info_filename, 'a') as info_fh:
            print('through', filename, file=info_fh)
        total_num_reads += rparser.num_reads

    n_kmers = countgraph.n_unique_kmers()
    log_info('Total number of unique k-mers: {nk}', nk=n_kmers)
    with open(info_filename, 'a') as info_fp:
        print('Total number of unique k-mers:', n_kmers, file=info_fp)

    log_info('saving {base}', base=base)
    countgraph.save(base)

    # Change max_false_pos=0.2 only if you really grok it. HINT: You don't
    fp_rate = \
        khmer.calc_expected_collisions(
            countgraph, args.force, max_false_pos=.2)

    with open(info_filename, 'a') as info_fp:
        print('fp rate estimated to be %1.3f\n' % fp_rate, file=info_fp)

    if args.summary_info:
        mr_fmt = args.summary_info.lower()
        mr_file = base + '.info.' + mr_fmt
        log_info("Writing summary info to {mr_file}", mr_file=mr_file)
        with open(mr_file, 'w') as mr_fh:
            if mr_fmt == 'json':
                mr_data = {
                    "ht_name": os.path.basename(base),
                    "fpr": fp_rate,
                    "num_kmers": n_kmers,
                    "files": filenames,
                    "mrinfo_version": "0.2.0",
                    "num_reads": total_num_reads,
                }
                json.dump(mr_data, mr_fh)
                mr_fh.write('\n')
            elif mr_fmt == 'tsv':
                mr_fh.write("ht_name\tfpr\tnum_kmers\tnum_reads\tfiles\n")
                vals = [
                    os.path.basename(base),
                    "{:1.3f}".format(fp_rate),
                    str(n_kmers),
                    str(total_num_reads),
                    ";".join(filenames),
                ]
                mr_fh.write("\t".join(vals) + "\n")

    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    log_info('DONE.')
    log_info('wrote to: {filename}', filename=info_filename)
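
# When a JSON summary is requested via the script's summary-info option, the
# machine-readable file written above can be read back like this. The countgraph
# base name 'reads.ct' is hypothetical; the keys are the ones the script writes
# (ht_name, fpr, num_kmers, num_reads, files, mrinfo_version):
import json

with open('reads.ct.info.json') as fh:
    summary = json.load(fh)

print('%(ht_name)s: %(num_kmers)d unique k-mers from %(num_reads)d reads '
      '(estimated fp rate %(fpr).3f)' % summary)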
def main():
    parser = build_nodegraph_args("find uniq kmer in query compared to refs")
    parser.add_argument('query',
                        help=('fasta readfile to query against '
                              'hashtable, use "-" if from stdin'))
    parser.add_argument('ref', nargs='+',
                        help='fasta sequence file to be loaded in hashtable')
    parser.add_argument('--x2', default='1e8',
                        help='max_table size for readfile2')
    parser.add_argument('--N2', default='4',
                        help='# of table (N) for readfile2')

    args = parser.parse_args()
    #print(args, file=sys.stderr)

    K = args.ksize
    HT_SIZE = args.max_tablesize
    N_HT = args.n_tables
    HT_SIZE2 = int(float(args.x2))
    N_HT2 = int(args.N2)

    # positional
    query = args.query
    refs = args.ref
    print('{} refs to be loaded'.format(len(refs)), file=sys.stderr)
    if query == '-' and refs == ['-']:
        print('*** query and ref can not both be "-" (read from stdin)',
              file=sys.stderr)
        sys.exit(1)  # abort: both inputs cannot come from stdin

    # create a hashbits data structure
    start_time = time.time()
    ht = khmer.Nodetable(K, HT_SIZE, N_HT)
    end_time = time.time()
    secs = end_time - start_time
    mes = 'initiation of bloom filter took {:.2f} hours..'
    print(mes.format(secs / 3600.0), file=sys.stderr)

    for index, filename in enumerate(refs):
        if index != 0 and index % 100 == 0:
            end_time = time.time()
            secs = end_time - start_time
            mes = '{} refs have been loaded within {:.2f} hours ..'
            print(mes.format(index, secs / 3600.0), file=sys.stderr)
        try:
            ht.consume_seqfile(filename)
        except OSError as e:
            mes = ('*** Skipping due to OSError (machine or system problem):'
                   ' {}\n'
                   '*** Detailed error message:\n'
                   '*** {}')
            print(mes.format(os.path.basename(filename), str(e)),
                  file=sys.stderr)
            continue

    # Change 0.2 only if you really grok it. HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    mes = 'fp rate estimated to be {:1.3f}'
    print(mes.format(fp_rate), file=sys.stderr)

    if fp_rate > 0.01:
        mes = ('**\n'
               '** ERROR: the counting hash is too small for\n'
               '** refs. Increase hashsize/num ht.\n'
               '**\n'
               '** Do not use these results!!')
        print(mes, file=sys.stderr)  # report before exiting
        sys.exit(-1)

    n_unique1 = ht.n_unique_kmers()

    pair = 0
    forward = 0
    reverse = 0
    other = 0
    total_pair = 0
    for n, is_pair, r1, r2 in broken_paired_reader(
            khmer.ReadParser(query, require_paired=True)):
        #for n, record in enumerate(screed.open(query)):
        total_pair += 1
        share_list = []
        for record in [r1, r2]:
            name, desc = record.name.split(None, 1)
            sequence = record.sequence.replace('N', 'A')
            seq_len = len(sequence)
            if seq_len < K:
                print('*** {} is shorter than {}..'.format(record.name, K),
                      file=sys.stderr)
                continue
            for i in range(0, seq_len + 1 - K):
                kmer = sequence[i:i + K]
                if ht.get(kmer):
                    share_list.append(1)
                    break
            else:
                share_list.append(0)

        if share_list == [1, 1]:
            pair += 1
        elif share_list == [1, 0]:
            forward += 1
        elif share_list == [0, 1]:
            reverse += 1
        else:  # [0, 0]
            other += 1
            # do not print
            continue

        mes = ('>{} {}||uniq_{}\n{}\n'
               '>{} {}||uniq_{}\n{}')
        l1 = r1.name.split(None, 1)
        l2 = r2.name.split(None, 1)
        print(mes.format(l1[0], l1[1], share_list[0], r1.sequence,
                         l2[0], l2[1], share_list[1], r2.sequence))

    mes = ('Unique kmer in ref:\t{}\n'
           'Total pair:\t{}\n'
           'Both primers uniq:\t{}\n'
           'Pair with forward uniq:\t{}\n'
           'Pair with reverse uniq:\t{}')
    print(mes.format(n_unique1, total_pair, pair, forward, reverse),
          file=sys.stderr)
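
# The per-read classification above boils down to "does this read share at least one
# k-mer with the reference Nodetable?". A minimal standalone sketch of that check,
# assuming a Nodetable `ht` already loaded with reference k-mers of size K (the helper
# name is hypothetical, not part of the script above):
def shares_ref_kmer(sequence, ht, K):
    """Return True if any K-mer of `sequence` is present in the nodetable `ht`."""
    sequence = sequence.replace('N', 'A')   # same N-masking as the script above
    if len(sequence) < K:
        return False
    return any(ht.get(sequence[i:i + K]) for i in range(len(sequence) - K + 1))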