def main(): info("filter-abund-single.py", ["counting"]) args = get_parser().parse_args() check_file_status(args.datafile) check_space([args.datafile]) if args.savetable: check_space_for_hashtable(args.n_tables * args.min_tablesize) report_on_config(args) config = khmer.get_config() config.set_reads_input_buffer_size(args.threads * 64 * 1024) print "making k-mer counting table" htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables, args.threads) # first, load reads into hash table rparser = khmer.ReadParser(args.datafile, args.threads) threads = [] print "consuming input, round 1 --", args.datafile for _ in xrange(args.threads): cur_thread = threading.Thread(target=htable.consume_fasta_with_reads_parser, args=(rparser,)) threads.append(cur_thread) cur_thread.start() for _ in threads: _.join() fp_rate = khmer.calc_expected_collisions(htable) print "fp rate estimated to be %1.3f" % fp_rate # now, trim. # the filtering function. def process_fn(record): name = record["name"] seq = record["sequence"] if "N" in seq: return None, None trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff) if trim_at >= args.ksize: return name, trim_seq return None, None # the filtering loop print "filtering", args.datafile outfile = os.path.basename(args.datafile) + ".abundfilt" outfp = open(outfile, "w") tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(args.datafile), outfp) print "output in", outfile if args.savetable: print "Saving k-mer counting table filename", args.savetable print "...saving to", args.savetable htable.save(args.savetable)
def main():
    info('filter-abund.py', ['counting'])
    args = sanitize_help(get_parser()).parse_args()

    check_input_files(args.input_graph, args.force)
    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
            args.single_output_file:
        print("Accepting input from stdin; output filename must "
              "be provided with -o.", file=sys.stderr)
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    print('loading countgraph:', args.input_graph, file=sys.stderr)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    print("K:", ksize, file=sys.stderr)

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = countgraph.get_median_count(seqN)
            if med < args.normalize_to:
                return name, seq

        _, trim_at = countgraph.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        print('filtering', infile, file=sys.stderr)
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads)
        tsp.start(verbose_loader(infile), outfp)

        print('output in', outfile, file=sys.stderr)
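# A khmer-free sketch of the process_fn contract used throughout these
# scripts: ThreadedSequenceProcessor expects a callable that returns
# (name, sequence) to keep a read and (None, None) to drop it. The toy
# function below only emulates trim_on_abundance over a plain dict of
# k-mer counts; khmer's actual semantics may differ in edge cases.

def demo_trim_on_abundance(seq, counts, ksize, cutoff):
    # length of the prefix whose k-mers all have count >= cutoff
    for start in range(len(seq) - ksize + 1):
        if counts.get(seq[start:start + ksize], 0) < cutoff:
            trim_at = start + ksize - 1
            return seq[:trim_at], trim_at
    return seq, len(seq)

if __name__ == '__main__':
    counts = {'ACG': 5, 'CGT': 5, 'GTA': 1}
    trimmed, trim_at = demo_trim_on_abundance('ACGTA', counts, ksize=3,
                                              cutoff=2)
    # the low-abundance k-mer 'GTA' is trimmed off; trim_at >= ksize, so
    # the read survives as its trimmed prefix
    assert (trimmed, trim_at) == ('ACGT', 4)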
def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--cutoff', '-C', dest='coverage',
                        default=DEFAULT_COVERAGE, type=int,
                        help="Diginorm coverage.")
    parser.add_argument('--max-error-region', '-M', dest='max_error_region',
                        default=DEFAULT_MAX_ERROR_REGION, type=int,
                        help="Max length of error region allowed")
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print 'file with ht: %s' % counting_ht
    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()
    C = args.coverage
    max_error_region = args.max_error_region

    print "K:", K
    print "C:", C
    print "max error region:", max_error_region

    # the filtering function.
    def process_fn(record):
        # read_aligner is probably not threadsafe?
        aligner = khmer.new_readaligner(ht, 1, C, max_error_region)
        name = record['name']
        seq = record['sequence']
        seq = seq.replace('N', 'A')

        grXreAlign, reXgrAlign = aligner.align(seq)

        if len(reXgrAlign) > 0:
            graph_seq = grXreAlign.replace('-', '')
            seq = graph_seq

        return name, seq

    # the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.basename(infile) + '.corr'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
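# Toy illustration of the correction step above (hypothetical data, no
# khmer needed): aligner.align() returns graph-side and read-side
# alignment strings; when an alignment was found, stripping the '-' gap
# characters from the graph-side string yields the graph's consensus
# version of the read, which replaces the possibly erroneous original.
grXreAlign = 'ACGT-ACCT'        # made-up graph-side alignment with a gap
reXgrAlign = 'ACGTTAC-T'        # made-up read-side alignment
if len(reXgrAlign) > 0:
    seq = grXreAlign.replace('-', '')
assert seq == 'ACGTACCT'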
def main(): parser = build_counting_multifile_args() parser.add_argument( "--cutoff", "-C", dest="coverage", default=DEFAULT_COVERAGE, type=int, help="Diginorm coverage." ) parser.add_argument( "--max-error-region", "-M", dest="max_error_region", default=DEFAULT_MAX_ERROR_REGION, type=int, help="Max length of error region allowed", ) args = parser.parse_args() counting_ht = args.input_table infiles = args.input_filenames print "file with ht: %s" % counting_ht print "loading hashtable" ht = khmer.load_counting_hash(counting_ht) K = ht.ksize() C = args.coverage max_error_region = args.max_error_region print "K:", K print "C:", C print "max error region:", max_error_region # the filtering function. def process_fn(record): # read_aligner is probably not threadsafe? aligner = khmer.new_readaligner(ht, 1, C, max_error_region) name = record["name"] seq = record["sequence"] seq = seq.replace("N", "A") grXreAlign, reXgrAlign = aligner.align(seq) if len(reXgrAlign) > 0: graph_seq = grXreAlign.replace("-", "") seq = graph_seq return name, seq # the filtering loop for infile in infiles: print "filtering", infile outfile = os.path.basename(infile) + ".corr" outfp = open(outfile, "w") tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(infile), outfp) print "output in", outfile
def main(): parser = build_counting_multifile_args() parser.add_argument( "--cutoff", "-C", dest="cutoff", default=DEFAULT_CUTOFF, type=int, help="Trim at k-mers below this abundance." ) parser.add_argument("-V", "--variable-coverage", action="store_true", dest="variable_coverage", default=False) parser.add_argument( "--normalize-to", "-Z", type=int, dest="normalize_to", help="base variable-coverage cutoff on this median k-mer abundance", default=DEFAULT_NORMALIZE_LIMIT, ) args = parser.parse_args() counting_ht = args.input_table infiles = args.input_filenames print "file with ht: %s" % counting_ht print "loading hashtable" ht = khmer.load_counting_hash(counting_ht) K = ht.ksize() print "K:", K ### the filtering function. def process_fn(record): name = record["name"] seq = record["sequence"] if "N" in seq: return None, None if args.variable_coverage: # only trim when sequence has high enough C med, _, _ = ht.get_median_count(seq) if med < args.normalize_to: return name, seq trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff) if trim_at >= K: return name, trim_seq return None, None ### the filtering loop for infile in infiles: print "filtering", infile outfile = os.path.basename(infile) + ".abundfilt" outfp = open(outfile, "w") tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(infile), outfp) print "output in", outfile
def main():
    info('filter-abund.py', ['counting'])
    args = get_parser().parse_args()

    check_input_files(args.input_table, args.force)
    infiles = args.input_filename
    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    print('loading counting table:', args.input_table, file=sys.stderr)
    htable = khmer.load_counting_hash(args.input_table)
    ksize = htable.ksize()

    print("K:", ksize, file=sys.stderr)

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = htable.get_median_count(seqN)
            if med < args.normalize_to:
                return name, seq

        _, trim_at = htable.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    # the filtering loop
    for infile in infiles:
        print('filtering', infile, file=sys.stderr)
        if args.single_output_filename != '':
            outfile = args.single_output_filename
            outfp = open(outfile, 'a')
        else:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads)
        tsp.start(verbose_loader(infile), outfp)

        print('output in', outfile, file=sys.stderr)
def main():
    info('filter-abund.py', ['counting'])
    args = get_parser().parse_args()

    counting_ht = args.input_table
    infiles = args.input_filename

    for _ in infiles:
        check_file_status(_)

    check_space(infiles)

    print 'loading hashtable'
    htable = khmer.load_counting_hash(counting_ht)
    ksize = htable.ksize()

    print "K:", ksize

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = htable.get_median_count(seq)
            if med < args.normalize_to:
                return name, seq

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)

        if trim_at >= ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print 'filtering', infile
        if args.single_output_filename != '':
            outfile = args.single_output_filename
            outfp = open(outfile, 'a')
        else:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
def main():
    info('filter-abund.py', ['counting'])
    args = get_parser().parse_args()

    counting_ht = args.input_table
    infiles = args.input_filename

    for _ in infiles:
        check_file_status(_, args.force)

    check_space(infiles, args.force)

    print >>sys.stderr, 'loading hashtable'
    htable = khmer.load_counting_hash(counting_ht)
    ksize = htable.ksize()

    print >>sys.stderr, "K:", ksize

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = htable.get_median_count(seq)
            if med < args.normalize_to:
                return name, seq

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)

        if trim_at >= ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print >>sys.stderr, 'filtering', infile
        if args.single_output_filename != '':
            outfile = args.single_output_filename
            outfp = open(outfile, 'a')
        else:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads)
        tsp.start(verbose_loader(infile), outfp)

        print >>sys.stderr, 'output in', outfile
def main(): parser = build_counting_multifile_args() parser.add_argument('--cutoff', '-C', dest='cutoff', default=DEFAULT_CUTOFF, type=int, help="Trim at k-mers below this abundance.") parser.add_argument('-o', '--outputpath', dest='outputpath', default='.') args = parser.parse_args() counting_ht = args.input_table infiles = args.input_filenames outpath = args.outputpath print 'file with ht: %s' % counting_ht print 'loading hashtable' ht = khmer.load_counting_hash(counting_ht) K = ht.ksize() print "K:", K ### the filtering function. def process_fn(record): name = record['name'] seq = record['sequence'] if 'N' in seq: return None, None trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff) if trim_at >= K: return name, trim_seq return None, None ### the filtering loop for infile in infiles: print 'filtering', infile outfile = outpath + '/' + os.path.basename(infile) + '.abundfilt' outfp = open(outfile, 'w') tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(infile), outfp) print 'output in', outfile
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-k', default=DEFAULT_K, type=int, help='k-mer size',
                        dest='ksize')
    parser.add_argument('stoptags_file')
    parser.add_argument('input_filenames', nargs='+')
    args = parser.parse_args()

    K = args.ksize
    stoptags = args.stoptags_file
    infiles = args.input_filenames

    print 'loading stop tags, with K', K
    ht = khmer.new_hashbits(K, 1, 1)
    ht.load_stop_tags(stoptags)

    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_stoptags(seq)

        if trim_at >= K:
            return name, trim_seq

        return None, None

    ### the filtering loop
    for infile in infiles:
        print 'filtering', infile
        # use os.path.join: with dirname(infile) == '' the old
        # string concatenation wrote to '/<name>.stopfilt' in the root
        outfile = os.path.join(os.path.dirname(infile),
                               os.path.basename(infile) + '.stopfilt')
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
def main(): parser = build_counting_multifile_args() parser.add_argument('--cutoff', '-C', dest='cutoff', default=DEFAULT_CUTOFF, type=int, help="Trim at reads above this median abundance.") args = parser.parse_args() counting_ht = args.input_table infiles = args.input_filenames print 'file with ht: %s' % counting_ht print 'loading hashtable' ht = khmer.load_counting_hash(counting_ht) K = ht.ksize() print "K:", K ### the filtering function. def process_fn(record): name = record['name'] seq = record['sequence'] med, _, _ = ht.get_median_count(seq) if med >= args.cutoff: return name, seq return None, None ### the filtering loop for infile in infiles: print 'filtering', infile outfile = os.path.basename(infile) + '.himed' outfp = open(outfile, 'w') tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(infile), outfp) print 'output in', outfile
def main():
    parser = build_counting_args()
    parser.add_argument('--coverage', '-C', dest='coverage',
                        default=DEFAULT_COVERAGE, type=int)
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print('file with ht: %s' % counting_ht)
    print('loading hashtable')
    ht = khmer.load_countgraph(counting_ht)
    K = ht.ksize()

    print("K:", K)

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']

        med, avg, dev = ht.get_median_count(seq)

        if random.randint(1, med) > args.coverage:
            return None, None

        return name, seq

    # the filtering loop
    for infile in infiles:
        print('filtering', infile)
        outfile = os.path.basename(infile) + '.medfilt'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print('output in', outfile)
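# Standalone check of the subsampling rule in process_fn above:
# random.randint(1, med) > C drops the read, so a read with median
# k-mer coverage med is kept with probability min(C, med) / med, and
# deeply covered regions are downsampled toward coverage ~C, a
# randomized cousin of diginorm. (Toy code; med is assumed >= 1 here.)
import random

def keep_probability(med, coverage, trials=100000):
    kept = sum(1 for _ in range(trials)
               if random.randint(1, med) <= coverage)
    return kept / float(trials)

if __name__ == '__main__':
    print(keep_probability(med=100, coverage=20))   # ~0.2
    print(keep_probability(med=10, coverage=20))    # 1.0: med <= C, always kept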
def main(): info("filter-stoptags.py", ["graph"]) args = get_parser().parse_args() stoptags = args.stoptags_file infiles = args.input_filenames for _ in infiles: check_file_status(_) check_space(infiles) print "loading stop tags, with K", args.ksize htable = khmer.new_hashbits(args.ksize, 1, 1) htable.load_stop_tags(stoptags) def process_fn(record): name = record["name"] seq = record["sequence"] if "N" in seq: return None, None trim_seq, trim_at = htable.trim_on_stoptags(seq) if trim_at >= args.ksize: return name, trim_seq return None, None # the filtering loop for infile in infiles: print "filtering", infile outfile = os.path.basename(infile) + ".stopfilt" outfp = open(outfile, "w") tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(infile), outfp) print "output in", outfile
def main():
    info('filter-stoptags.py', ['graph'])
    args = get_parser().parse_args()
    stoptags = args.stoptags_file
    infiles = args.input_filenames

    for _ in infiles:
        check_input_files(_, args.force)

    check_space(infiles, args.force)

    print >>sys.stderr, 'loading stop tags, with K', args.ksize
    htable = khmer.new_hashbits(args.ksize, 1, 1)
    htable.load_stop_tags(stoptags)

    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_stoptags(seq)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print >>sys.stderr, 'filtering', infile
        outfile = os.path.basename(infile) + '.stopfilt'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print >>sys.stderr, 'output in', outfile
def main():
    parser = build_counting_args()
    parser.add_argument('--coverage', '-C', dest='coverage',
                        default=DEFAULT_COVERAGE, type=int)
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print 'file with ht: %s' % counting_ht
    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    print "K:", K

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']

        med, avg, dev = ht.get_median_count(seq)

        if random.randint(1, med) > args.coverage:
            return None, None

        return name, seq

    # the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.basename(infile) + '.medfilt'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
def main():
    args = sanitize_help(get_parser()).parse_args()
    stoptags = args.stoptags_file
    infiles = args.input_filenames

    for _ in infiles:
        check_input_files(_, args.force)

    check_space(infiles, args.force)

    print('loading stop tags, with K', args.ksize, file=sys.stderr)
    nodegraph = Nodegraph(args.ksize, 1, 1)
    nodegraph.load_stop_tags(stoptags)

    def process_fn(record):
        name = record.name
        seq = record.sequence
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = nodegraph.trim_on_stoptags(seq)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print('filtering', infile, file=sys.stderr)
        outfile = os.path.basename(infile) + '.stopfilt'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print('output in', outfile, file=sys.stderr)
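# Khmer-free sketch of the stop-tag rule in process_fn above: scan the
# read's k-mers left to right and cut just before the first one found in
# the stop-tag set; the read survives only when the kept prefix still
# spans a full k-mer (trim_at >= ksize). Names and data are illustrative,
# and khmer's exact trim position may differ.

def demo_trim_on_stoptags(seq, stoptags, ksize):
    for start in range(len(seq) - ksize + 1):
        if seq[start:start + ksize] in stoptags:
            trim_at = start + ksize - 1
            return seq[:trim_at], trim_at
    return seq, len(seq)

assert demo_trim_on_stoptags('ACGTACGT', {'TACG'}, 4) == ('ACGTAC', 6)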
def main(): parser = build_counting_multifile_args() parser.add_argument("--coverage", "-C", dest="coverage", default=DEFAULT_COVERAGE, type=int) args = parser.parse_args() counting_ht = args.input_table infiles = args.input_filenames print "file with ht: %s" % counting_ht print "loading hashtable" ht = khmer.load_counting_hash(counting_ht) K = ht.ksize() print "K:", K ### the filtering function. def process_fn(record): name = record["name"] seq = record["sequence"] med, avg, dev = ht.get_median_count(seq) if random.randint(1, med) > args.coverage: return None, None return name, seq ### the filtering loop for infile in infiles: print "filtering", infile outfile = os.path.basename(infile) + ".medfilt" outfp = open(outfile, "w") tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(infile), outfp) print "output in", outfile
def main():
    info('filter-abund-single.py', ['counting', 'SeqAn'])
    args = get_parser().parse_args()
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)
    if args.savetable:
        check_space_for_hashtable(
            args.n_tables * args.min_tablesize, args.force)
    report_on_config(args)

    print >>sys.stderr, 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables)

    # first, load reads into hash table
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    print >>sys.stderr, 'consuming input, round 1 --', args.datafile
    for _ in xrange(args.threads):
        cur_thread = \
            threading.Thread(
                target=htable.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable, args.force)
    print >>sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    print >>sys.stderr, 'filtering', args.datafile
    outfile = os.path.basename(args.datafile) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print >>sys.stderr, 'output in', outfile

    if args.savetable:
        print >>sys.stderr, 'Saving k-mer counting table filename', \
            args.savetable
        print >>sys.stderr, '...saving to', args.savetable
        htable.save(args.savetable)

    print >>sys.stderr, 'wrote to: ', outfile
def main(): parser = build_construct_args( "Filter k-mers at the given abundance (inmem version).") add_threading_args(parser) parser.add_argument('--cutoff', '-C', dest='cutoff', default=DEFAULT_CUTOFF, type=int, help="Trim at k-mers below this abundance.") parser.add_argument('--savehash', dest='savehash', default='') parser.add_argument('datafile') args = parser.parse_args() report_on_config(args) K = args.ksize HT_SIZE = args.min_hashsize N_HT = args.n_hashes n_threads = int(args.n_threads) config = khmer.get_config() bufsz = config.get_reads_input_buffer_size() config.set_reads_input_buffer_size(n_threads * 64 * 1024) print 'making hashtable' ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads) filename = args.datafile # first, load reads into hash table rparser = khmer.ReadParser(filename, n_threads) threads = [] print 'consuming input, round 1 --', filename for tnum in xrange(n_threads): t = \ threading.Thread( target=ht.consume_fasta_with_reads_parser, args=(rparser, ) ) threads.append(t) t.start() for t in threads: t.join() fp_rate = khmer.calc_expected_collisions(ht) print 'fp rate estimated to be %1.3f' % fp_rate # now, trim. # the filtering function. def process_fn(record): name = record['name'] seq = record['sequence'] if 'N' in seq: return None, None trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff) if trim_at >= K: return name, trim_seq return None, None # the filtering loop print 'filtering', filename outfile = os.path.basename(filename) + '.abundfilt' outfp = open(outfile, 'w') tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(filename), outfp) print 'output in', outfile if args.savehash: print 'Saving hashfile', args.savehash print '...saving to', args.savehash ht.save(args.savehash)
def main():
    info('filter-abund-single.py', ['counting', 'SeqAn'])
    args = get_parser().parse_args()
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize,
                                  args.force)
    report_on_config(args)

    print >> sys.stderr, 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables)

    # first, load reads into hash table
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    print >> sys.stderr, 'consuming input, round 1 --', args.datafile
    for _ in xrange(args.threads):
        cur_thread = \
            threading.Thread(
                target=htable.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable, args.force)
    print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        _, trim_at = htable.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= args.ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    # the filtering loop
    print >> sys.stderr, 'filtering', args.datafile
    outfile = os.path.basename(args.datafile) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print >> sys.stderr, 'output in', outfile

    if args.savetable:
        print >>sys.stderr, 'Saving k-mer counting table filename', \
            args.savetable
        print >> sys.stderr, '...saving to', args.savetable
        htable.save(args.savetable)

    print >> sys.stderr, 'wrote to: ', outfile
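# Quick illustration of the seqN trick used in process_fn above: k-mer
# lookups run on a copy with N replaced by A (so every k-mer is valid in
# the counting table), but the slice is taken from the *original* read so
# real 'N' characters survive in the output. Values are illustrative.
seq = 'ACGNACGT'
seqN = seq.replace('N', 'A')      # used only for abundance lookups
trim_at = 6                       # e.g. a value returned by trim_on_abundance
assert seq[:trim_at] == 'ACGNAC'  # the 'N' is preserved in the trimmed read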
def main():
    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('filter-abund-single.py', ['counting', 'SeqAn'])

    configure_logging(args.quiet)
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)

    if args.savegraph:
        tablesize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, tablesize, args.force)

    report_on_config(args)

    log_info('making countgraph')
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    log_info('consuming input, round 1 -- {datafile}', datafile=args.datafile)
    for _ in range(args.threads):
        cur_thread = \
            threading.Thread(
                target=graph.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    log_info('Total number of unique k-mers: {nk}', nk=graph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        _, trim_at = graph.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= args.ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    # the filtering loop
    log_info('filtering {datafile}', datafile=args.datafile)
    if args.outfile is None:
        outfile = os.path.basename(args.datafile) + '.abundfilt'
    else:
        outfile = args.outfile
    outfp = open(outfile, 'wb')
    outfp = get_file_writer(outfp, args.gzip, args.bzip)

    tsp = ThreadedSequenceProcessor(process_fn, verbose=not args.quiet)
    tsp.start(verbose_loader(args.datafile), outfp)

    log_info('output in {outfile}', outfile=outfile)

    if args.savegraph:
        log_info('Saving k-mer countgraph filename {graph}',
                 graph=args.savegraph)
        graph.save(args.savegraph)
def main():
    info('filter-abund-single.py', ['counting', 'SeqAn'])
    args = sanitize_help(get_parser()).parse_args()
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)

    if args.savegraph:
        tablesize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, tablesize, args.force)

    report_on_config(args)

    print('making countgraph', file=sys.stderr)
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    print('consuming input, round 1 --', args.datafile, file=sys.stderr)
    for _ in range(args.threads):
        cur_thread = \
            threading.Thread(
                target=graph.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    print('Total number of unique k-mers: {0}'.format(graph.n_unique_kmers()),
          file=sys.stderr)

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        _, trim_at = graph.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= args.ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    # the filtering loop
    print('filtering', args.datafile, file=sys.stderr)
    outfile = os.path.basename(args.datafile) + '.abundfilt'
    outfile = open(outfile, 'wb')
    outfp = get_file_writer(outfile, args.gzip, args.bzip)

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print('output in', outfile.name, file=sys.stderr)

    if args.savegraph:
        print('Saving k-mer countgraph filename', args.savegraph,
              file=sys.stderr)
        graph.save(args.savegraph)
def main():
    info('filter-abund-single.py', ['counting', 'SeqAn'])
    args = get_parser().parse_args()
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)
    if args.savetable:
        check_space_for_hashtable(args, 'countgraph', args.force)
    report_on_config(args)

    print('making countgraph', file=sys.stderr)
    htable = khmer_args.create_countgraph(args)

    # first, load reads into hash table
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    print('consuming input, round 1 --', args.datafile, file=sys.stderr)
    for _ in range(args.threads):
        cur_thread = \
            threading.Thread(
                target=htable.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    if args.report_total_kmers:
        print('Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers()), file=sys.stderr)

    fp_rate = khmer.calc_expected_collisions(htable, args.force)
    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        _, trim_at = htable.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= args.ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    # the filtering loop
    print('filtering', args.datafile, file=sys.stderr)
    outfile = os.path.basename(args.datafile) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print('output in', outfile, file=sys.stderr)

    if args.savetable:
        print('Saving k-mer counting table filename', args.savetable,
              file=sys.stderr)
        print('...saving to', args.savetable, file=sys.stderr)
        htable.save(args.savetable)

    print('wrote to: ', outfile, file=sys.stderr)
def main(): parser = build_counting_multifile_args() parser.add_argument('--cutoff', '-C', dest='cutoff', default=DEFAULT_CUTOFF, type=int, help="Trim at k-mers below this abundance.") parser.add_argument('-V', '--variable-coverage', action='store_true', dest='variable_coverage', default=False) parser.add_argument('--normalize-to', '-Z', type=int, dest='normalize_to', help='base variable-coverage cutoff on this median' ' k-mer abundance', default=DEFAULT_NORMALIZE_LIMIT) args = parser.parse_args() counting_ht = args.input_table infiles = args.input_filenames print 'file with ht: %s' % counting_ht print 'loading hashtable' ht = khmer.load_counting_hash(counting_ht) K = ht.ksize() print "K:", K # the filtering function. def process_fn(record): name = record['name'] seq = record['sequence'] if 'N' in seq: return None, None if args.variable_coverage: # only trim when sequence has high enough C med, _, _ = ht.get_median_count(seq) if med < args.normalize_to: return name, seq trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff) if trim_at >= K: return name, trim_seq return None, None # the filtering loop for infile in infiles: print 'filtering', infile outfile = os.path.basename(infile) + '.abundfilt' outfp = open(outfile, 'w') tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(infile), outfp) print 'output in', outfile
def main():
    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('filter-abund.py', ['counting'])

    configure_logging(args.quiet)

    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
            args.single_output_file:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    log_info('loading countgraph: {graph}', graph=args.input_graph)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    log_info("K: {ksize}", ksize=ksize)

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = countgraph.get_median_count(seqN)
            if med < args.normalize_to:
                return name, seq

        _, trim_at = countgraph.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip,
                                args.bzip)

    # the filtering loop
    for infile in infiles:
        log_info('filtering {infile}', infile=infile)
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads,
                                        verbose=not args.quiet)
        tsp.start(verbose_loader(infile), outfp)

        log_info('output in {outfile}', outfile=outfile)
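# Minimal sketch of the -V/--variable-coverage gate in process_fn above:
# reads whose median k-mer count is below --normalize-to are presumed to
# come from low-coverage regions and are emitted untrimmed; only reads
# that look "covered" are subject to abundance trimming. The helper name
# and numbers are illustrative, not part of the script.

def emit_untrimmed(median_count, normalize_to=20):
    # True -> pass the read through as-is; False -> apply trim_on_abundance
    return median_count < normalize_to

assert emit_untrimmed(5)          # low coverage: left alone
assert not emit_untrimmed(50)     # high coverage: eligible for trimming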
def main():
    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('filter-abund-single.py', ['counting', 'SeqAn'])

    configure_logging(args.quiet)
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)

    if args.savegraph:
        tablesize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, tablesize, args.force)

    report_on_config(args)

    log_info('making countgraph')
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    log_info('consuming input, round 1 -- {datafile}', datafile=args.datafile)
    for _ in range(args.threads):
        cur_thread = \
            threading.Thread(
                target=graph.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    log_info('Total number of unique k-mers: {nk}', nk=graph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = graph.get_median_count(seqN)
            if med < args.normalize_to:
                return name, seq

        _, trim_at = graph.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= args.ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    # the filtering loop
    log_info('filtering {datafile}', datafile=args.datafile)
    if args.outfile is None:
        outfile = os.path.basename(args.datafile) + '.abundfilt'
    else:
        outfile = args.outfile
    outfp = open(outfile, 'wb')
    outfp = get_file_writer(outfp, args.gzip, args.bzip)

    tsp = ThreadedSequenceProcessor(process_fn, verbose=not args.quiet)
    tsp.start(verbose_loader(args.datafile), outfp)

    log_info('output in {outfile}', outfile=outfile)

    if args.savegraph:
        log_info('Saving k-mer countgraph filename {graph}',
                 graph=args.savegraph)
        graph.save(args.savegraph)
def main(): parser = build_construct_args( "Filter k-mers at the given abundance (inmem version).") add_threading_args(parser) parser.add_argument('--cutoff', '-C', dest='cutoff', default=DEFAULT_CUTOFF, type=int, help="Trim at k-mers below this abundance.") parser.add_argument('--savehash', dest='savehash', default='') parser.add_argument('datafile') args = parser.parse_args() report_on_config(args) K = args.ksize HT_SIZE = args.min_hashsize N_HT = args.n_hashes n_threads = int(args.n_threads) config = khmer.get_config() bufsz = config.get_reads_input_buffer_size() config.set_reads_input_buffer_size(n_threads * 64 * 1024) print 'making hashtable' ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads) filename = args.datafile ### first, load reads into hash table rparser = khmer.ReadParser(filename, n_threads) threads = [] print 'consuming input, round 1 --', filename for tnum in xrange(n_threads): t = \ threading.Thread( target=ht.consume_fasta_with_reads_parser, args=(rparser, ) ) threads.append(t) t.start() for t in threads: t.join() fp_rate = khmer.calc_expected_collisions(ht) print 'fp rate estimated to be %1.3f' % fp_rate ### now, trim. ### the filtering function. def process_fn(record): name = record['name'] seq = record['sequence'] if 'N' in seq: return None, None trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff) if trim_at >= K: return name, trim_seq return None, None ### the filtering loop print 'filtering', filename outfile = os.path.basename(filename) + '.abundfilt' outfp = open(outfile, 'w') tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(filename), outfp) print 'output in', outfile if args.savehash: print 'Saving hashfile', args.savehash print '...saving to', args.savehash ht.save(args.savehash)
def main():
    info('filter-abund-single.py', ['counting'])
    args = get_parser().parse_args()
    check_file_status(args.datafile)
    check_space([args.datafile])
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize)
    report_on_config(args)

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.threads * 64 * 1024)

    print 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables, args.threads)

    # first, load reads into hash table
    rparser = khmer.ReadParser(args.datafile, args.threads)
    threads = []
    print 'consuming input, round 1 --', args.datafile
    for _ in xrange(args.threads):
        cur_thread = \
            threading.Thread(
                target=htable.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of k-mers: {0}'.format(
            htable.n_occupied())

    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be %1.3f' % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    print 'filtering', args.datafile
    outfile = os.path.basename(args.datafile) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print 'output in', outfile

    if args.savetable:
        print 'Saving k-mer counting table filename', args.savetable
        print '...saving to', args.savetable
        htable.save(args.savetable)