def main(): info("annotate-partitions.py", ["graph"]) args = get_parser().parse_args() ksize = args.ksize filenames = args.input_filenames htable = khmer.Hashbits(ksize, 1, 1) partitionmap_file = args.graphbase + ".pmap.merged" check_input_files(partitionmap_file, args.force) for _ in filenames: check_input_files(_, args.force) check_space(filenames, args.force) print("loading partition map from:", partitionmap_file, file=sys.stderr) htable.load_partitionmap(partitionmap_file) for infile in filenames: print("outputting partitions for", infile, file=sys.stderr) outfile = os.path.basename(infile) + ".part" part_count = htable.output_partitions(infile, outfile) print("output %d partitions for %s" % (part_count, infile), file=sys.stderr) print("partitions are in", outfile, file=sys.stderr)
def main(): info("count-median.py", ["diginorm"]) args = sanitize_help(get_parser()).parse_args() htfile = args.countgraph input_filename = args.input output = args.output infiles = [htfile, input_filename] for infile in infiles: check_input_files(infile, args.force) check_space(infiles, args.force) print("loading k-mer countgraph from", htfile, file=sys.stderr) countgraph = load_countgraph(htfile) ksize = countgraph.ksize() print("writing to", output.name, file=sys.stderr) output = csv.writer(output) # write headers: output.writerow(["name", "median", "average", "stddev", "seqlen"]) for record in screed.open(input_filename): seq = record.sequence.upper() if "N" in seq: seq = seq.replace("N", "A") if ksize <= len(seq): medn, ave, stdev = countgraph.get_median_count(seq) ave, stdev = [round(x, 9) for x in (ave, stdev)] output.writerow([record.name, medn, ave, stdev, len(seq)])
def main():
    info('merge-partitions.py', ['graph'])
    args = get_parser().parse_args()

    output_file = args.graphbase + '.pmap.merged'
    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print('loading %d pmap files (first one: %s)' %
          (len(pmap_files), pmap_files[0]), file=sys.stderr)

    ksize = args.ksize
    htable = khmer.new_hashbits(ksize, 1, 1)

    for _ in pmap_files:
        check_input_files(_, args.force)

    check_space(pmap_files, args.force)

    for pmap_file in pmap_files:
        print('merging', pmap_file, file=sys.stderr)
        htable.merge_subset_from_disk(pmap_file)

    print('saving merged to', output_file, file=sys.stderr)
    htable.save_partitionmap(output_file)

    if args.remove_subsets:
        print('removing pmap files', file=sys.stderr)
        for pmap_file in pmap_files:
            os.unlink(pmap_file)
def main():
    args = sanitize_help(get_parser()).parse_args()

    ksize = args.ksize
    filenames = args.input_filenames
    nodegraph = Nodegraph(ksize, 1, 1)

    partitionmap_file = args.graphbase + '.pmap.merged'

    check_input_files(partitionmap_file, args.force)
    for _ in filenames:
        check_input_files(_, args.force)

    check_space(filenames, args.force)

    print('loading partition map from:', partitionmap_file, file=sys.stderr)
    nodegraph.load_partitionmap(partitionmap_file)

    for infile in filenames:
        print('outputting partitions for', infile, file=sys.stderr)
        outfile = os.path.basename(infile) + '.part'
        part_count = nodegraph.output_partitions(infile, outfile)
        print('output %d partitions for %s' % (part_count, infile),
              file=sys.stderr)
        print('partitions are in', outfile, file=sys.stderr)
def main():
    info('merge-partitions.py', ['graph'])
    args = get_parser().parse_args()

    output_file = args.graphbase + '.pmap.merged'
    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print >>sys.stderr, 'loading %d pmap files (first one: %s)' % \
        (len(pmap_files), pmap_files[0])

    ksize = args.ksize
    htable = khmer.new_hashbits(ksize, 1, 1)

    for _ in pmap_files:
        check_input_files(_, args.force)

    check_space(pmap_files, args.force)

    for pmap_file in pmap_files:
        print >>sys.stderr, 'merging', pmap_file
        htable.merge_subset_from_disk(pmap_file)

    print >>sys.stderr, 'saving merged to', output_file
    htable.save_partitionmap(output_file)

    if args.remove_subsets:
        print >>sys.stderr, 'removing pmap files'
        for pmap_file in pmap_files:
            os.unlink(pmap_file)
def main():
    info('annotate-partitions.py', ['graph'])
    args = get_parser().parse_args()

    ksize = args.ksize
    filenames = args.input_filenames
    htable = khmer.new_hashbits(ksize, 1, 1)

    partitionmap_file = args.graphbase + '.pmap.merged'

    check_input_files(partitionmap_file, args.force)
    for _ in filenames:
        check_input_files(_, args.force)

    check_space(filenames, args.force)

    print >> sys.stderr, 'loading partition map from:', partitionmap_file
    htable.load_partitionmap(partitionmap_file)

    for infile in filenames:
        print >> sys.stderr, 'outputting partitions for', infile
        outfile = os.path.basename(infile) + '.part'
        part_count = htable.output_partitions(infile, outfile)
        print >> sys.stderr, 'output %d partitions for %s' % (part_count,
                                                              infile)
        print >> sys.stderr, 'partitions are in', outfile
def main(): info('filter-abund.py', ['counting']) args = sanitize_help(get_parser()).parse_args() check_input_files(args.input_graph, args.force) infiles = args.input_filename if ('-' in infiles or '/dev/stdin' in infiles) and not \ args.single_output_file: print("Accepting input from stdin; output filename must " "be provided with -o.", file=sys.stderr) sys.exit(1) for filename in infiles: check_input_files(filename, args.force) check_space(infiles, args.force) print('loading countgraph:', args.input_graph, file=sys.stderr) countgraph = khmer.load_countgraph(args.input_graph) ksize = countgraph.ksize() print("K:", ksize, file=sys.stderr) # the filtering function. def process_fn(record): name = record.name seq = record.sequence seqN = seq.replace('N', 'A') if args.variable_coverage: # only trim when sequence has high enough C med, _, _ = countgraph.get_median_count(seqN) if med < args.normalize_to: return name, seq _, trim_at = countgraph.trim_on_abundance(seqN, args.cutoff) if trim_at >= ksize: # be sure to not to change the 'N's in the trimmed sequence - # so, return 'seq' and not 'seqN'. return name, seq[:trim_at] return None, None if args.single_output_file: outfile = args.single_output_file.name outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip) # the filtering loop for infile in infiles: print('filtering', infile, file=sys.stderr) if not args.single_output_file: outfile = os.path.basename(infile) + '.abundfilt' outfp = open(outfile, 'wb') outfp = get_file_writer(outfp, args.gzip, args.bzip) tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads) tsp.start(verbose_loader(infile), outfp) print('output in', outfile, file=sys.stderr)
def main():
    args = sanitize_help(get_parser()).parse_args()

    output_file = args.graphbase + '.pmap.merged'
    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print('loading %d pmap files (first one: %s)' %
          (len(pmap_files), pmap_files[0]), file=sys.stderr)

    ksize = args.ksize
    nodegraph = khmer.Nodegraph(ksize, 1, 1)

    for _ in pmap_files:
        check_input_files(_, args.force)

    check_space(pmap_files, args.force)

    for pmap_file in pmap_files:
        print('merging', pmap_file, file=sys.stderr)
        nodegraph.merge_subset_from_disk(pmap_file)

    print('saving merged to', output_file, file=sys.stderr)
    nodegraph.save_partitionmap(output_file)

    if args.remove_subsets:
        print('removing pmap files', file=sys.stderr)
        for pmap_file in pmap_files:
            os.unlink(pmap_file)
def main():
    args = sanitize_help(get_parser()).parse_args()

    htfile = args.countgraph
    input_filename = args.input
    output = args.output

    infiles = [htfile, input_filename]
    for infile in infiles:
        check_input_files(infile, args.force)

    check_space(infiles, args.force)

    print('loading k-mer countgraph from', htfile, file=sys.stderr)
    countgraph = load_countgraph(htfile)
    ksize = countgraph.ksize()

    print('writing to', output.name, file=sys.stderr)
    output = csv.writer(output)
    # write headers:
    output.writerow(['name', 'median', 'average', 'stddev', 'seqlen'])

    for record in screed.open(input_filename):
        seq = record.sequence.upper()
        if 'N' in seq:
            seq = seq.replace('N', 'A')

        if ksize <= len(seq):
            medn, ave, stdev = countgraph.get_median_count(seq)
            ave, stdev = [round(x, 9) for x in (ave, stdev)]
            output.writerow([record.name, medn, ave, stdev, len(seq)])
def main():
    info('annotate-partitions.py', ['graph'])
    args = get_parser().parse_args()

    ksize = args.ksize
    filenames = args.input_filenames
    htable = khmer.new_hashbits(ksize, 1, 1)

    partitionmap_file = args.graphbase + '.pmap.merged'

    check_input_files(partitionmap_file, args.force)
    for _ in filenames:
        check_input_files(_, args.force)

    check_space(filenames, args.force)

    print >>sys.stderr, 'loading partition map from:', partitionmap_file
    htable.load_partitionmap(partitionmap_file)

    for infile in filenames:
        print >>sys.stderr, 'outputting partitions for', infile
        outfile = os.path.basename(infile) + '.part'
        part_count = htable.output_partitions(infile, outfile)
        print >>sys.stderr, 'output %d partitions for %s' % (
            part_count, infile)
        print >>sys.stderr, 'partitions are in', outfile
def main():
    info('annotate-partitions.py', ['graph'])
    args = get_parser().parse_args()

    ksize = args.ksize
    filenames = args.input_filenames
    nodegraph = khmer.Nodegraph(ksize, 1, 1)

    partitionmap_file = args.graphbase + '.pmap.merged'

    check_input_files(partitionmap_file, args.force)
    for _ in filenames:
        check_input_files(_, args.force)

    check_space(filenames, args.force)

    print('loading partition map from:', partitionmap_file, file=sys.stderr)
    nodegraph.load_partitionmap(partitionmap_file)

    for infile in filenames:
        print('outputting partitions for', infile, file=sys.stderr)
        outfile = os.path.basename(infile) + '.part'
        part_count = nodegraph.output_partitions(infile, outfile)
        print('output %d partitions for %s' % (
            part_count, infile), file=sys.stderr)
        print('partitions are in', outfile, file=sys.stderr)
def test_check_file_status_kfile():
    fn = utils.get_temp_filename('thisfiledoesnotexist')

    check_file_status_exited = False
    try:
        check_input_files(fn, False)
    except SystemExit:
        check_file_status_exited = True
    assert check_file_status_exited
def main():
    args = sanitize_help(get_parser()).parse_args()
    configure_logging(args.quiet)

    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)
    if args.savegraph:
        tablesize = calculate_graphsize(args, "countgraph")
        check_space_for_graph(args.savegraph, tablesize, args.force)
    report_on_config(args)

    log_info("making countgraph")
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    log_info("consuming input, round 1 -- {datafile}", datafile=args.datafile)
    for _ in range(args.threads):
        cur_thread = threading.Thread(
            target=graph.consume_fasta_with_reads_parser,
            args=(rparser,))
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    log_info("Total number of unique k-mers: {nk}",
             nk=graph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    log_info("fp rate estimated to be {fpr:1.3f}", fpr=fp_rate)

    # the filtering loop
    log_info("filtering {datafile}", datafile=args.datafile)
    if args.outfile is None:
        outfile = os.path.basename(args.datafile) + ".abundfilt"
    else:
        outfile = args.outfile
    outfp = open(outfile, "wb")
    outfp = get_file_writer(outfp, args.gzip, args.bzip)

    paired_iter = broken_paired_reader(ReadParser(args.datafile),
                                       min_length=graph.ksize(),
                                       force_single=True)

    for n, is_pair, read1, read2 in paired_iter:
        assert not is_pair
        assert read2 is None

        trimmed_record, _ = trim_record(graph, read1, args.cutoff,
                                        args.variable_coverage,
                                        args.normalize_to)
        if trimmed_record:
            write_record(trimmed_record, outfp)

    log_info("output in {outfile}", outfile=outfile)

    if args.savegraph:
        log_info("Saving k-mer countgraph filename {graph}",
                 graph=args.savegraph)
        graph.save(args.savegraph)
def main(args): info('build-graph.py', ['graph', 'SeqAn']) report_on_config(args, hashtype='nodegraph') base = args.output_filename filenames = args.input_filenames for fname in args.input_filenames: check_input_files(fname, args.force) # if optimization args are given, do optimization args = functions.do_sanity_checking(args, 0.01) check_space(args.input_filenames, args.force) check_space_for_hashtable(args, 'nodegraph', args.force) print('Saving k-mer presence table to %s' % base, file=sys.stderr) print('Loading kmers from sequences in %s' % repr(filenames), file=sys.stderr) if args.no_build_tagset: print('We WILL NOT build the tagset.', file=sys.stderr) else: print('We WILL build the tagset (for partitioning/traversal).', file=sys.stderr) print('making nodegraph', file=sys.stderr) htable = khmer_args.create_nodegraph(args) functions.build_graph(filenames, htable, args.threads, not args.no_build_tagset) print('Total number of unique k-mers: {0}'.format(htable.n_unique_kmers()), file=sys.stderr) print('saving k-mer presence table in', base + '.pt', file=sys.stderr) htable.save(base + '.pt') if not args.no_build_tagset: print('saving tagset in', base + '.tagset', file=sys.stderr) htable.save_tagset(base + '.tagset') info_fp = open(base + '.info', 'w') info_fp.write('%d unique k-mers' % htable.n_unique_kmers()) fp_rate = \ khmer.calc_expected_collisions(htable, args.force, max_false_pos=.15) # 0.18 is ACTUAL MAX. Do not change. print('false positive rate estimated to be %1.3f' % fp_rate, file=sys.stderr) print('\nfalse positive rate estimated to be %1.3f' % fp_rate, file=info_fp) print('wrote to', base + '.info and', base + '.pt', file=sys.stderr) if not args.no_build_tagset: print('and ' + base + '.tagset', file=sys.stderr) sys.exit(0)
def main(args): graph_type = 'nodegraph' report_on_config(args, graphtype=graph_type) base = args.output_filename filenames = args.input_filenames for fname in args.input_filenames: check_input_files(fname, args.force) graphsize = calculate_graphsize(args, graph_type) space_needed = (args.n_tables * graphsize / khmer._buckets_per_byte[graph_type]) check_space_for_graph(args.output_filename, space_needed, args.force) print('Saving k-mer nodegraph to %s' % base, file=sys.stderr) print('Loading kmers from sequences in %s' % repr(filenames), file=sys.stderr) if args.no_build_tagset: print('We WILL NOT build the tagset.', file=sys.stderr) else: print('We WILL build the tagset (for partitioning/traversal).', file=sys.stderr) print('making nodegraph', file=sys.stderr) nodegraph = khmer_args.create_nodegraph(args) oxfuncs.build_graph(filenames, nodegraph, args.threads, not args.no_build_tagset) print('Total number of unique k-mers: {0}'.format( nodegraph.n_unique_kmers()), file=sys.stderr) print('saving k-mer nodegraph in', base, file=sys.stderr) nodegraph.save(base) if not args.no_build_tagset: print('saving tagset in', base + '.tagset', file=sys.stderr) nodegraph.save_tagset(base + '.tagset') info_fp = open(base + '.info', 'w') info_fp.write('%d unique k-mers' % nodegraph.n_unique_kmers()) fp_rate = \ khmer.calc_expected_collisions( nodegraph, args.force, max_false_pos=.15) # 0.18 is ACTUAL MAX. Do not change. print('false positive rate estimated to be %1.3f' % fp_rate, file=sys.stderr) print('\nfalse positive rate estimated to be %1.3f' % fp_rate, file=info_fp) print('wrote to ' + base + '.info and ' + base, file=sys.stderr) if not args.no_build_tagset: print('and ' + base + '.tagset', file=sys.stderr) sys.exit(0)
def main():
    info('optimal_args_hashbits.py', ['graph', 'SeqAn'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='hashbits')

    filenames = args.input_filenames
    base = filenames[0]

    for _ in args.input_filenames:
        check_input_files(_, False)

    check_space(args.input_filenames, False)

    print('Counting kmers from sequences in %s' % repr(filenames),
          file=sys.stderr)

    htable = khmer.new_hashbits(args.ksize, args.max_tablesize, args.n_tables)
    target_method = htable.consume_fasta_with_reads_parser

    for _, filename in enumerate(filenames):
        rparser = khmer.ReadParser(filename)
        threads = []
        print('consuming input', filename, file=sys.stderr)
        for num in range(args.threads):
            cur_thread = threading.Thread(target=target_method,
                                          args=(rparser, ))
            threads.append(cur_thread)
            cur_thread.start()

        for thread in threads:
            thread.join()

    unique_kmers = htable.n_unique_kmers()
    print('Total number of unique k-mers: {0}'.format(unique_kmers),
          file=sys.stderr)

    info_optimal = open(base + '.optimal_args', 'w')

    fp_rate = khmer.calc_expected_collisions(htable)
    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)

    if fp_rate > 0.15:          # 0.18 is ACTUAL MAX. Do not change.
        print("**", file=sys.stderr)
        print("** ERROR: the graph structure is too small for this data set. "
              "Increase table size/# tables.", file=sys.stderr)
        print("**", file=sys.stderr)
        sys.exit(1)

    to_print = output_gen(unique_kmers, fp_rate)

    print(to_print, file=info_optimal)

    print('optimal arguments were written to', base + '.optimal_args',
          file=sys.stderr)
def main(): info('make-initial-stoptags.py', ['graph']) args = get_parser().parse_args() graphbase = args.graphbase # @RamRS: This might need some more work infiles = [graphbase + '.pt', graphbase + '.tagset'] if args.stoptags: infiles.append(args.stoptags) for _ in infiles: check_input_files(_, args.force) check_space(infiles, args.force) print >>sys.stderr, 'loading htable %s.pt' % graphbase htable = khmer.load_hashbits(graphbase + '.pt') # do we want to load stop tags, and do they exist? if args.stoptags: print >>sys.stderr, 'loading stoptags from', args.stoptags htable.load_stop_tags(args.stoptags) print >>sys.stderr, 'loading tagset %s.tagset...' % graphbase htable.load_tagset(graphbase + '.tagset') ksize = htable.ksize() counting = khmer.new_counting_hash(ksize, args.min_tablesize, args.n_tables) # divide up into SUBSET_SIZE fragments divvy = htable.divide_tags_into_subsets(args.subset_size) # pick off the first one if len(divvy) == 1: start, end = 0, 0 else: start, end = divvy[:2] # partition! print >>sys.stderr, 'doing pre-partitioning from', start, 'to', end subset = htable.do_subset_partition(start, end) # now, repartition... print >>sys.stderr, 'repartitioning to find HCKs.' htable.repartition_largest_partition(subset, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD, EXCURSION_KMER_COUNT_THRESHOLD) print >>sys.stderr, 'saving stop tags' htable.save_stop_tags(graphbase + '.stoptags') print >> sys.stderr, 'wrote to:', graphbase + '.stoptags'
def main():
    info('optimal_args_nodegraph.py', ['graph', 'SeqAn'])
    args = get_parser().parse_args()
    report_on_config(args, graphtype='nodegraph')

    filenames = args.input_filenames
    base = filenames[0]

    for _ in args.input_filenames:
        check_input_files(_, False)

    check_space(args.input_filenames, False)

    print('Counting kmers from sequences in %s' % repr(filenames),
          file=sys.stderr)

    htable = khmer.new_nodegraph(args.ksize, args.max_tablesize, args.n_tables)
    target_method = htable.consume_fasta_with_reads_parser

    for _, filename in enumerate(filenames):
        rparser = khmer.ReadParser(filename)
        threads = []
        print('consuming input', filename, file=sys.stderr)
        for num in range(args.threads):
            cur_thread = threading.Thread(
                target=target_method, args=(rparser,))
            threads.append(cur_thread)
            cur_thread.start()

        for thread in threads:
            thread.join()

    unique_kmers = htable.n_unique_kmers()
    print('Total number of unique k-mers: {0}'.format(unique_kmers),
          file=sys.stderr)

    info_optimal = open(base + '.optimal_args', 'w')

    fp_rate = khmer.calc_expected_collisions(htable)
    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)

    if fp_rate > 0.15:          # 0.18 is ACTUAL MAX. Do not change.
        print("**", file=sys.stderr)
        print("** ERROR: the graph structure is too small for this data set. "
              "Increase table size/# tables.", file=sys.stderr)
        print("**", file=sys.stderr)
        sys.exit(1)

    to_print = graphsize_args_report(unique_kmers, fp_rate)

    print(to_print, file=info_optimal)

    print('optimal arguments were written to', base + '.optimal_args',
          file=sys.stderr)
def test_check_file_status_kfile():
    fn = utils.get_temp_filename('thisfiledoesnotexist')

    old_stderr = sys.stderr
    sys.stderr = capture = StringIO()
    try:
        check_input_files(fn, False)
    except SystemExit:
        assert "does not exist" in capture.getvalue(), capture.getvalue()
    finally:
        sys.stderr = old_stderr
def main(args): info('build-graph.py', ['graph', 'SeqAn']) report_on_config(args, hashtype='hashbits') base = args.output_filename filenames = args.input_filenames for fname in args.input_filenames: check_input_files(fname, args.force) check_space(args.input_filenames, args.force) check_space_for_hashtable( (float(args.n_tables * args.min_tablesize) / 8.), args.force) print >>sys.stderr, 'Saving k-mer presence table to %s' % base print >>sys.stderr, 'Loading kmers from sequences in %s' % repr(filenames) if args.no_build_tagset: print >>sys.stderr, 'We WILL NOT build the tagset.' else: print >>sys.stderr, 'We WILL build the tagset', \ ' (for partitioning/traversal).' print >>sys.stderr, 'making k-mer presence table' htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables) functions.build_graph(filenames, htable, args.threads, not args.no_build_tagset) print >> sys.stderr, 'Total number of unique k-mers: {0}'.format( htable.n_unique_kmers()) print >>sys.stderr, 'saving k-mer presence table in', base + '.pt' htable.save(base + '.pt') if not args.no_build_tagset: print >>sys.stderr, 'saving tagset in', base + '.tagset' htable.save_tagset(base + '.tagset') info_fp = open(base + '.info', 'w') info_fp.write('%d unique k-mers' % htable.n_unique_kmers()) fp_rate = \ khmer.calc_expected_collisions(htable, args.force, max_false_pos=.15) # 0.18 is ACTUAL MAX. Do not change. print >>sys.stderr, 'false positive rate estimated to be %1.3f' % fp_rate print >>info_fp, '\nfalse positive rate estimated to be %1.3f' % fp_rate print >> sys.stderr, 'wrote to', base + '.info and', base + '.pt' if not args.no_build_tagset: print >> sys.stderr, 'and ' + base + '.tagset' sys.exit(0)
def main(): info('make-initial-stoptags.py', ['graph']) args = get_parser().parse_args() graphbase = args.graphbase # @RamRS: This might need some more work infiles = [graphbase + '.pt', graphbase + '.tagset'] if args.stoptags: infiles.append(args.stoptags) for _ in infiles: check_input_files(_, args.force) check_space(infiles, args.force) print >> sys.stderr, 'loading htable %s.pt' % graphbase htable = khmer.load_hashbits(graphbase + '.pt') # do we want to load stop tags, and do they exist? if args.stoptags: print >> sys.stderr, 'loading stoptags from', args.stoptags htable.load_stop_tags(args.stoptags) print >> sys.stderr, 'loading tagset %s.tagset...' % graphbase htable.load_tagset(graphbase + '.tagset') ksize = htable.ksize() counting = khmer.new_counting_hash(ksize, args.min_tablesize, args.n_tables) # divide up into SUBSET_SIZE fragments divvy = htable.divide_tags_into_subsets(args.subset_size) # pick off the first one if len(divvy) == 1: start, end = 0, 0 else: start, end = divvy[:2] # partition! print >> sys.stderr, 'doing pre-partitioning from', start, 'to', end subset = htable.do_subset_partition(start, end) # now, repartition... print >> sys.stderr, 'repartitioning to find HCKs.' htable.repartition_largest_partition(subset, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD, EXCURSION_KMER_COUNT_THRESHOLD) print >> sys.stderr, 'saving stop tags' htable.save_stop_tags(graphbase + '.stoptags') print >> sys.stderr, 'wrote to:', graphbase + '.stoptags'
def main(): args = sanitize_help(get_parser()).parse_args() if not args.quiet: info('filter-abund.py', ['counting']) configure_logging(args.quiet) infiles = args.input_filename if ('-' in infiles or '/dev/stdin' in infiles) and not \ args.single_output_file: log_error("Accepting input from stdin; output filename must " "be provided with -o.") sys.exit(1) for filename in infiles: check_input_files(filename, args.force) check_space(infiles, args.force) log_info('loading countgraph: {graph}', graph=args.input_graph) countgraph = khmer.load_countgraph(args.input_graph) ksize = countgraph.ksize() log_info("K: {ksize}", ksize=ksize) if args.single_output_file: outfile = args.single_output_file.name outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip) # the filtering loop for infile in infiles: log_info('filtering {infile}', infile=infile) if not args.single_output_file: outfile = os.path.basename(infile) + '.abundfilt' outfp = open(outfile, 'wb') outfp = get_file_writer(outfp, args.gzip, args.bzip) paired_iter = broken_paired_reader(ReadParser(infile), min_length=ksize, force_single=True) for n, is_pair, read1, read2 in paired_iter: assert not is_pair assert read2 is None trimmed_record, _ = trim_record(countgraph, read1, args.cutoff, args.variable_coverage, args.normalize_to) if trimmed_record: write_record(trimmed_record, outfp) log_info('output in {outfile}', outfile=outfile)
def main(): info('filter-abund.py', ['counting']) args = get_parser().parse_args() check_input_files(args.input_table, args.force) infiles = args.input_filename for filename in infiles: check_input_files(filename, args.force) check_space(infiles, args.force) print('loading counting table:', args.input_table, file=sys.stderr) htable = khmer.load_counting_hash(args.input_table) ksize = htable.ksize() print("K:", ksize, file=sys.stderr) # the filtering function. def process_fn(record): name = record.name seq = record.sequence seqN = seq.replace('N', 'A') if args.variable_coverage: # only trim when sequence has high enough C med, _, _ = htable.get_median_count(seqN) if med < args.normalize_to: return name, seq _, trim_at = htable.trim_on_abundance(seqN, args.cutoff) if trim_at >= ksize: # be sure to not to change the 'N's in the trimmed sequence - # so, return 'seq' and not 'seqN'. return name, seq[:trim_at] return None, None # the filtering loop for infile in infiles: print('filtering', infile, file=sys.stderr) if args.single_output_filename != '': outfile = args.single_output_filename outfp = open(outfile, 'a') else: outfile = os.path.basename(infile) + '.abundfilt' outfp = open(outfile, 'w') tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads) tsp.start(verbose_loader(infile), outfp) print('output in', outfile, file=sys.stderr)
def main(): info('count-kmers-single.py', ['counting']) args = get_parser().parse_args() check_input_files(args.input_sequence_filename, False) print('making k-mer countgraph', file=sys.stderr) countgraph = khmer.Countgraph(args.ksize, args.max_tablesize, args.n_tables) # @CTB countgraph.set_use_bigcount(args.bigcount) kmer_size = countgraph.ksize() hashsizes = countgraph.hashsizes() tracking = khmer._Nodegraph( # pylint: disable=protected-access kmer_size, hashsizes) print('kmer_size: %s' % countgraph.ksize(), file=sys.stderr) print('k-mer countgraph sizes: %s' % (countgraph.hashsizes(), ), file=sys.stderr) if args.output_file is None: args.output_file = sys.stdout writer = csv.writer(args.output_file) # start loading rparser = khmer.ReadParser(args.input_sequence_filename) threads = [] print('consuming input, round 1 -- %s' % (args.input_sequence_filename), file=sys.stderr) for _ in range(args.threads): thread = \ threading.Thread( target=countgraph.consume_fasta_with_reads_parser, args=(rparser, ) ) threads.append(thread) thread.start() for thread in threads: thread.join() for record in screed.open(args.input_sequence_filename): seq = record.sequence.replace('N', 'A') for i in range(len(seq) - kmer_size + 1): kmer = seq[i:i + kmer_size] if not tracking.get(kmer): tracking.count(kmer) writer.writerow([kmer, str(countgraph.get(kmer))]) print('Total number of unique k-mers: {0}'.format( countgraph.n_unique_kmers()), file=sys.stderr)
def main():
    args = get_parser().parse_args()

    infiles = [args.input_count_graph_filename] + \
        args.input_sequence_filenames
    for infile in infiles:
        check_input_files(infile, False)

    counts = khmer.load_countgraph(args.input_count_graph_filename)

    results = find_N_most_abundant_kmers(args.input_sequence_filenames,
                                         args.N, counts)
    results_df = pd.DataFrame({'kmer': [str(k) for k in results.keys()],
                               'count': [int(c) for c in results.values()]})
    results_df.sort_values(by='count', inplace=True, ascending=False)

    results_df.to_csv(args.output, index=False)
def main(): info('interleave-reads.py') args = sanitize_help(get_parser()).parse_args() check_input_files(args.left, args.force) check_input_files(args.right, args.force) check_space([args.left, args.right], args.force) s1_file = args.left s2_file = args.right print("Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file), file=sys.stderr) outfp = get_file_writer(args.output, args.gzip, args.bzip) counter = 0 screed_iter_1 = screed.open(s1_file) screed_iter_2 = screed.open(s2_file) for read1, read2 in zip_longest(screed_iter_1, screed_iter_2): if read1 is None or read2 is None: print(("ERROR: Input files contain different number" " of records."), file=sys.stderr) sys.exit(1) if counter % 100000 == 0: print('...', counter, 'pairs', file=sys.stderr) counter += 1 name1 = read1.name name2 = read2.name if not args.no_reformat: if not check_is_left(name1): name1 += '/1' if not check_is_right(name2): name2 += '/2' read1.name = name1 read2.name = name2 if not check_is_pair(read1, read2): print("ERROR: This doesn't look like paired data! " "%s %s" % (read1.name, read2.name), file=sys.stderr) sys.exit(1) write_record_pair(read1, read2, outfp) print('final: interleaved %d pairs' % counter, file=sys.stderr) print('output written to', describe_file_handle(outfp), file=sys.stderr)
def main(): info('count-kmers-single.py', ['counting']) args = get_parser().parse_args() check_input_files(args.input_sequence_filename, False) print ('making k-mer countgraph', file=sys.stderr) countgraph = khmer.Countgraph(args.ksize, args.max_tablesize, args.n_tables) # @CTB countgraph.set_use_bigcount(args.bigcount) kmer_size = countgraph.ksize() hashsizes = countgraph.hashsizes() tracking = khmer.Nodegraph( # pylint: disable=protected-access kmer_size, 1, 1, primes=hashsizes) print ('kmer_size: %s' % countgraph.ksize(), file=sys.stderr) print ('k-mer countgraph sizes: %s' % (countgraph.hashsizes(),), file=sys.stderr) if args.output_file is None: args.output_file = sys.stdout writer = csv.writer(args.output_file) # start loading rparser = khmer.ReadParser(args.input_sequence_filename) threads = [] print ('consuming input, round 1 -- %s' % (args.input_sequence_filename), file=sys.stderr) for _ in range(args.threads): thread = \ threading.Thread( target=countgraph.consume_seqfile, args=(rparser, ) ) threads.append(thread) thread.start() for thread in threads: thread.join() for record in screed.open(args.input_sequence_filename): seq = record.sequence.replace('N', 'A') for i in range(len(seq) - kmer_size + 1): kmer = seq[i:i+kmer_size] if not tracking.get(kmer): tracking.count(kmer) writer.writerow([kmer, str(countgraph.get(kmer))]) print ('Total number of unique k-mers: {0}'.format( countgraph.n_unique_kmers()), file=sys.stderr)
def main(): info('interleave-reads.py') args = sanitize_help(get_parser()).parse_args() check_input_files(args.left, args.force) check_input_files(args.right, args.force) check_space([args.left, args.right], args.force) s1_file = args.left s2_file = args.right fail = False print("Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file), file=sys.stderr) outfp = get_file_writer(args.output, args.gzip, args.bzip) counter = 0 screed_iter_1 = screed.open(s1_file) screed_iter_2 = screed.open(s2_file) for read1, read2 in zip_longest(screed_iter_1, screed_iter_2): if read1 is None or read2 is None: print(("ERROR: Input files contain different number" " of records."), file=sys.stderr) sys.exit(1) if counter % 100000 == 0: print('...', counter, 'pairs', file=sys.stderr) counter += 1 name1 = read1.name name2 = read2.name if not args.no_reformat: if not check_is_left(name1): name1 += '/1' if not check_is_right(name2): name2 += '/2' read1.name = name1 read2.name = name2 if not check_is_pair(read1, read2): print("ERROR: This doesn't look like paired data! " "%s %s" % (read1.name, read2.name), file=sys.stderr) sys.exit(1) write_record_pair(read1, read2, outfp) print('final: interleaved %d pairs' % counter, file=sys.stderr) print('output written to', describe_file_handle(outfp), file=sys.stderr)
def main(): info('make-initial-stoptags.py', ['graph']) args = sanitize_help(get_parser()).parse_args() graphbase = args.graphbase # @RamRS: This might need some more work infiles = [graphbase, graphbase + '.tagset'] if args.stoptags: infiles.append(args.stoptags) for _ in infiles: check_input_files(_, args.force) print('loading nodegraph %s.pt' % graphbase, file=sys.stderr) nodegraph = khmer.load_nodegraph(graphbase) # do we want to load stop tags, and do they exist? if args.stoptags: print('loading stoptags from', args.stoptags, file=sys.stderr) nodegraph.load_stop_tags(args.stoptags) print('loading tagset %s.tagset...' % graphbase, file=sys.stderr) nodegraph.load_tagset(graphbase + '.tagset') counting = khmer_args.create_countgraph(args) # divide up into SUBSET_SIZE fragments divvy = nodegraph.divide_tags_into_subsets(args.subset_size) divvy = list(divvy) # pick off the first one if len(divvy) == 1: start, end = 0, 0 else: start, end = divvy[:2] # partition! print('doing pre-partitioning from', start, 'to', end, file=sys.stderr) subset = nodegraph.do_subset_partition(start, end) # now, repartition... print('repartitioning to find HCKs.', file=sys.stderr) nodegraph.repartition_largest_partition(subset, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD, EXCURSION_KMER_COUNT_THRESHOLD) print('saving stop tags', file=sys.stderr) nodegraph.save_stop_tags(graphbase + '.stoptags') print('wrote to:', graphbase + '.stoptags', file=sys.stderr)
def main(): args = sanitize_help(get_parser()).parse_args() configure_logging(args.quiet) infiles = args.input_filename if ('-' in infiles or '/dev/stdin' in infiles) and not \ args.single_output_file: log_error("Accepting input from stdin; output filename must " "be provided with -o.") sys.exit(1) for filename in infiles: check_input_files(filename, args.force) check_space(infiles, args.force) log_info('loading countgraph: {graph}', graph=args.input_graph) countgraph = khmer.load_countgraph(args.input_graph) ksize = countgraph.ksize() log_info("K: {ksize}", ksize=ksize) if args.single_output_file: outfile = args.single_output_file.name outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip) # the filtering loop for infile in infiles: log_info('filtering {infile}', infile=infile) if not args.single_output_file: outfile = os.path.basename(infile) + '.abundfilt' outfp = open(outfile, 'wb') outfp = get_file_writer(outfp, args.gzip, args.bzip) paired_iter = broken_paired_reader(ReadParser(infile), min_length=ksize, force_single=True) for n, is_pair, read1, read2 in paired_iter: assert not is_pair assert read2 is None trimmed_record, _ = trim_record(countgraph, read1, args.cutoff, args.variable_coverage, args.normalize_to) if trimmed_record: write_record(trimmed_record, outfp) log_info('output in {outfile}', outfile=outfile)
def main(): args = sanitize_help(get_parser()).parse_args() graphbase = args.graphbase # @RamRS: This might need some more work infiles = [graphbase, graphbase + '.tagset'] if args.stoptags: infiles.append(args.stoptags) for _ in infiles: check_input_files(_, args.force) print('loading nodegraph %s.pt' % graphbase, file=sys.stderr) nodegraph = Nodegraph.load(graphbase) # do we want to load stop tags, and do they exist? if args.stoptags: print('loading stoptags from', args.stoptags, file=sys.stderr) nodegraph.load_stop_tags(args.stoptags) print('loading tagset %s.tagset...' % graphbase, file=sys.stderr) nodegraph.load_tagset(graphbase + '.tagset') counting = khmer_args.create_countgraph(args) # divide up into SUBSET_SIZE fragments divvy = nodegraph.divide_tags_into_subsets(args.subset_size) divvy = list(divvy) # pick off the first one if len(divvy) == 1: start, end = 0, 0 else: start, end = divvy[:2] # partition! print('doing pre-partitioning from', start, 'to', end, file=sys.stderr) subset = nodegraph.do_subset_partition(start, end) # now, repartition... print('repartitioning to find HCKs.', file=sys.stderr) nodegraph.repartition_largest_partition(counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD, EXCURSION_KMER_COUNT_THRESHOLD, subs=subset) print('saving stop tags', file=sys.stderr) nodegraph.save_stop_tags(graphbase + '.stoptags') print('wrote to:', graphbase + '.stoptags', file=sys.stderr)
def test_check_file_status_kfile_force():
    fn = utils.get_temp_filename('thisfiledoesnotexist')

    old_stderr = sys.stderr
    sys.stderr = capture = StringIO()
    try:
        check_input_files(fn, True)
    except OSError:
        assert False
    finally:
        sys.stderr = old_stderr

    assert "does not exist" in capture.getvalue(), capture.getvalue()
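# The three test variants above all exercise the same contract: for a missing
# input file, check_input_files() should exit and report "does not exist" on
# stderr, unless force=True, in which case it should only warn. A minimal
# sketch of the same checks written with pytest's built-in helpers, assuming
# pytest is installed and that check_input_files is importable from
# khmer.kfile (as in the scripts' imports):
import pytest

from khmer.kfile import check_input_files


def test_missing_input_exits(tmp_path, capsys):
    missing = str(tmp_path / "thisfiledoesnotexist")
    # without force, the check is expected to call sys.exit()
    with pytest.raises(SystemExit):
        check_input_files(missing, False)
    assert "does not exist" in capsys.readouterr().err


def test_missing_input_force_only_warns(tmp_path, capsys):
    # with force=True the check should warn on stderr but not exit
    check_input_files(str(tmp_path / "thisfiledoesnotexist"), True)
    assert "does not exist" in capsys.readouterr().err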
def main(): info('filter-abund.py', ['counting']) args = get_parser().parse_args() check_input_files(args.input_table, args.force) infiles = args.input_filename for _ in infiles: check_input_files(_, args.force) check_space(infiles, args.force) print >>sys.stderr, 'loading hashtable' htable = khmer.load_counting_hash(args.input_table) ksize = htable.ksize() print >>sys.stderr, "K:", ksize # the filtering function. def process_fn(record): name = record['name'] seq = record['sequence'] if 'N' in seq: return None, None if args.variable_coverage: # only trim when sequence has high enough C med, _, _ = htable.get_median_count(seq) if med < args.normalize_to: return name, seq trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff) if trim_at >= ksize: return name, trim_seq return None, None # the filtering loop for infile in infiles: print >>sys.stderr, 'filtering', infile if args.single_output_filename != '': outfile = args.single_output_filename outfp = open(outfile, 'a') else: outfile = os.path.basename(infile) + '.abundfilt' outfp = open(outfile, 'w') tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads) tsp.start(verbose_loader(infile), outfp) print >>sys.stderr, 'output in', outfile
def main(args): info("build-graph.py", ["graph", "SeqAn"]) report_on_config(args, hashtype="hashbits") base = args.output_filename filenames = args.input_filenames for fname in args.input_filenames: check_input_files(fname, args.force) check_space(args.input_filenames, args.force) check_space_for_hashtable((float(args.n_tables * args.min_tablesize) / 8.0), args.force) print("Saving k-mer presence table to %s" % base, file=sys.stderr) print("Loading kmers from sequences in %s" % repr(filenames), file=sys.stderr) if args.no_build_tagset: print("We WILL NOT build the tagset.", file=sys.stderr) else: print("We WILL build the tagset (for partitioning/traversal).", file=sys.stderr) print("making k-mer presence table", file=sys.stderr) htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables) functions.build_graph(filenames, htable, args.threads, not args.no_build_tagset) print("Total number of unique k-mers: {0}".format(htable.n_unique_kmers()), file=sys.stderr) print("saving k-mer presence table in", base + ".pt", file=sys.stderr) htable.save(base + ".pt") if not args.no_build_tagset: print("saving tagset in", base + ".tagset", file=sys.stderr) htable.save_tagset(base + ".tagset") info_fp = open(base + ".info", "w") info_fp.write("%d unique k-mers" % htable.n_unique_kmers()) fp_rate = khmer.calc_expected_collisions(htable, args.force, max_false_pos=0.15) # 0.18 is ACTUAL MAX. Do not change. print("false positive rate estimated to be %1.3f" % fp_rate, file=sys.stderr) print("\nfalse positive rate estimated to be %1.3f" % fp_rate, file=info_fp) print("wrote to", base + ".info and", base + ".pt", file=sys.stderr) if not args.no_build_tagset: print("and " + base + ".tagset", file=sys.stderr) sys.exit(0)
def main(args): info("build-graph.py", ["graph", "SeqAn"]) report_on_config(args, graphtype="nodegraph") base = args.output_filename filenames = args.input_filenames for fname in args.input_filenames: check_input_files(fname, args.force) graphsize = calculate_graphsize(args, "nodegraph") check_space_for_graph(args.output_filename, graphsize, args.force) print("Saving k-mer nodegraph to %s" % base, file=sys.stderr) print("Loading kmers from sequences in %s" % repr(filenames), file=sys.stderr) if args.no_build_tagset: print("We WILL NOT build the tagset.", file=sys.stderr) else: print("We WILL build the tagset (for partitioning/traversal).", file=sys.stderr) print("making nodegraph", file=sys.stderr) nodegraph = khmer_args.create_nodegraph(args) oxfuncs.build_graph(filenames, nodegraph, args.threads, not args.no_build_tagset) print("Total number of unique k-mers: {0}".format(nodegraph.n_unique_kmers()), file=sys.stderr) print("saving k-mer nodegraph in", base, file=sys.stderr) nodegraph.save(base) if not args.no_build_tagset: print("saving tagset in", base + ".tagset", file=sys.stderr) nodegraph.save_tagset(base + ".tagset") info_fp = open(base + ".info", "w") info_fp.write("%d unique k-mers" % nodegraph.n_unique_kmers()) fp_rate = khmer.calc_expected_collisions(nodegraph, args.force, max_false_pos=0.15) # 0.18 is ACTUAL MAX. Do not change. print("false positive rate estimated to be %1.3f" % fp_rate, file=sys.stderr) print("\nfalse positive rate estimated to be %1.3f" % fp_rate, file=info_fp) print("wrote to " + base + ".info and " + base, file=sys.stderr) if not args.no_build_tagset: print("and " + base + ".tagset", file=sys.stderr) sys.exit(0)
def main(): info('extract-paired-reads.py') args = get_parser().parse_args() check_input_files(args.infile, args.force) infiles = [args.infile] check_space(infiles, args.force) outfile = os.path.basename(args.infile) if len(sys.argv) > 2: outfile = sys.argv[2] single_fp = open(outfile + '.se', 'w') paired_fp = open(outfile + '.pe', 'w') print >>sys.stderr, 'reading file "%s"' % args.infile print >>sys.stderr, 'outputting interleaved pairs to "%s.pe"' % outfile print >>sys.stderr, 'outputting orphans to "%s.se"' % outfile n_pe = 0 n_se = 0 screed_iter = screed.open(args.infile, parse_description=False) for index, is_pair, read1, read2 in broken_paired_reader(screed_iter): if index % 100000 == 0 and index > 0: print >>sys.stderr, '...', index if is_pair: write_record_pair(read1, read2, paired_fp) n_pe += 1 else: write_record(read1, single_fp) n_se += 1 single_fp.close() paired_fp.close() if n_pe == 0: raise Exception("no paired reads!? check file formats...") print >>sys.stderr, 'DONE; read %d sequences,' \ ' %d pairs and %d singletons' % \ (n_pe * 2 + n_se, n_pe, n_se) print >> sys.stderr, 'wrote to: ' + outfile \ + '.se' + ' and ' + outfile + '.pe'
def main(): info('extract-paired-reads.py') args = get_parser().parse_args() check_input_files(args.infile, args.force) infiles = [args.infile] check_space(infiles, args.force) outfile = os.path.basename(args.infile) if len(sys.argv) > 2: outfile = sys.argv[2] single_fp = open(outfile + '.se', 'w') paired_fp = open(outfile + '.pe', 'w') print('reading file "%s"' % args.infile, file=sys.stderr) print('outputting interleaved pairs to "%s.pe"' % outfile, file=sys.stderr) print('outputting orphans to "%s.se"' % outfile, file=sys.stderr) n_pe = 0 n_se = 0 screed_iter = screed.open(args.infile, parse_description=False) for index, is_pair, read1, read2 in broken_paired_reader(screed_iter): if index % 100000 == 0 and index > 0: print('...', index, file=sys.stderr) if is_pair: write_record_pair(read1, read2, paired_fp) n_pe += 1 else: write_record(read1, single_fp) n_se += 1 single_fp.close() paired_fp.close() if n_pe == 0: raise Exception("no paired reads!? check file formats...") print('DONE; read %d sequences,' ' %d pairs and %d singletons' % (n_pe * 2 + n_se, n_pe, n_se), file=sys.stderr) print('wrote to: ' + outfile + '.se' + ' and ' + outfile + '.pe', file=sys.stderr)
def main():
    info('count-median.py', ['diginorm'])
    args = get_parser().parse_args()

    htfile = args.ctfile
    input_filename = args.input
    output_filename = args.output

    infiles = [htfile, input_filename]
    for infile in infiles:
        check_input_files(infile, args.force)

    check_space(infiles, args.force)

    print('loading k-mer counting table from', htfile, file=sys.stderr)
    htable = khmer.load_counting_hash(htfile)
    ksize = htable.ksize()

    print('writing to', output_filename, file=sys.stderr)
    output = open(output_filename, 'w')

    if args.csv:
        output = csv.writer(output)
        # write headers:
        output.writerow(['name', 'median', 'average', 'stddev', 'seqlen'])

    parse_description = True            # @legacy behavior: split seq headers
    if args.csv:
        parse_description = False       # only enable if we're doing csv out

    for record in screed.open(input_filename,
                              parse_description=parse_description):
        seq = record.sequence.upper()
        if 'N' in seq:
            seq = seq.replace('N', 'A')

        if ksize <= len(seq):
            medn, ave, stdev = htable.get_median_count(seq)
            ave, stdev = [round(x, 9) for x in (ave, stdev)]
            if args.csv:
                output.writerow([record.name, medn, ave, stdev, len(seq)])
            else:
                print(record.name, medn, ave, stdev, len(seq), file=output)
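# The count-median variants above report, for each read, summary statistics
# over the abundances of that read's k-mers. The following is an illustrative,
# khmer-free sketch of the same statistics (median, mean and population
# standard deviation), assuming a plain dict mapping k-mer strings to counts;
# it only mirrors what Countgraph.get_median_count reports and is not the
# library routine itself.
import statistics


def median_count_stats(seq, counts, ksize):
    """Return (median, mean, stddev) of the counts of seq's k-mers."""
    kmer_counts = [counts.get(seq[i:i + ksize], 0)
                   for i in range(len(seq) - ksize + 1)]
    return (statistics.median(kmer_counts),
            statistics.mean(kmer_counts),
            statistics.pstdev(kmer_counts))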
def main():
    info('count-overlap.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='hashbits')

    for infile in [args.ptfile, args.fafile]:
        check_input_files(infile, args.force)

    check_space([args.ptfile, args.fafile], args.force)

    print('loading k-mer presence table from', args.ptfile, file=sys.stderr)
    ht1 = khmer.load_hashbits(args.ptfile)
    kmer_size = ht1.ksize()

    output = open(args.report_filename, 'w')
    f_curve_obj = open(args.report_filename + '.curve', 'w')
    if args.csv:
        f_curve_obj_csv = csv.writer(f_curve_obj)
        # write headers:
        f_curve_obj_csv.writerow(['input_seq', 'overlap_kmer'])

    ht2 = khmer.new_hashbits(kmer_size, args.min_tablesize, args.n_tables)

    (n_unique, n_overlap, list_curve) = ht2.count_overlap(args.fafile, ht1)

    printout1 = """\
dataset1(pt file): %s
dataset2: %s
# of unique k-mers in dataset2: %d
# of overlap unique k-mers: %d
""" % (args.ptfile, args.fafile, n_unique, n_overlap)
    output.write(printout1)

    for i in range(100):
        if args.csv:
            f_curve_obj_csv.writerow([list_curve[100 + i], list_curve[i]])
        else:
            print(list_curve[100 + i], list_curve[i], file=f_curve_obj)

    print('wrote to: ' + args.report_filename, file=sys.stderr)
def main():
    info('filter-stoptags.py', ['graph'])
    args = get_parser().parse_args()
    stoptags = args.stoptags_file
    infiles = args.input_filenames

    for _ in infiles:
        check_input_files(_, args.force)

    check_space(infiles, args.force)

    print >>sys.stderr, 'loading stop tags, with K', args.ksize
    htable = khmer.new_hashbits(args.ksize, 1, 1)
    htable.load_stop_tags(stoptags)

    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_stoptags(seq)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print >>sys.stderr, 'filtering', infile
        outfile = os.path.basename(infile) + '.stopfilt'

        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print >>sys.stderr, 'output in', outfile
def main():
    args = sanitize_help(get_parser()).parse_args()
    stoptags = args.stoptags_file
    infiles = args.input_filenames

    for _ in infiles:
        check_input_files(_, args.force)

    check_space(infiles, args.force)

    print('loading stop tags, with K', args.ksize, file=sys.stderr)
    nodegraph = Nodegraph(args.ksize, 1, 1)
    nodegraph.load_stop_tags(stoptags)

    def process_fn(record):
        name = record.name
        seq = record.sequence
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = nodegraph.trim_on_stoptags(seq)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print('filtering', infile, file=sys.stderr)
        outfile = os.path.basename(infile) + '.stopfilt'

        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print('output in', outfile, file=sys.stderr)
def main():
    info('merge-stoptags.py')
    args = get_parser().parse_args()

    stdbase = args.stdbase

    # @RamRS: This might need some more work
    infiles = []
    for _ in glob.glob(stdbase + "*/*.stoptags"):
        if os.path.exists(_):
            check_input_files(_, False)
            infiles.append(_)
    check_space(infiles, False)

    ht = khmer.new_hashbits(args.ksize, 1, 1)
    for _ in infiles:
        print >> sys.stderr, 'loading stoptags %s' % _
        ht.load_stop_tags(_, 0)

    print >> sys.stderr, 'writing file merge.stoptags'
    ht.save_stop_tags('merge.stoptags')
    print >> sys.stderr, 'done!'
def main(): # pylint: disable=too-many-locals,too-many-statements args = sanitize_help(get_parser()).parse_args() report_on_config(args, graphtype='nodegraph') for infile in args.input_filenames: check_input_files(infile, args.force) check_space(args.input_filenames, args.force) print('Saving k-mer nodegraph to %s' % args.graphbase, file=sys.stderr) print('Loading kmers from sequences in %s' % repr(args.input_filenames), file=sys.stderr) print('--', file=sys.stderr) print('SUBSET SIZE', args.subset_size, file=sys.stderr) print('N THREADS', args.threads, file=sys.stderr) print('--', file=sys.stderr) # load-graph.py print('making nodegraph', file=sys.stderr) nodegraph = khmer_args.create_nodegraph(args) for _, filename in enumerate(args.input_filenames): print('consuming input', filename, file=sys.stderr) nodegraph.consume_seqfile_and_tag(filename) # 0.18 is ACTUAL MAX. Do not change. fp_rate = \ khmer.calc_expected_collisions( nodegraph, args.force, max_false_pos=.15) print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr) # partition-graph # do we want to exhaustively traverse the graph? stop_big_traversals = args.no_big_traverse if stop_big_traversals: print('** This script brakes for lumps: ', 'stop_big_traversals is true.', file=sys.stderr) else: print('** Traverse all the things:', ' stop_big_traversals is false.', file=sys.stderr) # # now, partition! # # divide the tags up into subsets divvy = nodegraph.divide_tags_into_subsets(int(args.subset_size)) divvy = list(divvy) n_subsets = len(divvy) divvy.append(0) # build a queue of tasks: worker_q = queue.Queue() # break up the subsets into a list of worker tasks for _ in range(0, n_subsets): start = divvy[_] end = divvy[_ + 1] worker_q.put((nodegraph, _, start, end)) print('enqueued %d subset tasks' % n_subsets, file=sys.stderr) open('%s.info' % args.graphbase, 'w').write('%d subsets total\n' % (n_subsets)) if n_subsets < args.threads: args.threads = n_subsets # start threads! print('starting %d threads' % args.threads, file=sys.stderr) print('---', file=sys.stderr) threads = [] for _ in range(args.threads): cur_thread = threading.Thread(target=worker, args=(worker_q, args.graphbase, stop_big_traversals)) threads.append(cur_thread) cur_thread.start() print('done starting threads', file=sys.stderr) # wait for threads for _ in threads: _.join() print('---', file=sys.stderr) print('done making subsets! see %s.subset.*.pmap' % (args.graphbase, ), file=sys.stderr) # merge-partitions pmap_files = glob.glob(args.graphbase + '.subset.*.pmap') print('loading %d pmap files (first one: %s)' % (len(pmap_files), pmap_files[0]), file=sys.stderr) nodegraph = khmer.Nodegraph(args.ksize, 1, 1) for pmap_file in pmap_files: print('merging', pmap_file, file=sys.stderr) nodegraph.merge_subset_from_disk(pmap_file) if not args.keep_subsets: print('removing pmap files', file=sys.stderr) for pmap_file in pmap_files: os.unlink(pmap_file) # annotate-partitions for infile in args.input_filenames: print('outputting partitions for', infile, file=sys.stderr) outfile = os.path.basename(infile) + '.part' part_count = nodegraph.output_partitions(infile, outfile) print('output %d partitions for %s' % (part_count, infile), file=sys.stderr) print('partitions are in', outfile, file=sys.stderr)
def main(): # pylint: disable=too-many-locals,too-many-statements info('do-partition.py', ['graph']) args = get_parser().parse_args() report_on_config(args, hashtype='hashbits') for infile in args.input_filenames: check_input_files(infile, args.force) check_space(args.input_filenames, args.force) print >>sys.stderr, 'Saving k-mer presence table to %s' % args.graphbase print >>sys.stderr, 'Loading kmers from sequences in %s' % \ repr(args.input_filenames) print >>sys.stderr, '--' print >>sys.stderr, 'SUBSET SIZE', args.subset_size print >>sys.stderr, 'N THREADS', args.threads print >>sys.stderr, '--' # load-graph print >>sys.stderr, 'making k-mer presence table' htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables) for _, filename in enumerate(args.input_filenames): print >>sys.stderr, 'consuming input', filename htable.consume_fasta_and_tag(filename) # 0.18 is ACTUAL MAX. Do not change. fp_rate = \ khmer.calc_expected_collisions(htable, args.force, max_false_pos=.15) print >>sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate # partition-graph # do we want to exhaustively traverse the graph? stop_big_traversals = args.no_big_traverse if stop_big_traversals: print >>sys.stderr, '** This script brakes for lumps: ', \ 'stop_big_traversals is true.' else: print >>sys.stderr, '** Traverse all the things:', \ ' stop_big_traversals is false.' # # now, partition! # # divide the tags up into subsets divvy = htable.divide_tags_into_subsets(int(args.subset_size)) n_subsets = len(divvy) divvy.append(0) # build a queue of tasks: worker_q = Queue.Queue() # break up the subsets into a list of worker tasks for _ in range(0, n_subsets): start = divvy[_] end = divvy[_ + 1] worker_q.put((htable, _, start, end)) print >>sys.stderr, 'enqueued %d subset tasks' % n_subsets open('%s.info' % args.graphbase, 'w').write('%d subsets total\n' % (n_subsets)) if n_subsets < args.threads: args.threads = n_subsets # start threads! print >>sys.stderr, 'starting %d threads' % args.threads print >>sys.stderr, '---' threads = [] for _ in range(args.threads): cur_thread = threading.Thread(target=worker, args=(worker_q, args.graphbase, stop_big_traversals)) threads.append(cur_thread) cur_thread.start() assert threading.active_count() == args.threads + 1 print >>sys.stderr, 'done starting threads' # wait for threads for _ in threads: _.join() print >>sys.stderr, '---' print >>sys.stderr, 'done making subsets! see %s.subset.*.pmap' % \ (args.graphbase,) # merge-partitions pmap_files = glob.glob(args.graphbase + '.subset.*.pmap') print >>sys.stderr, 'loading %d pmap files (first one: %s)' % \ (len(pmap_files), pmap_files[0]) htable = khmer.new_hashbits(args.ksize, 1, 1) for pmap_file in pmap_files: print >>sys.stderr, 'merging', pmap_file htable.merge_subset_from_disk(pmap_file) if args.remove_subsets: print >>sys.stderr, 'removing pmap files' for pmap_file in pmap_files: os.unlink(pmap_file) # annotate-partitions for infile in args.input_filenames: print >>sys.stderr, 'outputting partitions for', infile outfile = os.path.basename(infile) + '.part' part_count = htable.output_partitions(infile, outfile) print >>sys.stderr, 'output %d partitions for %s' % ( part_count, infile) print >>sys.stderr, 'partitions are in', outfile
def main(): info('sample-reads-randomly.py') args = get_parser().parse_args() for _ in args.filenames: check_input_files(_, args.force) check_space(args.filenames, args.force) # seed the random number generator? if args.random_seed: random.seed(args.random_seed) # bound n_samples num_samples = max(args.num_samples, 1) # # Figure out what the output filename is going to be # output_file = args.output_file if output_file: if num_samples > 1: sys.stderr.write( "Error: cannot specify -o with more than one sample.") if not args.force: sys.exit(1) output_filename = output_file.name else: filename = args.filenames[0] output_filename = os.path.basename(filename) + '.subset' if num_samples == 1: print('Subsampling %d reads using reservoir sampling.' % args.num_reads, file=sys.stderr) print('Subsampled reads will be placed in %s' % output_filename, file=sys.stderr) print('', file=sys.stderr) else: # > 1 print('Subsampling %d reads, %d times,' % (args.num_reads, num_samples), ' using reservoir sampling.', file=sys.stderr) print('Subsampled reads will be placed in %s.N' % output_filename, file=sys.stderr) print('', file=sys.stderr) reads = [] for n in range(num_samples): reads.append([]) # read through all the sequences and load/resample the reservoir for filename in args.filenames: print('opening', filename, 'for reading', file=sys.stderr) screed_iter = screed.open(filename, parse_description=False) for count, (_, ispair, rcrd1, rcrd2) in enumerate(broken_paired_reader( screed_iter, force_single=args.force_single)): if count % 10000 == 0: print('...', count, 'reads scanned', file=sys.stderr) if count >= args.max_reads: print('reached upper limit of %d reads' % args.max_reads, '(see -M); exiting', file=sys.stderr) break # collect first N reads if count < args.num_reads: for n in range(num_samples): reads[n].append((rcrd1, rcrd2)) else: assert len(reads[n]) <= count # use reservoir sampling to replace reads at random # see http://en.wikipedia.org/wiki/Reservoir_sampling for n in range(num_samples): guess = random.randint(1, count) if guess <= args.num_reads: reads[n][guess - 1] = (rcrd1, rcrd2) # output all the subsampled reads: if len(reads) == 1: print('Writing %d sequences to %s' % (len(reads[0]), output_filename), file=sys.stderr) if not output_file: output_file = open(output_filename, 'w') for records in reads[0]: write_record(records[0], output_file) if records[1] is not None: write_record(records[1], output_file) else: for n in range(num_samples): n_filename = output_filename + '.%d' % n print('Writing %d sequences to %s' % (len(reads[n]), n_filename), file=sys.stderr) output_file = open(n_filename, 'w') for records in reads[n]: write_record(records[0], output_file) if records[1] is not None: write_record(records[1], output_file)
def main(): args = sanitize_help(get_parser()).parse_args() infile = args.infile check_input_files(infile, args.force) check_space([infile], args.force) # decide where to put output files - specific directory? or just default? if infile in ('/dev/stdin', '-'): # seqan only treats '-' as "read from stdin" infile = '-' if not (args.output_paired and args.output_single): print("Accepting input from stdin; output filenames must be " "provided.", file=sys.stderr) sys.exit(1) elif args.output_dir: if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) out1 = args.output_dir + '/' + os.path.basename(infile) + '.se' out2 = args.output_dir + '/' + os.path.basename(infile) + '.pe' else: out1 = os.path.basename(infile) + '.se' out2 = os.path.basename(infile) + '.pe' # OVERRIDE default output file locations with -p, -s if args.output_paired: paired_fp = get_file_writer(args.output_paired, args.gzip, args.bzip) out2 = paired_fp.name else: # Don't override, just open the default filename from above paired_fp = get_file_writer(open(out2, 'wb'), args.gzip, args.bzip) if args.output_single: single_fp = get_file_writer(args.output_single, args.gzip, args.bzip) out1 = args.output_single.name else: # Don't override, just open the default filename from above single_fp = get_file_writer(open(out1, 'wb'), args.gzip, args.bzip) print('reading file "%s"' % infile, file=sys.stderr) print('outputting interleaved pairs to "%s"' % out2, file=sys.stderr) print('outputting orphans to "%s"' % out1, file=sys.stderr) n_pe = 0 n_se = 0 screed_iter = ReadParser(infile) for index, is_pair, read1, read2 in broken_paired_reader(screed_iter): if index % 100000 == 0 and index > 0: print('...', index, file=sys.stderr) if is_pair: write_record_pair(read1, read2, paired_fp) n_pe += 1 else: write_record(read1, single_fp) n_se += 1 single_fp.close() paired_fp.close() if n_pe == 0: raise Exception("no paired reads!? check file formats...") print('DONE; read %d sequences,' ' %d pairs and %d singletons' % (n_pe * 2 + n_se, n_pe, n_se), file=sys.stderr) print('wrote to: %s and %s' % (out2, out1), file=sys.stderr)
def main(): parser = get_parser() parser.epilog = parser.epilog.replace( "`reservoir sampling\n" "<http://en.wikipedia.org/wiki/Reservoir_sampling>`__ algorithm.", "reservoir sampling algorithm. " "http://en.wikipedia.org/wiki/Reservoir_sampling") args = sanitize_help(parser).parse_args() for name in args.filenames: check_input_files(name, args.force) # seed the random number generator? if args.random_seed: random.seed(args.random_seed) # bound n_samples num_samples = max(args.num_samples, 1) # # Figure out what the output filename is going to be if args.output_file: output_filename = args.output_file.name if num_samples > 1: sys.stderr.write( "Error: cannot specify -o with more than one sample.") if not args.force: print( "NOTE: This can be overridden using the --force" " argument", file=sys.stderr) sys.exit(1) else: filename = args.filenames[0] if filename in ('/dev/stdin', '-'): print( "Accepting input from stdin; output filename must " "be provided with '-o'.", file=sys.stderr) sys.exit(1) output_filename = os.path.basename(filename) + '.subset' filename = args.filenames[0] if filename in ('/dev/stdin', '-'): # seqan only treats '-' as "read from stdin" filename = '-' if num_samples == 1: print('Subsampling %d reads using reservoir sampling.' % args.num_reads, file=sys.stderr) print('Subsampled reads will be placed in %s' % output_filename, file=sys.stderr) print('', file=sys.stderr) else: # > 1 print('Subsampling %d reads, %d times,' % (args.num_reads, num_samples), ' using reservoir sampling.', file=sys.stderr) print('Subsampled reads will be placed in %s.N' % output_filename, file=sys.stderr) print('', file=sys.stderr) reads = [] for _ in range(num_samples): reads.append([]) # read through all the sequences and load/resample the reservoir for filename in args.filenames: print('opening', filename, 'for reading', file=sys.stderr) for count, (_, _, rcrd1, rcrd2) in enumerate( broken_paired_reader(ReadParser(filename), force_single=args.force_single)): if count % 10000 == 0: print('...', count, 'reads scanned', file=sys.stderr) if count >= args.max_reads: print('reached upper limit of %d reads' % args.max_reads, '(see -M); exiting', file=sys.stderr) break # collect first N reads if count < args.num_reads: for sample in range(num_samples): reads[sample].append((rcrd1, rcrd2)) else: for sample in range(num_samples): assert len(reads[sample]) <= count # use reservoir sampling to replace reads at random # see http://en.wikipedia.org/wiki/Reservoir_sampling for n in range(num_samples): guess = random.randint(1, count) if guess <= args.num_reads: reads[n][guess - 1] = (rcrd1, rcrd2) # output all the subsampled reads: if len(reads) == 1: print('Writing %d sequences to %s' % (len(reads[0]), output_filename), file=sys.stderr) output_file = args.output_file if not output_file: output_file = open(output_filename, 'wb') output_file = get_file_writer(output_file, args.gzip, args.bzip) for records in reads[0]: write_record(records[0], output_file) if records[1] is not None: write_record(records[1], output_file) else: for n in range(num_samples): n_filename = output_filename + '.%d' % n print('Writing %d sequences to %s' % (len(reads[n]), n_filename), file=sys.stderr) output_file = get_file_writer(open(n_filename, 'wb'), args.gzip, args.bzip) for records in reads[n]: write_record(records[0], output_file) if records[1] is not None: write_record(records[1], output_file)
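# Both sample-reads-randomly variants above rely on the classic reservoir
# sampling update: keep the first num_reads records, then replace an already
# kept record with probability num_reads/count. A self-contained sketch of
# that core loop, with illustrative names and no khmer dependencies:
import random


def reservoir_sample(records, num_keep, seed=None):
    """Choose num_keep records uniformly from a stream of unknown length."""
    rng = random.Random(seed)
    reservoir = []
    for count, record in enumerate(records, start=1):
        if count <= num_keep:
            reservoir.append(record)        # keep the first num_keep records
        else:
            guess = rng.randint(1, count)   # keep with probability num_keep/count
            if guess <= num_keep:
                reservoir[guess - 1] = record
    return reservoir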