def main():
    """Trim reads with a saved countgraph and write the survivors.

    Command line: ``<countgraph file> <reads file> [<reads file> ...]``.

    Each input file is read one record at a time (pairing is ignored via
    ``force_single=True``).  Reads containing ``N`` are skipped; the rest
    are passed to ``trim_below_abundance`` with the module-level ``CUTOFF``
    and written to ``<basename of input>.below`` in the current directory
    when the returned trim position is at least K.
    """
    counting_ht = sys.argv[1]
    infiles = sys.argv[2:]

    print('file with ht: %s' % counting_ht)
    print('making hashtable')
    ht = Countgraph.load(counting_ht)
    K = ht.ksize()

    for infile in infiles:
        print('filtering', infile)
        outfile = os.path.basename(infile) + '.below'

        # The context manager makes sure each output is flushed and closed
        # before the next input is processed.
        with open(outfile, 'w') as outfp:
            paired_iter = broken_paired_reader(ReadParser(infile),
                                               min_length=K,
                                               force_single=True)
            for n, is_pair, read1, read2 in paired_iter:
                name = read1.name
                seq = read1.sequence
                if 'N' in seq:
                    # Skip just this read.  The previous
                    # ``return None, None`` ended the whole script at the
                    # first read containing an N.
                    continue

                trim_seq, trim_at = ht.trim_below_abundance(seq, CUTOFF)
                if trim_at >= K:
                    write_record(screed.Record(name=name, sequence=trim_seq),
                                 outfp)
def paired_reader(readstream):
    """Yield ``(counter, read, mate)`` for every read in *readstream*.

    Records come from ``broken_paired_reader``.  Each record yields
    ``(counter, read1, read2)``; when the record is a pair, a second tuple
    ``(counter, read2, read1)`` follows so that both mates appear in the
    first position once.  ``counter`` starts at 1 and increases by one per
    yielded tuple.
    """
    counter = 0
    for _, has_mate, first, second in broken_paired_reader(readstream):
        counter += 1
        yield counter, first, second

        if not has_mate:
            continue

        counter += 1
        yield counter, second, first
def main():
    """Build a countgraph from one read file, then trim that same file.

    Steps, in order:

    1. Parse arguments, configure logging, and check the input file and
       available disk space (including space for ``--savegraph``).
    2. Create a countgraph and load ``args.datafile`` into it using
       ``args.threads`` threads that share one ``ReadParser``.
    3. Re-read the file one record at a time, trim each read with
       ``trim_record`` and write the non-empty results to ``args.outfile``
       (default: ``<basename>.abundfilt``), optionally compressed.
    4. Optionally save the countgraph to ``args.savegraph``.
    """
    args = sanitize_help(get_parser()).parse_args()

    configure_logging(args.quiet)
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)

    if args.savegraph:
        tablesize = calculate_graphsize(args, "countgraph")
        check_space_for_graph(args.savegraph, tablesize, args.force)

    report_on_config(args)

    log_info("making countgraph")
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    log_info("consuming input, round 1 -- {datafile}",
             datafile=args.datafile)
    for _ in range(args.threads):
        cur_thread = threading.Thread(
            target=graph.consume_fasta_with_reads_parser,
            args=(rparser,))
        threads.append(cur_thread)
        cur_thread.start()

    # wait for every loader thread before reading counts from the graph
    for cur_thread in threads:
        cur_thread.join()

    log_info("Total number of unique k-mers: {nk}",
             nk=graph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    log_info("fp rate estimated to be {fpr:1.3f}", fpr=fp_rate)

    # the filtering loop
    log_info("filtering {datafile}", datafile=args.datafile)
    if args.outfile is None:
        outfile = os.path.basename(args.datafile) + ".abundfilt"
    else:
        outfile = args.outfile

    with open(outfile, "wb") as raw_fp:
        outfp = get_file_writer(raw_fp, args.gzip, args.bzip)

        paired_iter = broken_paired_reader(ReadParser(args.datafile),
                                           min_length=graph.ksize(),
                                           force_single=True)

        for n, is_pair, read1, read2 in paired_iter:
            # force_single=True means pairs must never be reported
            assert not is_pair
            assert read2 is None

            trimmed_record, _ = trim_record(graph, read1, args.cutoff,
                                            args.variable_coverage,
                                            args.normalize_to)
            if trimmed_record:
                # (a leftover debugging print of the record to stdout was
                # removed here)
                write_record(trimmed_record, outfp)

        # Close the writer before the underlying file so that a wrapping
        # writer (if get_file_writer returned one) can finish its stream.
        outfp.close()

    log_info("output in {outfile}", outfile=outfile)

    if args.savegraph:
        log_info("Saving k-mer countgraph filename {graph}",
                 graph=args.savegraph)
        graph.save(args.savegraph)
def main():
    """Trim reads in one or more files using a previously saved countgraph.

    Reads the countgraph named by ``args.input_graph``, then for every
    input file trims each read with ``trim_record`` and writes non-empty
    results either to the single output given with ``-o`` or to
    ``<basename>.abundfilt`` (one file per input).  Reading from stdin
    requires ``-o``; otherwise the script logs an error and exits with
    status 1.
    """
    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('filter-abund.py', ['counting'])

    configure_logging(args.quiet)

    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
       args.single_output_file:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    log_info('loading countgraph: {graph}', graph=args.input_graph)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    log_info("K: {ksize}", ksize=ksize)

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        log_info('filtering {infile}', infile=infile)

        # raw_fp is set only when this iteration opens its own output file
        raw_fp = None
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            raw_fp = open(outfile, 'wb')
            outfp = get_file_writer(raw_fp, args.gzip, args.bzip)

        try:
            paired_iter = broken_paired_reader(ReadParser(infile),
                                               min_length=ksize,
                                               force_single=True)

            for n, is_pair, read1, read2 in paired_iter:
                # force_single=True means pairs must never be reported
                assert not is_pair
                assert read2 is None

                trimmed_record, _ = trim_record(countgraph, read1,
                                                args.cutoff,
                                                args.variable_coverage,
                                                args.normalize_to)
                if trimmed_record:
                    write_record(trimmed_record, outfp)
        finally:
            # Per-input outputs were previously never closed.  Close the
            # writer first, then the file it wraps (a second close on the
            # same object is harmless).
            if raw_fp is not None:
                outfp.close()
                raw_fp.close()

        log_info('output in {outfile}', outfile=outfile)
def gather(self, **kw):
    """Run ``broken_paired_reader`` over ``self.stream`` and collect names.

    Keyword arguments are forwarded to ``broken_paired_reader`` unchanged.

    Returns a 3-tuple ``(names, n, m)``:

    * ``names`` -- list of ``(read1.name, read2.name)`` for pairs and
      ``(read1.name, None)`` for single reads, in stream order;
    * ``n`` -- the first element of the last tuple produced by the reader
      (``0`` if the reader produced nothing);
    * ``m`` -- the number of tuples produced by the reader.
    """
    reader = broken_paired_reader(self.stream, **kw)

    names = []
    # Pre-set so an empty stream returns ([], 0, 0) instead of raising
    # UnboundLocalError on the return statement.
    n = 0
    m = 0
    for n, is_pair, read1, read2 in reader:
        if is_pair:
            names.append((read1.name, read2.name))
        else:
            names.append((read1.name, None))
        m += 1

    return names, n, m
def _write_trimmed_reads(countgraph, infile, ksize, args, outfp):
    """Trim every read of *infile* and write the non-empty results to *outfp*."""
    single_reads = broken_paired_reader(ReadParser(infile), min_length=ksize,
                                        force_single=True)

    for n, is_pair, read1, read2 in single_reads:
        # force_single=True means pairs must never be reported
        assert not is_pair
        assert read2 is None

        trimmed_record, _ = trim_record(countgraph, read1, args.cutoff,
                                        args.variable_coverage,
                                        args.normalize_to)
        if trimmed_record:
            write_record(trimmed_record, outfp)


def main():
    """Trim reads in the given files using a previously saved countgraph.

    The countgraph is loaded from ``args.input_graph``.  Output goes to the
    single file given with ``-o`` when present, otherwise to
    ``<basename>.abundfilt`` for each input.  When an input is stdin and no
    ``-o`` was given, an error is logged and the script exits with
    status 1.
    """
    args = sanitize_help(get_parser()).parse_args()

    configure_logging(args.quiet)

    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
       args.single_output_file:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    log_info('loading countgraph: {graph}', graph=args.input_graph)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    log_info("K: {ksize}", ksize=ksize)

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        log_info('filtering {infile}', infile=infile)

        if args.single_output_file:
            _write_trimmed_reads(countgraph, infile, ksize, args, outfp)
        else:
            outfile = os.path.basename(infile) + '.abundfilt'
            # The ``with`` block closes the per-input file; these handles
            # used to stay open until interpreter exit.
            with open(outfile, 'wb') as raw_fp:
                outfp = get_file_writer(raw_fp, args.gzip, args.bzip)
                try:
                    _write_trimmed_reads(countgraph, infile, ksize, args,
                                         outfp)
                finally:
                    # close the writer before the file it wraps
                    outfp.close()

        log_info('output in {outfile}', outfile=outfile)
def test_diginorm():
    """Check the generator-based diginorm pipeline against stored output.

    The reads in ``test_files/simple-metagenome-reads.fa`` are passed
    through ``broken_paired_reader`` -> ``clean_reads`` -> ``diginorm`` and
    then compared, record by record, with the stored file
    ``simple-metagenome-reads.fa.keep.k20.C20``.

    Note: ``zip`` stops at the shorter sequence, so a pipeline that yields
    fewer (or more) reads than the stored file is not detected here.
    """
    filename = 'test_files/simple-metagenome-reads.fa'

    graph = khmer.new_counting_hash(20, 1e7, 4)

    # An output handle for '<basename>.keep' used to be opened here.  It
    # was never written to or closed and left an empty file in the working
    # directory, so it has been removed.

    # khmer scripts/normalize-by-median.py, using generators
    input_iter = screed.open(filename)
    input_iter = broken_paired_reader(input_iter)
    input_iter = clean_reads(input_iter)
    input_iter = diginorm(input_iter, graph, 20)

    script_result = screed.open('test_files/'
                                'simple-metagenome-reads.fa.keep.k20.C20')

    for read_a, read_b in zip(broken_paired_to_single(input_iter),
                              script_result):
        # single-argument form prints the same on Python 2 and Python 3
        print(read_a.name)
        assert read_a == read_b, (read_a, read_b)
def main():
    """Split an interleaved read file into pairs (.pe) and orphans (.se).

    The output prefix is the basename of ``args.infile`` unless a second
    command-line argument is present, in which case ``sys.argv[2]`` is
    used.  Progress and a final summary go to stderr.

    Raises:
        Exception: if no paired reads were found in the input.
    """
    info('extract-paired-reads.py')
    args = get_parser().parse_args()

    check_input_files(args.infile, args.force)
    check_space([args.infile], args.force)

    # a second command-line argument overrides the default output prefix
    if len(sys.argv) > 2:
        outfile = sys.argv[2]
    else:
        outfile = os.path.basename(args.infile)

    orphan_path = outfile + '.se'
    pair_path = outfile + '.pe'
    single_fp = open(orphan_path, 'w')
    paired_fp = open(pair_path, 'w')

    print >>sys.stderr, 'reading file "%s"' % args.infile
    print >>sys.stderr, 'outputting interleaved pairs to "%s.pe"' % outfile
    print >>sys.stderr, 'outputting orphans to "%s.se"' % outfile

    n_pe = n_se = 0

    records = screed.open(args.infile, parse_description=False)
    for index, is_pair, read1, read2 in broken_paired_reader(records):
        if index > 0 and index % 100000 == 0:
            print >>sys.stderr, '...', index

        if not is_pair:
            write_record(read1, single_fp)
            n_se += 1
            continue

        write_record_pair(read1, read2, paired_fp)
        n_pe += 1

    single_fp.close()
    paired_fp.close()

    if n_pe == 0:
        raise Exception("no paired reads!? check file formats...")

    total_reads = n_pe * 2 + n_se
    print >>sys.stderr, 'DONE; read %d sequences,' \
        ' %d pairs and %d singletons' % (total_reads, n_pe, n_se)

    print >>sys.stderr, 'wrote to: ' + orphan_path + ' and ' + pair_path
def main():
    """Separate paired reads from orphaned reads in an interleaved file.

    Pairs are written interleaved to ``<prefix>.pe`` and single reads to
    ``<prefix>.se``.  ``<prefix>`` is ``sys.argv[2]`` when more than two
    command-line entries exist, otherwise the basename of ``args.infile``.

    Raises:
        Exception: if the input contained no paired reads at all.
    """
    info('extract-paired-reads.py')
    args = get_parser().parse_args()

    source = args.infile
    check_file_status(source, args.force)
    check_space([source], args.force)

    outfile = os.path.basename(source)
    if len(sys.argv) > 2:
        outfile = sys.argv[2]

    single_fp = open(outfile + '.se', 'w')
    paired_fp = open(outfile + '.pe', 'w')

    err = sys.stderr
    print >> err, 'reading file "%s"' % source
    print >> err, 'outputting interleaved pairs to "%s.pe"' % outfile
    print >> err, 'outputting orphans to "%s.se"' % outfile

    n_pe = 0
    n_se = 0

    reader = broken_paired_reader(
        screed.open(source, parse_description=False))
    for index, is_pair, read1, read2 in reader:
        report_progress = index % 100000 == 0 and index > 0
        if report_progress:
            print >> err, '...', index

        if is_pair:
            write_record_pair(read1, read2, paired_fp)
            n_pe += 1
        else:
            write_record(read1, single_fp)
            n_se += 1

    for handle in (single_fp, paired_fp):
        handle.close()

    if n_pe == 0:
        raise Exception("no paired reads!? check file formats...")

    summary = 'DONE; read %d sequences,' \
        ' %d pairs and %d singletons' % \
        (n_pe * 2 + n_se, n_pe, n_se)
    print >> err, summary

    destinations = 'wrote to: ' + outfile \
        + '.se' + ' and ' + outfile + '.pe'
    print >> err, destinations
def main():
    """Keep reads whose median k-mer count in the input graph is non-zero.

    A countgraph is created from the command-line arguments and loaded
    with the sequences in ``input_filename``.  Each record of
    ``read_filename`` is then written to the output file (default
    ``<basename of read_filename>.sweep2``) when its median count is
    non-zero; a pair is written when either mate has a non-zero median.
    Progress (records seen, records kept) is printed every 10000 records.
    """
    parser = build_nodegraph_args()
    parser.add_argument('-o', '--outfile',
                        help='output file; default is "infile".sweep2')
    parser.add_argument('-q', '--quiet')
    parser.add_argument('input_filename')
    parser.add_argument('read_filename')
    args = parser.parse_args()

    inp = args.input_filename
    readsfile = args.read_filename

    outfile = os.path.basename(readsfile) + '.sweep2'
    if args.outfile:
        outfile = args.outfile

    # ``with`` closes the output when the sweep finishes or fails; the
    # handle was previously left open.
    with open(outfile, 'w') as outfp:
        # create the counting data structure (a countgraph, although the
        # argument parser comes from build_nodegraph_args)
        ht = khmer_args.create_countgraph(args)

        # load the input sequences into the graph
        print('loading input reads from', inp)
        ht.consume_seqfile(inp)

        print('starting sweep.')

        m = 0  # number of records written
        instream = screed.open(readsfile)
        for n, is_pair, read1, read2 in broken_paired_reader(instream):
            if n % 10000 == 0:
                print('...', n, m)

            if is_pair:
                count1 = ht.get_median_count(read1.sequence)[0]
                count2 = ht.get_median_count(read2.sequence)[0]
                if count1 or count2:
                    m += 1
                    write_record_pair(read1, read2, outfp)
            else:
                count = ht.get_median_count(read1.sequence)[0]
                if count:
                    m += 1
                    write_record(read1, outfp)
def main():
    """Write out reads that have a non-zero median count in the input graph.

    The graph is a countgraph built from the command-line arguments and
    loaded from ``input_filename`` with ``consume_fasta``.  Records of
    ``read_filename`` are read with ``broken_paired_reader``; a single read
    is kept when its median count is non-zero, and a pair is kept when at
    least one mate has a non-zero median.  Output goes to ``--outfile`` or,
    by default, ``<basename of read_filename>.sweep2``.
    """
    parser = build_nodegraph_args()
    parser.add_argument('-o', '--outfile',
                        help='output file; default is "infile".sweep2')
    parser.add_argument('-q', '--quiet')
    parser.add_argument('input_filename')
    parser.add_argument('read_filename')
    args = parser.parse_args()

    inp = args.input_filename
    readsfile = args.read_filename

    outfile = os.path.basename(readsfile) + '.sweep2'
    if args.outfile:
        outfile = args.outfile
    outfp = open(outfile, 'w')

    try:
        # create the counting data structure (a countgraph)
        ht = khmer_args.create_countgraph(args)

        # load the input sequences into the graph
        print('loading input reads from', inp)
        ht.consume_fasta(inp)

        print('starting sweep.')

        def median(read):
            # first element of get_median_count's result is the median
            return ht.get_median_count(read.sequence)[0]

        m = 0  # number of records written
        instream = screed.open(readsfile)
        for n, is_pair, read1, read2 in broken_paired_reader(instream):
            if n % 10000 == 0:
                print('...', n, m)

            if is_pair:
                # compute both medians up front, as the original code did
                count1 = median(read1)
                count2 = median(read2)
                if count1 or count2:
                    m += 1
                    write_record_pair(read1, read2, outfp)
            elif median(read1):
                m += 1
                write_record(read1, outfp)
    finally:
        # the output handle was previously never closed
        outfp.close()
def test_BrokenPairedReader_lowercase():
    """Mixed-case screed records keep ``sequence`` and gain ``cleaned_seq``.

    Three records are fed in: a /1, its /2 mate, and a stray /2.  The
    first result must be the pair and the second the lone read; for every
    read ``sequence`` is unchanged and ``cleaned_seq`` equals ``'ACGTA'``.
    """
    stream = [
        screed.Record(name='seq1/1', sequence='acgtn'),
        screed.Record(name='seq1/2', sequence='AcGtN'),
        screed.Record(name='seq1/2', sequence='aCgTn'),
    ]

    results = [(read1, read2)
               for num, is_pair, read1, read2 in broken_paired_reader(stream)]

    # first result: the /1 + /2 pair
    a, b = results[0]
    for read, raw in ((a, 'acgtn'), (b, 'AcGtN')):
        assert read.sequence == raw
        assert read.cleaned_seq == 'ACGTA'

    # second result: the unpaired trailing read
    c, d = results[1]
    assert c.sequence == 'aCgTn'
    assert c.cleaned_seq == 'ACGTA'
    assert d is None
def test_BrokenPairedReader_lowercase_khmer_Read():
    """Same checks as the screed variant, but with ``khmer.Read`` inputs.

    ``khmer.Read`` objects are expected to carry a ``cleaned_seq``
    attribute already; the reader must pass ``sequence`` through unchanged
    and expose ``cleaned_seq == 'ACGTA'`` for every read.
    """
    # use khmer.Read objects which should automatically have a `cleaned_seq`
    # attribute
    raw_reads = (('seq1/1', 'acgtn'),
                 ('seq1/2', 'AcGtN'),
                 ('seq1/2', 'aCgTn'))
    stream = [khmer.Read(name=name, sequence=seq) for name, seq in raw_reads]

    results = []
    for num, is_pair, read1, read2 in broken_paired_reader(stream):
        results.append((read1, read2))

    first, second = results[0], results[1]

    a, b = first
    assert a.sequence == 'acgtn'
    assert a.cleaned_seq == 'ACGTA'
    assert b.sequence == 'AcGtN'
    assert b.cleaned_seq == 'ACGTA'

    c, d = second
    assert c.sequence == 'aCgTn'
    assert c.cleaned_seq == 'ACGTA'
    assert d is None
def main():
    """Trim low-abundance k-mers from reads in a streaming, two-pass way.

    First pass, per input file: reads (kept in pairs unless
    ``args.ignore_pairs``) whose median k-mer count is below
    ``args.normalize_to`` are added to the counting table and set aside in
    a temporary ``.pass2`` file.  The others are trimmed with
    ``trim_on_abundance`` at ``args.cutoff`` and written to the output.

    Second pass: the set-aside reads are re-examined one by one.  With
    ``args.variable_coverage`` the reads still below the limit are written
    unchanged; all others are trimmed and written when at least K bases
    remain.

    Output goes to ``args.out`` if given, otherwise to
    ``<basename>.abundtrim`` per input file.  Statistics are printed to
    stderr and the counting table is saved when ``args.savetable`` is set.
    """
    info('trim-low-abund.py', ['streaming'])
    parser = get_parser()
    args = parser.parse_args()

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        print >>sys.stderr, \
            "Error: Cannot input the same filename multiple times."
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savetable:
        check_space_for_hashtable(
            args.n_tables * args.min_tablesize, args.force)

    K = args.ksize
    CUTOFF = args.cutoff
    NORMALIZE_LIMIT = args.normalize_to

    if args.loadtable:
        print >> sys.stderr, 'loading k-mer counting table from', \
            args.loadtable
        ct = khmer.load_counting_hash(args.loadtable)
    else:
        print >> sys.stderr, 'making k-mer counting table'
        ct = khmer.new_counting_hash(K, args.min_tablesize, args.n_tables)

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print >>sys.stderr, 'created temporary directory %s; ' \
        'use -T to change location' % tempdir

    # ### FIRST PASS ###

    save_pass2_total = 0

    n_bp = 0
    n_reads = 0
    written_bp = 0
    written_reads = 0
    trimmed_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        if args.out is None:
            trimfp = open(os.path.basename(filename) + '.abundtrim', 'w')
        else:
            trimfp = args.out

        pass2list.append((filename, pass2filename, trimfp))

        screed_iter = screed.open(filename, parse_description=False)
        pass2fp = open(pass2filename, 'w')

        save_pass2 = 0
        n = 0

        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=args.ignore_pairs)
        for n, is_pair, read1, read2 in paired_iter:
            if n % 10000 == 0:
                print >>sys.stderr, '...', n, filename, save_pass2, \
                    n_reads, n_bp, written_reads, written_bp

            # we want to track paired reads here, to make sure that pairs
            # are not split between first pass and second pass.

            if is_pair:
                n_reads += 2
                n_bp += len(read1.sequence) + len(read2.sequence)

                seq1 = read1.sequence.replace('N', 'A')
                seq2 = read2.sequence.replace('N', 'A')

                med1, _, _ = ct.get_median_count(seq1)
                med2, _, _ = ct.get_median_count(seq2)

                if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT:
                    # either mate is low coverage: count both and keep the
                    # pair together for the second pass
                    ct.consume(seq1)
                    ct.consume(seq2)
                    write_record_pair(read1, read2, pass2fp)
                    save_pass2 += 2
                else:
                    _, trim_at1 = ct.trim_on_abundance(seq1, CUTOFF)
                    _, trim_at2 = ct.trim_on_abundance(seq2, CUTOFF)

                    if trim_at1 >= K:
                        read1 = trim_record(read1, trim_at1)

                    if trim_at2 >= K:
                        read2 = trim_record(read2, trim_at2)

                    if trim_at1 != len(seq1):
                        trimmed_reads += 1
                    if trim_at2 != len(seq2):
                        trimmed_reads += 1

                    write_record_pair(read1, read2, trimfp)
                    written_reads += 2
                    written_bp += trim_at1 + trim_at2
            else:
                n_reads += 1
                n_bp += len(read1.sequence)

                seq = read1.sequence.replace('N', 'A')

                med, _, _ = ct.get_median_count(seq)

                # has this portion of the graph saturated? if not,
                # consume & save => pass2.
                if med < NORMALIZE_LIMIT:
                    ct.consume(seq)
                    write_record(read1, pass2fp)
                    save_pass2 += 1
                else:                       # trim!!
                    _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
                    if trim_at >= K:
                        new_read = trim_record(read1, trim_at)
                        write_record(new_read, trimfp)

                        written_reads += 1
                        written_bp += trim_at

                        if trim_at != len(read1.sequence):
                            trimmed_reads += 1

        pass2fp.close()

        print >>sys.stderr, '%s: kept aside %d of %d from first pass, in %s' \
            % (filename, save_pass2, n, filename)
        save_pass2_total += save_pass2

    # ### SECOND PASS. ###

    skipped_n = 0
    skipped_bp = 0
    for _, pass2filename, trimfp in pass2list:
        print >> sys.stderr, ('second pass: looking at sequences kept aside '
                              'in %s') % pass2filename

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.

        for n, read in enumerate(
                screed.open(pass2filename, parse_description=False)):
            if n % 10000 == 0:
                print >>sys.stderr, '... x 2', n, pass2filename, \
                    written_reads, written_bp

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ct.get_median_count(seq)

            # do we retain low-abundance components unchanged?
            if med < NORMALIZE_LIMIT and args.variable_coverage:
                write_record(read, trimfp)

                written_reads += 1
                written_bp += len(read.sequence)
                skipped_n += 1
                skipped_bp += len(read.sequence)

            # otherwise, examine/trim/truncate.
            else:    # med >= NORMALIZE LIMIT or not args.variable_coverage
                _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
                if trim_at >= K:
                    new_read = trim_record(read, trim_at)
                    write_record(new_read, trimfp)

                    written_reads += 1
                    written_bp += trim_at

                    if trim_at != len(read.sequence):
                        trimmed_reads += 1

        print >> sys.stderr, 'removing %s' % pass2filename
        os.unlink(pass2filename)

        # Close the per-input outputs opened above; a caller-supplied
        # args.out is shared between inputs and is left open.
        if args.out is None:
            trimfp.close()

    print >> sys.stderr, 'removing temp directory & contents (%s)' % tempdir
    shutil.rmtree(tempdir)

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\
        n_reads * 100.0

    print >> sys.stderr, 'read %d reads, %d bp' % (n_reads, n_bp,)
    print >> sys.stderr, 'wrote %d reads, %d bp' % (written_reads, written_bp,)
    print >>sys.stderr, 'looked at %d reads twice (%.2f passes)' % \
        (save_pass2_total, n_passes)
    print >>sys.stderr, 'removed %d reads and trimmed %d reads (%.2f%%)' % \
        (n_reads - written_reads, trimmed_reads, percent_reads_trimmed)
    print >>sys.stderr, 'trimmed or removed %.2f%% of bases (%d total)' % \
        ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
        print >>sys.stderr, '%d reads were high coverage (%.2f%%);' % \
            (n_reads - skipped_n, percent_reads_hicov)
        # the two literals are concatenated, so the space after "low" is
        # needed (the message used to read "lowcoverage")
        print >> sys.stderr, ('skipped %d reads/%d bases because of low '
                              'coverage') % (skipped_n, skipped_bp)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975

    print >>sys.stderr, \
        'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    print >> sys.stderr, 'output in *.abundtrim'

    if args.savetable:
        print >> sys.stderr, "Saving k-mer counting table to", args.savetable
        ct.save(args.savetable)
def main():  # pylint: disable=too-many-branches,too-many-statements
    """Run digital normalization over every input file.

    Parses the command line, refuses duplicate input basenames (unless a
    single output file is used), checks inputs and disk space, loads or
    creates the countgraph, then streams each input through a
    ``Normalizer`` wrapped in ``WithDiagnostics`` and writes the records
    it yields to ``<basename>.keep`` or to the single output file.

    Exits with status 1 on duplicate basenames, or when an input is stdin
    and no single output file was given.
    """
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    configure_logging(args.quiet)
    report_on_config(args)

    report_fp = args.report
    force_single = args.force_single

    # check for similar filenames
    # if we're using a single output file only check for identical filenames
    # otherwise, check for identical BASE names as well.
    filenames = []
    basenames = []
    for pathfilename in args.input_filenames:
        filenames.append(pathfilename)
        if args.single_output_file:
            continue  # nothing more to worry about

        basename = os.path.basename(pathfilename)
        if basename in basenames:
            log_error('ERROR: Duplicate filename--Cannot handle this!')
            log_error('** Exiting!')
            sys.exit(1)

        basenames.append(basename)

    # check that files exist and there is sufficient output disk space.
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph is not None:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    # load or create counting table.
    if args.loadgraph:
        log_info('loading k-mer countgraph from {graph}',
                 graph=args.loadgraph)
        countgraph = Countgraph.load(args.loadgraph)
    else:
        log_info('making countgraph')
        countgraph = khmer_args.create_countgraph(args)

    # create an object to handle diginorm of all files
    norm = Normalizer(args.cutoff, countgraph)
    with_diagnostics = WithDiagnostics(norm, report_fp, args.report_frequency)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = []
    for element in filenames:
        files.append([element, args.paired])
    if args.unpaired_reads:
        # the extra unpaired file is never required to contain pairs
        files.append([args.unpaired_reads, False])

    # filled in by catch_io_errors (it receives this list); reported at
    # the end when --force is set
    corrupt_files = []
    outfp = None
    output_name = None

    if args.single_output_file:
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)
    else:
        # per-input output names are derived from the input name, which
        # is not possible for stdin
        if '-' in filenames or '/dev/stdin' in filenames:
            print("Accepting input from stdin; output filename must "
                  "be provided with '-o'.", file=sys.stderr)
            sys.exit(1)

    #
    # main loop: iterate over all files given, do diginorm.
    #

    for filename, require_paired in files:
        if not args.single_output_file:
            output_name = os.path.basename(filename) + '.keep'
            outfp = open(output_name, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        # failsafe context manager in case an input file breaks
        with catch_io_errors(filename, outfp, args.single_output_file,
                             args.force, corrupt_files):
            screed_iter = clean_input_reads(screed.open(filename))
            reader = broken_paired_reader(screed_iter, min_length=args.ksize,
                                          force_single=force_single,
                                          require_paired=require_paired)

            # actually do diginorm
            for record in with_diagnostics(reader, filename):
                if record is not None:
                    write_record(record, outfp)

            log_info('output in {name}', name=describe_file_handle(outfp))
            # per-input outputs are closed here; the shared single output
            # stays open for the next input
            if not args.single_output_file:
                outfp.close()

    # finished - print out some diagnostics.

    log_info('Total number of unique k-mers: {umers}',
             umers=countgraph.n_unique_kmers())

    if args.savegraph is not None:
        log_info('...saving to {name}', name=args.savegraph)
        countgraph.save(args.savegraph)

    fp_rate = \
        khmer.calc_expected_collisions(countgraph, False, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975

    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    if args.force and len(corrupt_files) > 0:
        log_error("** WARNING: Finished with errors!")
        log_error("** I/O Errors occurred in the following files:")
        log_error("\t" + " ".join(corrupt_files))
def main():
    """Error-correct reads against a k-mer countgraph in two streaming passes.

    First pass, per input file: reads (kept in pairs unless
    ``args.ignore_pairs``) whose median k-mer count is below
    ``args.normalize_to`` are added to the countgraph and set aside in a
    temporary ``.pass2`` file.  The others are passed to
    ``correct_sequence``; pairs are always written, single reads only when
    the alignment succeeded.

    Second pass: the set-aside reads are re-examined one by one.  With
    ``args.variable_coverage`` the reads still below the limit are written
    unchanged; all others are corrected and written when the alignment
    succeeded.

    Output goes to ``args.out`` if given, otherwise to ``<basename>.corr``
    per input file.  Statistics are printed to stderr and the countgraph
    is saved when ``args.savegraph`` is set.
    """
    info('correct-reads.py', ['streaming'])
    parser = get_parser()
    args = parser.parse_args()

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        print >>sys.stderr, \
            "Error: Cannot input the same filename multiple times."
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph:
        # check_space_for_graph takes the output filename first; this call
        # used to pass only the size and the force flag.
        check_space_for_graph(
            args.savegraph, args.n_tables * args.min_tablesize, args.force)

    K = args.ksize

    CUTOFF = args.cutoff
    NORMALIZE_LIMIT = args.normalize_to

    if args.loadgraph:
        print >>sys.stderr, 'loading k-mer countgraph from', args.loadgraph
        ct = khmer.load_countgraph(args.loadgraph)
    else:
        print >>sys.stderr, 'making k-mer countgraph'
        ct = khmer.new_countgraph(K, args.min_tablesize, args.n_tables)

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print >>sys.stderr, 'created temporary directory %s; ' \
                        'use -T to change location' % tempdir

    aligner = khmer.ReadAligner(ct, args.cutoff, args.bits_theta)

    # ### FIRST PASS ###

    save_pass2_total = 0

    n_bp = 0
    n_reads = 0
    written_bp = 0
    written_reads = 0
    corrected_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        if args.out is None:
            corrfp = open(os.path.basename(filename) + '.corr', 'w')
        else:
            corrfp = args.out

        pass2list.append((filename, pass2filename, corrfp))

        screed_iter = screed.open(filename, parse_description=False)
        pass2fp = open(pass2filename, 'w')

        save_pass2 = 0
        n = 0

        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=args.ignore_pairs)
        for n, is_pair, read1, read2 in paired_iter:
            if n % 10000 == 0:
                print >>sys.stderr, '...', n, filename, save_pass2, \
                    n_reads, n_bp, written_reads, written_bp

            # we want to track paired reads here, to make sure that pairs
            # are not split between first pass and second pass.

            if is_pair:
                n_reads += 2
                n_bp += len(read1.sequence) + len(read2.sequence)

                seq1 = read1.sequence.replace('N', 'A')
                seq2 = read2.sequence.replace('N', 'A')

                med1, _, _ = ct.get_median_count(seq1)
                med2, _, _ = ct.get_median_count(seq2)

                if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT:
                    # either mate is low coverage: count both and keep the
                    # pair together for the second pass
                    ct.consume(seq1)
                    ct.consume(seq2)
                    write_record_pair(read1, read2, pass2fp)
                    save_pass2 += 2
                else:
                    is_aligned, new_seq1 = correct_sequence(aligner, seq1)
                    if is_aligned:
                        if new_seq1 != read1.sequence:
                            corrected_reads += 1
                        read1.sequence = new_seq1
                        if hasattr(read1, 'quality'):
                            fix_quality(read1)

                    is_aligned, new_seq2 = correct_sequence(aligner, seq2)
                    if is_aligned:
                        if new_seq2 != read2.sequence:
                            corrected_reads += 1
                        read2.sequence = new_seq2
                        if hasattr(read2, 'quality'):
                            fix_quality(read2)

                    # the pair is written whether or not either mate
                    # could be aligned
                    write_record_pair(read1, read2, corrfp)
                    written_reads += 2
                    written_bp += len(read1)
                    written_bp += len(read2)
            else:
                n_reads += 1
                n_bp += len(read1.sequence)

                seq = read1.sequence.replace('N', 'A')

                med, _, _ = ct.get_median_count(seq)

                # has this portion of the graph saturated? if not,
                # consume & save => pass2.
                if med < NORMALIZE_LIMIT:
                    ct.consume(seq)
                    write_record(read1, pass2fp)
                    save_pass2 += 1
                else:                       # correct
                    is_aligned, new_seq = correct_sequence(aligner, seq)
                    if is_aligned:
                        if new_seq != read1.sequence:
                            corrected_reads += 1
                        read1.sequence = new_seq
                        if hasattr(read1, 'quality'):
                            fix_quality(read1)

                        write_record(read1, corrfp)

                        written_reads += 1
                        written_bp += len(new_seq)

        pass2fp.close()

        print >>sys.stderr, '%s: kept aside %d of %d from first pass, in %s' \
            % (filename, save_pass2, n, filename)
        save_pass2_total += save_pass2

    # ### SECOND PASS. ###

    skipped_n = 0
    skipped_bp = 0
    for _, pass2filename, corrfp in pass2list:
        print >>sys.stderr, ('second pass: looking at sequences kept aside '
                             'in %s') % pass2filename

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.

        for n, read in enumerate(screed.open(pass2filename,
                                             parse_description=False)):
            if n % 10000 == 0:
                print >>sys.stderr, '... x 2', n, pass2filename, \
                    written_reads, written_bp

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ct.get_median_count(seq)

            # do we retain low-abundance components unchanged?
            if med < NORMALIZE_LIMIT and args.variable_coverage:
                write_record(read, corrfp)

                written_reads += 1
                written_bp += len(read.sequence)
                skipped_n += 1
                skipped_bp += len(read.sequence)

            # otherwise, examine/correct.
            else:    # med >= NORMALIZE LIMIT or not args.variable_coverage
                is_aligned, new_seq = correct_sequence(aligner, seq)
                if is_aligned:
                    if new_seq != read.sequence:
                        corrected_reads += 1
                    read.sequence = new_seq
                    if hasattr(read, 'quality'):
                        fix_quality(read)
                    write_record(read, corrfp)

                    written_reads += 1
                    written_bp += len(new_seq)

        print >>sys.stderr, 'removing %s' % pass2filename
        os.unlink(pass2filename)

        # Close the per-input outputs opened above; a caller-supplied
        # args.out is shared between inputs and is left open.
        if args.out is None:
            corrfp.close()

    print >>sys.stderr, 'removing temp directory & contents (%s)' % tempdir
    shutil.rmtree(tempdir)

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_corrected = float(corrected_reads +
                                    (n_reads - written_reads)) /\
        n_reads * 100.0

    print >>sys.stderr, 'read %d reads, %d bp' % (n_reads, n_bp,)
    print >>sys.stderr, 'wrote %d reads, %d bp' % (written_reads, written_bp,)
    print >>sys.stderr, 'looked at %d reads twice (%.2f passes)' % \
        (save_pass2_total, n_passes)
    print >>sys.stderr, 'removed %d reads and corrected %d reads (%.2f%%)' % \
        (n_reads - written_reads, corrected_reads, percent_reads_corrected)
    print >>sys.stderr, 'removed %.2f%% of bases (%d total)' % \
        ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
        print >>sys.stderr, '%d reads were high coverage (%.2f%%);' % \
            (n_reads - skipped_n, percent_reads_hicov)
        # the two literals are concatenated, so the space after "low" is
        # needed (the message used to read "lowcoverage")
        print >>sys.stderr, ('skipped %d reads/%d bases because of low '
                             'coverage') % (skipped_n, skipped_bp)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975

    print >>sys.stderr, \
        'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    print >>sys.stderr, 'output in *.corr'

    if args.savegraph:
        print >>sys.stderr, "Saving k-mer countgraph to", args.savegraph
        ct.save(args.savegraph)
def main():
    """Trim low-abundance k-mers from reads using a streaming ``Trimmer``.

    After validating option combinations and inputs, a countgraph is
    loaded or created and handed to a ``Trimmer``.

    First pass: every input is streamed through ``trimmer.pass1``.  The
    reads it yields are written to the output right away, and it is also
    given a temporary ``.pass2`` file handle for the same input.

    Second pass (skipped with ``--single-pass``): each ``.pass2`` file is
    streamed through ``trimmer.pass2`` and the yielded reads are written
    to the same output as the first pass.

    Output goes to ``args.output`` if given, otherwise to
    ``<basename>.abundtrim`` per input.  Summary statistics are logged and
    the countgraph is saved when ``args.savegraph`` is set.  Invalid option
    combinations log an error and exit with status 1.
    """
    parser = sanitize_help(get_parser())
    args = parser.parse_args()
    if not args.quiet:
        info('trim-low-abund.py', ['streaming'])

    configure_logging(args.quiet)

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        log_error("Error: Cannot input the same filename multiple times.")
        sys.exit(1)

    if args.trim_at_coverage != DEFAULT_TRIM_AT_COVERAGE and \
       not args.variable_coverage:
        log_error("Error: --trim-at-coverage/-Z given, but "
                  "--variable-coverage/-V not specified.")
        sys.exit(1)

    if args.diginorm_coverage != DEFAULT_DIGINORM_COVERAGE and \
       not args.diginorm:
        log_error("Error: --diginorm-coverage given, but "
                  "--diginorm not specified.")
        sys.exit(1)

    if args.diginorm and args.single_pass:
        log_error("Error: --diginorm and --single-pass are incompatible!\n"
                  "You probably want to use normalize-by-median.py instead.")
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \
       and not args.output:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    if args.loadgraph:
        log_info('loading countgraph from {graph}', graph=args.loadgraph)
        ct = khmer.load_countgraph(args.loadgraph)
    else:
        log_info('making countgraph')
        ct = khmer_args.create_countgraph(args)

    K = ct.ksize()
    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    log_info('created temporary directory {temp};\n'
             'use -T to change location', temp=tempdir)

    trimmer = Trimmer(ct, not args.variable_coverage, args.cutoff,
                      args.trim_at_coverage)
    if args.diginorm:
        trimmer.set_diginorm(args.diginorm_coverage)

    # ### FIRST PASS ###

    save_pass2_total = 0

    written_bp = 0
    written_reads = 0

    # only create the file writer once if outfp is specified; otherwise,
    # create it for each file.
    if args.output:
        trimfp = get_file_writer(args.output, args.gzip, args.bzip)

    pass2list = []
    for filename in args.input_filenames:
        # figure out temporary filename for 2nd pass
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        pass2fp = open(pass2filename, 'w')

        # construct output filenames
        if args.output is None:
            # note: this will be saved in trimfp.
            outfp = open(os.path.basename(filename) + '.abundtrim', 'wb')

            # get file handle w/gzip, bzip
            trimfp = get_file_writer(outfp, args.gzip, args.bzip)

        # record all this info
        pass2list.append((filename, pass2filename, trimfp))

        # input file stuff: get a broken_paired reader.
        screed_iter = screed.open(filename)
        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=args.ignore_pairs)

        # main loop through the file.
        n_start = trimmer.n_reads
        save_start = trimmer.n_saved

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass1(paired_iter, pass2fp):
            if (trimmer.n_reads - n_start) > watermark:
                log_info("... {filename} {n_saved} {n_reads} {n_bp} "
                         "{w_reads} {w_bp}", filename=filename,
                         n_saved=trimmer.n_saved, n_reads=trimmer.n_reads,
                         n_bp=trimmer.n_bp, w_reads=written_reads,
                         w_bp=written_bp)
                watermark += REPORT_EVERY_N_READS

            # write out the trimmed/etc sequences that AREN'T going to be
            # revisited in a 2nd pass.
            write_record(read, trimfp)
            written_bp += len(read)
            written_reads += 1
        pass2fp.close()

        log_info("{filename}: kept aside {kept} of {total} from first pass",
                 filename=filename, kept=trimmer.n_saved - save_start,
                 total=trimmer.n_reads - n_start)

    # first pass goes across all the data, so record relevant stats...
    n_reads = trimmer.n_reads
    n_bp = trimmer.n_bp
    save_pass2_total = trimmer.n_saved

    # ### SECOND PASS. ###

    # nothing should have been skipped yet!
    assert trimmer.n_skipped == 0
    assert trimmer.bp_skipped == 0

    if args.single_pass:
        pass2list = []

    # go back through all the files again.
    for _, pass2filename, trimfp in pass2list:
        log_info('second pass: looking at sequences kept aside in {pass2}',
                 pass2=pass2filename)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.  Hence, force_single=True below.

        screed_iter = screed.open(pass2filename, parse_description=False)
        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=True)

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass2(paired_iter):
            if (trimmer.n_reads - n_start) > watermark:
                log_info('... x 2 {a} {b} {c} {d} {e} {f} {g}',
                         a=trimmer.n_reads - n_start,
                         b=pass2filename, c=trimmer.n_saved,
                         d=trimmer.n_reads, e=trimmer.n_bp,
                         f=written_reads, g=written_bp)
                watermark += REPORT_EVERY_N_READS

            write_record(read, trimfp)
            written_reads += 1
            written_bp += len(read)

        log_info('removing {pass2}', pass2=pass2filename)
        os.unlink(pass2filename)

        # if we created our own trimfps, close 'em.
        if not args.output:
            trimfp.close()

    log_info('removing temp directory & contents ({temp})', temp=tempdir)
    shutil.rmtree(tempdir)

    # Read the skip counters only now.  They were previously captured
    # before the second pass, at a point where they are asserted to be
    # zero, so the low-coverage report below always showed 0.
    n_skipped = trimmer.n_skipped
    bp_skipped = trimmer.bp_skipped

    trimmed_reads = trimmer.trimmed_reads

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_trimmed = float(trimmed_reads +
                                  (n_reads - written_reads)) /\
        n_reads * 100.0

    log_info('read {read} reads, {bp} bp', read=n_reads, bp=n_bp)
    log_info('wrote {wr} reads, {wbp} bp', wr=written_reads, wbp=written_bp)
    log_info('looked at {st} reads twice ({np:.2f} passes)',
             st=save_pass2_total, np=n_passes)
    log_info('removed {r} reads and trimmed {t} reads ({p:.2f}%)',
             r=n_reads - written_reads, t=trimmed_reads,
             p=percent_reads_trimmed)
    # '%' needs no escaping in a {}-style template; '%%' was logged as a
    # doubled percent sign.
    log_info('trimmed or removed {p:.2f}% of bases ({bp} total)',
             p=(1 - (written_bp / float(n_bp))) * 100.0,
             bp=n_bp - written_bp)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - n_skipped) / n_reads
        log_info('{n} reads were high coverage ({p:.2f}%);',
                 n=n_reads - n_skipped, p=percent_reads_hicov)
        log_info('skipped {r} reads/{bp} bases because of low coverage',
                 r=n_skipped, bp=bp_skipped)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    log_info('output in *.abundtrim')

    if args.savegraph:
        log_info("Saving k-mer countgraph to {graph}", graph=args.savegraph)
        ct.save(args.savegraph)
def main():
    """Split an interleaved read file into separate left (/1) and right (/2) files.

    Reads ``args.infile`` in broken-paired mode.  Proper pairs are written
    to the two outputs; an unpaired read is routed by its name using
    ``check_is_left`` / ``check_is_right``.  Exits with status 1 when input
    comes from stdin without both output files given, when
    ``--force-paired`` is set and an orphan is met, or when an orphan's
    name is neither left nor right.
    """
    info('split-paired-reads.py')
    args = get_parser().parse_args()

    infile = args.infile

    check_input_files(infile, args.force)
    check_space([infile], args.force)

    basename = os.path.basename(infile)

    # decide where to put output files - specific directory? or just default?
    if infile in ('/dev/stdin', '-'):
        if not (args.output_first and args.output_second):
            # The original used the Python 2 ``print >> sys.stderr`` form
            # here, which raises TypeError under the print function that
            # the rest of this routine relies on.
            print("Accepting input from stdin; "
                  "output filenames must be provided.", file=sys.stderr)
            sys.exit(1)
    elif args.output_directory:
        if not os.path.exists(args.output_directory):
            os.makedirs(args.output_directory)
        out1 = os.path.join(args.output_directory, basename + '.1')
        out2 = os.path.join(args.output_directory, basename + '.2')
    else:
        out1 = basename + '.1'
        out2 = basename + '.2'

    # Only handles opened here are closed at the end; handles supplied via
    # -1/-2 remain the caller's responsibility.
    opened_here = []

    # OVERRIDE output file locations with -1, -2
    if args.output_first:
        fp_out1 = args.output_first
        out1 = fp_out1.name
    else:
        # Use default filename created above
        fp_out1 = open(out1, 'w')
        opened_here.append(fp_out1)
    if args.output_second:
        fp_out2 = args.output_second
        out2 = fp_out2.name
    else:
        # Use default filename created above
        fp_out2 = open(out2, 'w')
        opened_here.append(fp_out2)

    counter1 = 0
    counter2 = 0

    try:
        screed_iter = screed.open(infile, parse_description=False)

        # walk through all the reads in broken-paired mode.
        for index, is_pair, record1, record2 in \
                broken_paired_reader(screed_iter):
            if index % 10000 == 0:
                print('...', index, file=sys.stderr)

            # are we requiring pairs?
            if args.force_paired and not is_pair:
                print('ERROR, %s is not part of a pair' % record1.name,
                      file=sys.stderr)
                sys.exit(1)

            if is_pair:
                write_record(record1, fp_out1)
                counter1 += 1
                write_record(record2, fp_out2)
                counter2 += 1
                continue

            # orphan: route by the pairing marker in its name
            name = record1.name
            if check_is_left(name):
                write_record(record1, fp_out1)
                counter1 += 1
            elif check_is_right(name):
                write_record(record1, fp_out2)
                counter2 += 1
            else:
                print("Unrecognized format for read pair information: %s"
                      % name, file=sys.stderr)
                print("Exiting.", file=sys.stderr)
                sys.exit(1)
    finally:
        for handle in opened_here:
            handle.close()

    print("DONE; split %d sequences (%d left, %d right)" %
          (counter1 + counter2, counter1, counter2), file=sys.stderr)
    print("/1 reads in %s" % out1, file=sys.stderr)
    print("/2 reads in %s" % out2, file=sys.stderr)
def main():
    """Classify reads against a k-mer -> gene-family database and report fusions.

    Loads ``<database>.arr`` (pickle) and ``<database>.mphf``, then walks
    every input file in broken-paired mode.  Each read is classified by
    ``readFusion`` as unmatched, unique, ambiguous, or one of three fusion
    classes.  Reads showing a fusion within a single read are written to
    ``<database>_fusion.{fa,info,calc}``.  Pairs whose two mates map to
    disjoint family sets are written to
    ``<database>_fusionPairs.{fa,info,calc}``.  Summary counts go to stdout.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('database')
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        help='Input FAST[AQ] sequence filename.', nargs='+')
    parser.add_argument('-k', '--ksize', type=int, default=31)
    parser.add_argument('-p', '--paired', action='store_true',
                        help='require that all sequences be properly paired')
    parser.add_argument('--force_single', dest='force_single',
                        action='store_true',
                        help='treat all sequences as single-ended/unpaired')
    parser.add_argument('-u', '--unpaired-reads',
                        metavar="unpaired_reads_filename",
                        help='include a file of unpaired reads to which '
                        '-p/--paired does not apply.')
    parser.add_argument('-f', '--force', dest='force',
                        help='continue past file reading errors',
                        action='store_true')
    args = parser.parse_args()

    force_single = args.force_single

    # check that input files exist
    check_valid_file_exists(args.input_filenames)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = [[element, args.paired] for element in args.input_filenames]
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    # Nodetable is only used here for its k-mer hashing.
    kh = khmer.Nodetable(args.ksize, 1, 1)

    # load database
    mphf_filename = args.database + '.mphf'
    array_filename = args.database + '.arr'
    print('loading database {}'.format(args.database))

    # SECURITY NOTE: pickle.load can execute arbitrary code contained in
    # the file; only point this script at databases from a trusted source.
    with open(array_filename, 'rb') as fp:
        mphf_to_kmer, mphf_to_cdbg, family_ids, cdbg_to_family_id = \
            pickle.load(fp)
    mphf = bbhash.load_mphf(mphf_filename)

    print('done!')

    def get_kmer_to_family_ids(hashval):
        """Return the family ids recorded for a k-mer hash, or an empty set."""
        mphf_hash = mphf.lookup(hashval)
        if mphf_hash is None:
            return set()

        # the MPHF maps unknown keys somewhere too; confirm it is our k-mer
        kmer_hash = mphf_to_kmer[mphf_hash]
        if kmer_hash != hashval:
            return set()

        cdbg_id = mphf_to_cdbg[mphf_hash]
        return cdbg_to_family_id[cdbg_id]

    def readFusion(read):
        """Classify one read; return (flag, families, shared_kmers, gaps)."""
        # NOTE(review): these counters are module-level globals; they are
        # not initialised anywhere in this function, so the module
        # presumably sets them to 0 before main() runs -- confirm.
        global n_unmatched, n_same, n_amb_same, n_clear_fusion, \
            n_ambig_fusion, n_mutli_fusion
        flag = None
        lf_ids = set()
        rt_ids = set()
        families = []
        shared_kmers = []
        gaps = []

        hashvals = kh.get_kmer_hashes(read.sequence)

        # find a matching k-mer at the beginning of the read
        lf_ids = get_kmer_to_family_ids(hashvals[0])
        idx = 1
        while idx < len(hashvals) and len(lf_ids) == 0:
            lf_ids = get_kmer_to_family_ids(hashvals[idx])
            idx += 1

        if len(lf_ids) == 0:
            # no single match
            n_unmatched += 1
            flag = "unmatched"
        elif idx == len(hashvals):
            # same, only last kmer matched
            families.append(lf_ids)
            if len(lf_ids) == 1:
                n_same += 1
                flag = "unique"
            else:
                n_amb_same += 1
                flag = "ambiguous"
        else:  # len(lf_ids) > 0 & idx < len(hashvals)
            # find a matching k-mer at the end of the read
            rt_ids = get_kmer_to_family_ids(hashvals[-1])
            idy = len(hashvals) - 2
            while idy >= idx and len(rt_ids) == 0:
                rt_ids = get_kmer_to_family_ids(hashvals[idy])
                idy -= 1

            if len(rt_ids) == 0:
                # same, only one non-last kmer matched
                families.append(lf_ids)
                if len(lf_ids) == 1:
                    n_same += 1
                    flag = "unique"
                else:
                    n_amb_same += 1
                    flag = "ambiguous"
            else:
                intersect_ids = lf_ids.intersection(rt_ids)
                if len(intersect_ids) > 0:
                    families.append(intersect_ids)
                    if len(intersect_ids) == 1:
                        n_same += 1
                        flag = "unique"
                    else:
                        n_amb_same += 1
                        flag = "ambiguous"
                else:  # fusion to be resolved
                    # Walk the k-mers between the two anchors.  While the
                    # running family set still intersects the next matched
                    # k-mer, narrow it; otherwise close the current segment
                    # and start a new one.
                    shared_kmer = 1
                    gap_size = 0
                    while idx <= idy + 1:
                        temp_ids = get_kmer_to_family_ids(hashvals[idx])
                        if len(temp_ids) > 0:
                            intersect_ids = lf_ids.intersection(temp_ids)
                            if len(intersect_ids) > 0:
                                lf_ids = intersect_ids
                                shared_kmer += 1
                                gap_size = 0
                            else:  # len(intersect_ids) == 0
                                families.append(lf_ids)
                                shared_kmers.append(shared_kmer)
                                lf_ids = temp_ids
                                shared_kmer = 1
                                gaps.append(gap_size)
                                gap_size = 0
                        else:
                            gap_size += 1
                        idx += 1

                    families.append(lf_ids)
                    shared_kmers.append(shared_kmer)

                    assert len(families) > 1
                    if len(families) == 2:
                        if len(families[0]) == 1 and len(families[1]) == 1:
                            n_clear_fusion += 1
                            flag = "clear_fusion"
                        else:
                            n_ambig_fusion += 1
                            flag = "ambig_fusion"
                    else:  # len(families) > 2
                        n_mutli_fusion += 1
                        flag = "multi_fusion"

        return flag, families, shared_kmers, gaps

    fusion_fp = open(args.database + '_fusion.fa', 'w')
    fusionInfo_fp = open(args.database + '_fusion.info', 'w')
    fusionCalc_fp = open(args.database + '_fusion.calc', 'w')
    fusPair_fp = open(args.database + '_fusionPairs.fa', 'w')
    fusPairInfo_fp = open(args.database + '_fusionPairs.info', 'w')
    fusPairCalc_fp = open(args.database + '_fusionPairs.calc', 'w')
    output_fps = [fusion_fp, fusionInfo_fp, fusionCalc_fp,
                  fusPair_fp, fusPairInfo_fp, fusPairCalc_fp]

    # family id -> family name (family_ids is keyed by name)
    family_names = dict(zip(family_ids.values(), family_ids.keys()))

    def write_calc_rows(filename, r_index, which, flag, families,
                        shared_kmers, gaps):
        """Write one fusion.calc row per (first-segment, last-segment) id pair."""
        family_sizes = [len(f) for f in families]
        for g1 in families[0]:
            g1_name = family_names[g1]
            for g2 in families[-1]:
                g2_name = family_names[g2]
                print(filename, r_index, which, flag,
                      '{}:{}'.format(g1, g1_name),
                      '{}:{}'.format(g2, g2_name),
                      len(families), family_sizes,
                      shared_kmers, gaps, sorted([g1, g2]),
                      file=fusionCalc_fp, sep='\t')

    corrupt_files = []
    n = 0
    n_paired_fusion = 0
    sameRef = ("unique", "ambiguous")
    fusion = ("clear_fusion", "ambig_fusion", "multi_fusion")

    try:
        # column headers (spelling kept as-is: downstream tools may parse it)
        print("fileName", "recordIndex", "whichInPair", "align_class",
              "gene_families", "shared_kmers", "gaps",
              file=fusionInfo_fp, sep='\t')
        print("fileName", "recordIndex", "whichInPair", "align_class",
              "familiy_A", "familiy_B", "no_families", "len_families",
              "shared_kmers", "gaps", "sorted_keys",
              file=fusionCalc_fp, sep='\t')
        print("fileName", "recordIndex", "fusion_class", "R1_family",
              "R2_family", file=fusPairInfo_fp, sep='\t')
        print("fileName", "recordIndex", "fusion_class", "familiy_A",
              "familiy_B", "len_families", "sorted_keys",
              file=fusPairCalc_fp, sep='\t')

        for filename, require_paired in files:
            with catch_io_errors(filename, fusion_fp, fusionInfo_fp,
                                 fusionCalc_fp, fusPair_fp, fusPairInfo_fp,
                                 fusPairCalc_fp, args.force, corrupt_files):
                screed_iter = clean_input_reads(screed.open(filename))
                reader = broken_paired_reader(screed_iter,
                                              min_length=args.ksize,
                                              force_single=force_single,
                                              require_paired=require_paired)

                for r_index, is_paired, read0, read1 in reader:
                    n += 1
                    if n % 10000 == 0:
                        print('...', n)

                    flag0, families0, shared_kmers0, gaps0 = \
                        readFusion(read0)

                    if not is_paired and flag0 in fusion:
                        print(filename, r_index, "single", flag0, families0,
                              shared_kmers0, gaps0,
                              file=fusionInfo_fp, sep='\t')
                        write_record(read0, fusion_fp)
                        write_calc_rows(filename, r_index, "single", flag0,
                                        families0, shared_kmers0, gaps0)

                    if not is_paired:
                        continue

                    flag1, families1, shared_kmers1, gaps1 = \
                        readFusion(read1)

                    if flag0 in fusion or flag1 in fusion:
                        # at least one mate is itself a fusion read:
                        # report both mates, calc rows only for fusions
                        print(filename, r_index, "Read_1", flag0, families0,
                              shared_kmers0, gaps0,
                              file=fusionInfo_fp, sep='\t')
                        write_record(read0, fusion_fp)
                        print(filename, r_index, "Read_2", flag1, families1,
                              shared_kmers1, gaps1,
                              file=fusionInfo_fp, sep='\t')
                        write_record(read1, fusion_fp)

                        if flag0 in fusion:
                            write_calc_rows(filename, r_index, "Read_1",
                                            flag0, families0,
                                            shared_kmers0, gaps0)
                        if flag1 in fusion:
                            write_calc_rows(filename, r_index, "Read_2",
                                            flag1, families1,
                                            shared_kmers1, gaps1)
                    elif flag0 in sameRef and flag1 in sameRef:
                        # both mates map cleanly; it is a paired fusion
                        # when their family sets are disjoint
                        if len(families0[0].intersection(families1[0])) == 0:
                            n_paired_fusion += 1

                            if flag0 == "unique" and flag1 == "unique":
                                fusion_class = "clear_fusion"
                            else:
                                fusion_class = "ambig_fusion"

                            print(filename, r_index, fusion_class, families0,
                                  families1, file=fusPairInfo_fp, sep='\t')
                            write_record(read0, fusPair_fp)
                            write_record(read1, fusPair_fp)

                            for g1 in families0[0]:
                                g1_name = family_names[g1]
                                for g2 in families1[0]:
                                    g2_name = family_names[g2]
                                    print(filename, r_index, fusion_class,
                                          '{}:{}'.format(g1, g1_name),
                                          '{}:{}'.format(g2, g2_name),
                                          [len(f) for f in
                                           (families0[0], families1[0])],
                                          sorted([g1, g2]),
                                          file=fusPairCalc_fp, sep='\t')
    finally:
        # previously none of the six report files were ever closed
        for out_fp in output_fps:
            out_fp.close()

    print('No of input fragments: ', n)
    print('unmatched:', n_unmatched)
    print('Unique:', n_same)
    print('Ambiguous:', n_amb_same)
    print('Single read clear fusion:', n_clear_fusion)
    print('Single read ambiguous fusion:', n_ambig_fusion)
    print('Single read multi fusion:', n_mutli_fusion)
    print('paired read fusion:', n_paired_fusion)
def main():
    """Draw one or more uniform random subsamples of reads (reservoir sampling).

    Every file in ``args.filenames`` is streamed in broken-paired mode; a
    record is a ``(read1, read2)`` tuple where ``read2`` may be ``None``.
    ``args.num_samples`` independent reservoirs of ``args.num_reads``
    records are kept and then written either to the ``-o`` file or to
    ``<basename>.subset`` (``.subset.N`` when several samples are drawn).

    Exits with status 1 when ``-o`` is combined with several samples
    without ``--force``, or when reading stdin without ``-o``.
    """
    info('sample-reads-randomly.py')
    args = get_parser().parse_args()

    for input_name in args.filenames:
        check_input_files(input_name, args.force)

    # seed the random number generator?
    if args.random_seed:
        random.seed(args.random_seed)

    # bound n_samples
    num_samples = max(args.num_samples, 1)

    #
    # Figure out what the output filename is going to be
    #
    if args.output_file:
        output_filename = args.output_file.name
        if num_samples > 1:
            sys.stderr.write(
                "Error: cannot specify -o with more than one sample.")
            if not args.force:
                print("NOTE: This can be overridden using the --force"
                      " argument", file=sys.stderr)
                sys.exit(1)
    else:
        filename = args.filenames[0]
        if filename in ('/dev/stdin', '-'):
            print("Accepting input from stdin; output filename must "
                  "be provided with '-o'.", file=sys.stderr)
            sys.exit(1)
        output_filename = os.path.basename(filename) + '.subset'

    if num_samples == 1:
        print('Subsampling %d reads using reservoir sampling.' %
              args.num_reads, file=sys.stderr)
        print('Subsampled reads will be placed in %s' %
              output_filename, file=sys.stderr)
        print('', file=sys.stderr)
    else:  # > 1
        print('Subsampling %d reads, %d times,'
              % (args.num_reads, num_samples), ' using reservoir sampling.',
              file=sys.stderr)
        print('Subsampled reads will be placed in %s.N'
              % output_filename, file=sys.stderr)
        print('', file=sys.stderr)

    reads = [[] for _ in range(num_samples)]

    # Number of records consumed so far across ALL input files.  The
    # reservoir must be driven by this running total: the per-file
    # enumerate() index restarts at 0 for every file, which used to grow
    # the reservoir past num_reads on the second and later files.
    seen = 0

    # read through all the sequences and load/resample the reservoir
    for filename in args.filenames:
        print('opening', filename, 'for reading', file=sys.stderr)
        reader = broken_paired_reader(screed.open(filename),
                                      force_single=args.force_single)

        for count, (_, _, rcrd1, rcrd2) in enumerate(reader):
            if count % 10000 == 0:
                print('...', count, 'reads scanned', file=sys.stderr)
                if count >= args.max_reads:
                    print('reached upper limit of %d reads' %
                          args.max_reads, '(see -M); exiting',
                          file=sys.stderr)
                    break

            seen += 1
            record = (rcrd1, rcrd2)

            # collect first N reads
            if seen <= args.num_reads:
                for sample in reads:
                    sample.append(record)
                continue

            # use reservoir sampling to replace reads at random
            # see http://en.wikipedia.org/wiki/Reservoir_sampling
            # Algorithm R: the i-th record (1-based) replaces a slot with
            # probability N/i, so the draw is over 1..seen.  The previous
            # draw over 1..count (0-based index) was off by one and always
            # admitted record N+1.
            for sample in reads:
                guess = random.randint(1, seen)
                if guess <= args.num_reads:
                    sample[guess - 1] = record

    def write_sample(records, raw_handle, close_raw):
        """Write one reservoir through a (possibly compressing) writer."""
        writer = get_file_writer(raw_handle, args.gzip, args.bzip)
        try:
            for first, second in records:
                write_record(first, writer)
                if second is not None:
                    write_record(second, writer)
        finally:
            # A compression wrapper must be closed to flush its trailer;
            # assumes closing it leaves raw_handle open, as
            # gzip.GzipFile(fileobj=...) does -- TODO confirm.
            if writer is not raw_handle:
                writer.close()
            if close_raw:
                raw_handle.close()

    # output all the subsampled reads:
    if len(reads) == 1:
        print('Writing %d sequences to %s' %
              (len(reads[0]), output_filename), file=sys.stderr)

        if args.output_file:
            # handle supplied by the caller: do not close it
            write_sample(reads[0], args.output_file, False)
        else:
            write_sample(reads[0], open(output_filename, 'wb'), True)
    else:
        for n, sample in enumerate(reads):
            n_filename = output_filename + '.%d' % n
            print('Writing %d sequences to %s' %
                  (len(sample), n_filename), file=sys.stderr)
            write_sample(sample, open(n_filename, 'wb'), True)
def main():
    """Draw one or more uniform random subsamples of reads (reservoir sampling).

    Every file in ``args.filenames`` is streamed in broken-paired mode; a
    record is a ``(read1, read2)`` tuple where ``read2`` may be ``None``.
    ``args.num_samples`` independent reservoirs of ``args.num_reads``
    records are kept and then written as text either to the ``-o`` file or
    to ``<basename>.subset`` (``.subset.N`` when several samples are
    drawn).

    Exits with status 1 when ``-o`` is combined with several samples
    without ``--force``.
    """
    info('sample-reads-randomly.py')
    args = get_parser().parse_args()

    for input_name in args.filenames:
        check_input_files(input_name, args.force)

    check_space(args.filenames, args.force)

    # seed the random number generator?
    if args.random_seed:
        random.seed(args.random_seed)

    # bound n_samples
    num_samples = max(args.num_samples, 1)

    #
    # Figure out what the output filename is going to be
    #
    output_file = args.output_file
    if output_file:
        if num_samples > 1:
            sys.stderr.write(
                "Error: cannot specify -o with more than one sample.")
            if not args.force:
                sys.exit(1)
        output_filename = output_file.name
    else:
        output_filename = os.path.basename(args.filenames[0]) + '.subset'

    if num_samples == 1:
        print('Subsampling %d reads using reservoir sampling.' %
              args.num_reads, file=sys.stderr)
        print('Subsampled reads will be placed in %s' %
              output_filename, file=sys.stderr)
        print('', file=sys.stderr)
    else:  # > 1
        print('Subsampling %d reads, %d times,'
              % (args.num_reads, num_samples), ' using reservoir sampling.',
              file=sys.stderr)
        print('Subsampled reads will be placed in %s.N'
              % output_filename, file=sys.stderr)
        print('', file=sys.stderr)

    reads = [[] for _ in range(num_samples)]

    # Number of records consumed so far across ALL input files.  The
    # reservoir must be driven by this running total: the per-file
    # enumerate() index restarts at 0 for every file, which used to grow
    # the reservoir past num_reads on the second and later files.
    seen = 0

    # read through all the sequences and load/resample the reservoir
    for filename in args.filenames:
        print('opening', filename, 'for reading', file=sys.stderr)
        reader = broken_paired_reader(
            screed.open(filename, parse_description=False),
            force_single=args.force_single)

        for count, (_, _, rcrd1, rcrd2) in enumerate(reader):
            if count % 10000 == 0:
                print('...', count, 'reads scanned', file=sys.stderr)
                if count >= args.max_reads:
                    print('reached upper limit of %d reads' %
                          args.max_reads, '(see -M); exiting',
                          file=sys.stderr)
                    break

            seen += 1
            record = (rcrd1, rcrd2)

            # collect first N reads
            if seen <= args.num_reads:
                for sample in reads:
                    sample.append(record)
                continue

            # use reservoir sampling to replace reads at random
            # see http://en.wikipedia.org/wiki/Reservoir_sampling
            # Algorithm R: the i-th record (1-based) replaces a slot with
            # probability N/i, so the draw is over 1..seen.  The previous
            # draw over 1..count (0-based index) was off by one and always
            # admitted record N+1.
            for sample in reads:
                guess = random.randint(1, seen)
                if guess <= args.num_reads:
                    sample[guess - 1] = record

    def write_sample(records, handle):
        """Write every record (and its mate, when present) to *handle*."""
        for first, second in records:
            write_record(first, handle)
            if second is not None:
                write_record(second, handle)

    # output all the subsampled reads:
    if len(reads) == 1:
        print('Writing %d sequences to %s' %
              (len(reads[0]), output_filename), file=sys.stderr)
        if output_file:
            # handle supplied by the caller: do not close it
            write_sample(reads[0], output_file)
        else:
            with open(output_filename, 'w') as handle:
                write_sample(reads[0], handle)
    else:
        for n, sample in enumerate(reads):
            n_filename = output_filename + '.%d' % n
            print('Writing %d sequences to %s' %
                  (len(sample), n_filename), file=sys.stderr)
            # previously these per-sample files were never closed
            with open(n_filename, 'w') as handle:
                write_sample(sample, handle)
def main():  # pylint: disable=too-many-branches,too-many-statements
    """Run digital normalization over every input file.

    Builds or loads a countgraph, wraps a ``Normalizer`` in
    ``WithDiagnostics``, and streams each input through it.  Kept records
    go to the single ``-o`` output or to ``<basename>.keep`` per input.
    Finishes by logging the unique k-mer count, optionally saving the
    graph, and reporting the estimated false-positive rate plus any files
    that hit I/O errors under ``--force``.
    """
    args = sanitize_help(get_parser()).parse_args()

    configure_logging(args.quiet)
    report_on_config(args)

    # check for similar filenames
    # if we're using a single output file only check for identical filenames
    # otherwise, check for identical BASE names as well.
    filenames = list(args.input_filenames)
    if not args.single_output_file:
        seen_basenames = set()
        for path in filenames:
            base = os.path.basename(path)
            if base in seen_basenames:
                log_error('ERROR: Duplicate filename--Cannot handle this!')
                log_error('** Exiting!')
                sys.exit(1)
            seen_basenames.add(base)

    # check that files exist and there is sufficient output disk space.
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph is not None:
        check_space_for_graph(args.savegraph,
                              calculate_graphsize(args, 'countgraph'),
                              args.force)

    # load or create counting table.
    if args.loadgraph:
        log_info('loading k-mer countgraph from {graph}',
                 graph=args.loadgraph)
        countgraph = khmer.load_countgraph(args.loadgraph)
    else:
        log_info('making countgraph')
        countgraph = khmer_args.create_countgraph(args)

    # create an object to handle diginorm of all files
    with_diagnostics = WithDiagnostics(Normalizer(args.cutoff, countgraph),
                                       args.report,
                                       args.report_frequency)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = [[path, args.paired] for path in filenames]
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    corrupt_files = []
    outfp = None
    output_name = None

    if args.single_output_file:
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)
    elif '-' in filenames or '/dev/stdin' in filenames:
        print("Accepting input from stdin; output filename must "
              "be provided with '-o'.", file=sys.stderr)
        sys.exit(1)

    #
    # main loop: iterate over all files given, do diginorm.
    #
    for filename, require_paired in files:
        if not args.single_output_file:
            output_name = os.path.basename(filename) + '.keep'
            outfp = get_file_writer(open(output_name, 'wb'),
                                    args.gzip, args.bzip)

        # failsafe context manager in case an input file breaks
        with catch_io_errors(filename, outfp, args.single_output_file,
                             args.force, corrupt_files):
            reader = broken_paired_reader(
                clean_input_reads(screed.open(filename)),
                min_length=args.ksize,
                force_single=args.force_single,
                require_paired=require_paired)

            # actually do diginorm
            for record in with_diagnostics(reader, filename):
                if record is None:
                    continue
                write_record(record, outfp)

            log_info('output in {name}', name=describe_file_handle(outfp))
            if not args.single_output_file:
                outfp.close()

    # finished - print out some diagnostics.
    log_info('Total number of unique k-mers: {umers}',
             umers=countgraph.n_unique_kmers())

    if args.savegraph is not None:
        log_info('...saving to {name}', name=args.savegraph)
        countgraph.save(args.savegraph)

    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    fp_rate = khmer.calc_expected_collisions(countgraph, False,
                                             max_false_pos=.8)
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    if args.force and corrupt_files:
        log_error("** WARNING: Finished with errors!")
        log_error("** I/O Errors occurred in the following files:")
        log_error("\t" + " ".join(corrupt_files))
def main():
    """Error-correct reads in two streaming passes against a k-mer countgraph.

    First pass: a read (or pair) whose median k-mer count is below
    ``args.normalize_to`` is consumed into the graph and set aside in a
    temporary ``.pass2`` file.  Everything else is corrected with a
    ``ReadAligner`` and written to ``<basename>.corr`` (or ``args.out``).
    Second pass: the set-aside reads are re-examined.  With
    ``--variable-coverage``, reads still below the limit are written
    unchanged; the rest are corrected and written.  Summary statistics
    go to stderr.
    """
    info('correct-reads.py', ['streaming'])
    args = sanitize_help(get_parser()).parse_args()

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        print("Error: Cannot input the same filename multiple times.",
              file=sys.stderr)
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    tablesize = calculate_graphsize(args, 'countgraph')
    if args.savegraph:
        check_space_for_graph(args.savegraph, tablesize, args.force)

    K = args.ksize
    NORMALIZE_LIMIT = args.normalize_to

    if args.loadgraph:
        print('loading k-mer countgraph from', args.loadgraph,
              file=sys.stderr)
        ct = Countgraph.load(args.loadgraph)
    else:
        print('making k-mer countgraph', file=sys.stderr)
        ct = create_countgraph(args, multiplier=8 / (9. + 0.3))

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print('created temporary directory %s; use -T to change location'
          % tempdir, file=sys.stderr)

    aligner = khmer.ReadAligner(ct, args.cutoff, args.bits_theta)

    # ### FIRST PASS ###

    save_pass2_total = 0

    n_bp = 0
    n_reads = 0
    written_bp = 0
    written_reads = 0
    corrected_reads = 0

    pass2list = []
    # .corr files opened here; args.out belongs to the caller
    opened_outputs = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        if args.out is None:
            corrfp = open(os.path.basename(filename) + '.corr', 'w')
            opened_outputs.append(corrfp)
        else:
            corrfp = args.out

        pass2list.append((filename, pass2filename, corrfp))

        screed_iter = screed.open(filename, parse_description=False)
        pass2fp = open(pass2filename, 'w')

        save_pass2 = 0
        n = 0

        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=args.ignore_pairs)
        for n, is_pair, read1, read2 in paired_iter:
            if n % 10000 == 0:
                print('...', n, filename, save_pass2, n_reads, n_bp,
                      written_reads, written_bp, file=sys.stderr)

            # we want to track paired reads here, to make sure that pairs
            # are not split between first pass and second pass.

            if is_pair:
                n_reads += 2
                n_bp += len(read1.sequence) + len(read2.sequence)

                seq1 = read1.sequence.replace('N', 'A')
                seq2 = read2.sequence.replace('N', 'A')

                med1, _, _ = ct.get_median_count(seq1)
                med2, _, _ = ct.get_median_count(seq2)

                if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT:
                    ct.consume(seq1)
                    ct.consume(seq2)
                    write_record_pair(read1, read2, pass2fp)
                    save_pass2 += 2
                else:
                    is_aligned, new_seq1 = correct_sequence(aligner, seq1)
                    if is_aligned:
                        if new_seq1 != read1.sequence:
                            corrected_reads += 1
                        read1.sequence = new_seq1
                        if hasattr(read1, 'quality'):
                            fix_quality(read1)

                    is_aligned, new_seq2 = correct_sequence(aligner, seq2)
                    if is_aligned:
                        if new_seq2 != read2.sequence:
                            corrected_reads += 1
                        read2.sequence = new_seq2
                        if hasattr(read2, 'quality'):
                            fix_quality(read2)

                    write_record_pair(read1, read2, corrfp)
                    written_reads += 2
                    written_bp += len(read1)
                    written_bp += len(read2)
            else:
                n_reads += 1
                n_bp += len(read1.sequence)

                seq = read1.sequence.replace('N', 'A')

                med, _, _ = ct.get_median_count(seq)

                # has this portion of the graph saturated? if not,
                # consume & save => pass2.
                if med < NORMALIZE_LIMIT:
                    ct.consume(seq)
                    write_record(read1, pass2fp)
                    save_pass2 += 1
                else:  # trim!!
                    is_aligned, new_seq = correct_sequence(aligner, seq)
                    if is_aligned:
                        if new_seq != read1.sequence:
                            corrected_reads += 1
                        read1.sequence = new_seq
                        if hasattr(read1, 'quality'):
                            fix_quality(read1)

                    write_record(read1, corrfp)

                    written_reads += 1
                    written_bp += len(new_seq)

        pass2fp.close()

        print('%s: kept aside %d of %d from first pass, in %s' %
              (filename, save_pass2, n, filename),
              file=sys.stderr)
        save_pass2_total += save_pass2

    # ### SECOND PASS. ###

    skipped_n = 0
    skipped_bp = 0
    for _, pass2filename, corrfp in pass2list:
        print('second pass: looking at sequences kept aside in %s' %
              pass2filename, file=sys.stderr)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.

        for n, read in enumerate(screed.open(pass2filename,
                                             parse_description=False)):
            if n % 10000 == 0:
                print('... x 2', n, pass2filename,
                      written_reads, written_bp, file=sys.stderr)

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ct.get_median_count(seq)

            # do we retain low-abundance components unchanged?
            if med < NORMALIZE_LIMIT and args.variable_coverage:
                write_record(read, corrfp)

                written_reads += 1
                written_bp += len(read.sequence)
                skipped_n += 1
                skipped_bp += len(read.sequence)

            # otherwise, examine/correct.
            else:  # med >= NORMALIZE LIMIT or not args.variable_coverage
                is_aligned, new_seq = correct_sequence(aligner, seq)
                if is_aligned:
                    if new_seq != read.sequence:
                        corrected_reads += 1
                    read.sequence = new_seq
                    if hasattr(read, 'quality'):
                        fix_quality(read)
                write_record(read, corrfp)

                written_reads += 1
                written_bp += len(new_seq)

        print('removing %s' % pass2filename, file=sys.stderr)
        os.unlink(pass2filename)

    # previously the per-input .corr files were never closed
    for handle in opened_outputs:
        handle.close()

    print('removing temp directory & contents (%s)' % tempdir,
          file=sys.stderr)
    shutil.rmtree(tempdir)

    # Guard every ratio: inputs with no usable reads used to abort here
    # with ZeroDivisionError instead of printing the (all-zero) summary.
    if n_reads:
        n_passes = 1.0 + (float(save_pass2_total) / n_reads)
        percent_reads_corrected = \
            float(corrected_reads + (n_reads - written_reads)) /\
            n_reads * 100.0
    else:
        n_passes = 1.0
        percent_reads_corrected = 0.0

    if n_bp:
        percent_bp_removed = (1 - (written_bp / float(n_bp))) * 100.0
    else:
        percent_bp_removed = 0.0

    print('read %d reads, %d bp' % (n_reads, n_bp,), file=sys.stderr)
    print('wrote %d reads, %d bp' % (written_reads, written_bp,),
          file=sys.stderr)
    print('looked at %d reads twice (%.2f passes)' %
          (save_pass2_total, n_passes), file=sys.stderr)
    print('removed %d reads and corrected %d reads (%.2f%%)' %
          (n_reads - written_reads, corrected_reads,
           percent_reads_corrected), file=sys.stderr)
    print('removed %.2f%% of bases (%d total)' %
          (percent_bp_removed, n_bp - written_bp), file=sys.stderr)

    if args.variable_coverage:
        if n_reads:
            percent_reads_hicov = \
                100.0 * float(n_reads - skipped_n) / n_reads
        else:
            percent_reads_hicov = 0.0
        print('%d reads were high coverage (%.2f%%);' %
              (n_reads - skipped_n, percent_reads_hicov), file=sys.stderr)
        print('skipped %d reads/%d bases because of low coverage' %
              (skipped_n, skipped_bp), file=sys.stderr)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate),
          file=sys.stderr)

    print('output in *.corr', file=sys.stderr)

    if args.savegraph:
        print("Saving k-mer countgraph to", args.savegraph, file=sys.stderr)
        ct.save(args.savegraph)
def main():
    """Split an interleaved read file into left, right and (optionally) orphan files.

    Reads ``args.infile`` in broken-paired mode.  Pairs are split between
    the /1 and /2 outputs.  Unpaired reads go to the ``-0`` output when one
    is given; otherwise the reader is asked to require pairs, and an
    ``UnpairedReadsError`` ends the run with exit status 1.  Outputs pass
    through ``get_file_writer`` so ``--gzip`` / ``--bzip`` are honoured.
    """
    info('split-paired-reads.py')
    args = sanitize_help(get_parser()).parse_args()

    infile = args.infile

    check_input_files(infile, args.force)
    check_space([infile], args.force)

    basename = os.path.basename(infile)

    # decide where to put output files - specific directory? or just default?
    if infile in ('/dev/stdin', '-'):
        if not (args.output_first and args.output_second):
            print("Accepting input from stdin; "
                  "output filenames must be provided.",
                  file=sys.stderr)
            sys.exit(1)
    elif args.output_directory:
        if not os.path.exists(args.output_directory):
            os.makedirs(args.output_directory)
        out1 = os.path.join(args.output_directory, basename + '.1')
        out2 = os.path.join(args.output_directory, basename + '.2')
    else:
        out1 = basename + '.1'
        out2 = basename + '.2'

    # (writer, raw handle, opened_here) for every output, for final cleanup
    outputs = []

    def make_writer(raw_handle, opened_here):
        """Wrap *raw_handle* for output and remember it for cleanup."""
        writer = get_file_writer(raw_handle, args.gzip, args.bzip)
        outputs.append((writer, raw_handle, opened_here))
        return writer

    # OVERRIDE output file locations with -1, -2
    if args.output_first:
        fp_out1 = make_writer(args.output_first, False)
        out1 = fp_out1.name
    else:
        # Use default filename created above
        fp_out1 = make_writer(open(out1, 'wb'), True)
    if args.output_second:
        fp_out2 = make_writer(args.output_second, False)
        out2 = fp_out2.name
    else:
        # Use default filename created above
        fp_out2 = make_writer(open(out2, 'wb'), True)

    # put orphaned reads here, if -0!
    if args.output_orphaned:
        fp_out0 = make_writer(args.output_orphaned, False)
        out0 = describe_file_handle(args.output_orphaned)

    counter1 = 0
    counter2 = 0
    counter3 = 0

    screed_iter = screed.open(infile)

    # walk through all the reads in broken-paired mode.
    paired_iter = broken_paired_reader(screed_iter,
                                       require_paired=not args.output_orphaned)

    try:
        for index, is_pair, record1, record2 in paired_iter:
            if index % 10000 == 0:
                print('...', index, file=sys.stderr)

            if is_pair:
                write_record(record1, fp_out1)
                counter1 += 1
                write_record(record2, fp_out2)
                counter2 += 1
            elif args.output_orphaned:
                write_record(record1, fp_out0)
                counter3 += 1
    except UnpairedReadsError as e:
        print("Unpaired reads found starting at {name}; exiting".format(
            name=e.read1.name), file=sys.stderr)
        sys.exit(1)
    finally:
        # Outputs used to be left open.  A compression wrapper must be
        # closed to flush its trailer; assumes closing it leaves the raw
        # handle open, as gzip.GzipFile(fileobj=...) does -- TODO confirm.
        # Raw handles passed in via -1/-2/-0 are left to the caller.
        for writer, raw_handle, opened_here in outputs:
            if writer is not raw_handle:
                writer.close()
            if opened_here:
                raw_handle.close()

    print("DONE; split %d sequences (%d left, %d right, %d orphans)" %
          (counter1 + counter2, counter1, counter2, counter3),
          file=sys.stderr)
    print("/1 reads in %s" % out1, file=sys.stderr)
    print("/2 reads in %s" % out2, file=sys.stderr)
    if args.output_orphaned:
        print("orphans in %s" % out0, file=sys.stderr)
def main():
    """Trim reads at low-abundance k-mers, using a graph built from the same file.

    Round one loads ``args.datafile`` into a fresh countgraph with
    ``args.threads`` consumer threads sharing one ``ReadParser``.  Round
    two re-reads the file with pairing ignored, runs ``trim_record`` on
    every read, and writes each non-empty result to ``args.outfile`` or
    ``<basename>.abundfilt``.  The graph is saved when ``--savegraph`` is
    given.
    """
    args = sanitize_help(get_parser()).parse_args()

    configure_logging(args.quiet)
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)

    if args.savegraph:
        tablesize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, tablesize, args.force)

    report_on_config(args)

    log_info('making countgraph')
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph
    rparser = khmer.ReadParser(args.datafile)
    log_info('consuming input, round 1 -- {datafile}',
             datafile=args.datafile)
    threads = [
        threading.Thread(target=graph.consume_fasta_with_reads_parser,
                         args=(rparser, ))
        for _ in range(args.threads)
    ]
    for worker in threads:
        worker.start()
    for worker in threads:
        worker.join()

    log_info('Total number of unique k-mers: {nk}',
             nk=graph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    # the filtering loop
    log_info('filtering {datafile}', datafile=args.datafile)
    if args.outfile is None:
        outfile = os.path.basename(args.datafile) + '.abundfilt'
    else:
        outfile = args.outfile

    raw_outfp = open(outfile, 'wb')
    outfp = get_file_writer(raw_outfp, args.gzip, args.bzip)
    try:
        # force_single=True: every record comes back as a lone read
        paired_iter = broken_paired_reader(ReadParser(args.datafile),
                                           min_length=graph.ksize(),
                                           force_single=True)

        for n, is_pair, read1, read2 in paired_iter:
            assert not is_pair
            assert read2 is None

            trimmed_record, _ = trim_record(graph, read1, args.cutoff,
                                            args.variable_coverage,
                                            args.normalize_to)
            if trimmed_record:
                # a leftover debugging print((trimmed_record,)) used to
                # dump every kept record to stdout here; removed
                write_record(trimmed_record, outfp)
    finally:
        # The output was never closed.  A compression wrapper must be
        # closed to flush its trailer; assumes closing it leaves the raw
        # handle open, as gzip.GzipFile(fileobj=...) does -- TODO confirm.
        if outfp is not raw_outfp:
            outfp.close()
        raw_outfp.close()

    log_info('output in {outfile}', outfile=outfile)

    if args.savegraph:
        log_info('Saving k-mer countgraph filename {graph}',
                 graph=args.savegraph)
        graph.save(args.savegraph)
def main():
    """Split a broken-paired read file into a left and a right output file.

    Output names default to ``<basename>.1`` / ``<basename>.2`` (inside
    ``args.output_directory`` when given) and can be overridden with
    ``args.output_first`` / ``args.output_second``.  Paired records go to
    both files.  An unpaired record is routed by ``check_is_left`` /
    ``check_is_right`` on its name; the script exits with status 1 if the
    name matches neither, or if ``args.force_paired`` is set.
    """
    info('split-paired-reads.py')
    args = get_parser().parse_args()

    infile = args.infile

    check_input_files(infile, args.force)
    check_space([infile], args.force)

    # decide where to put output files - specific directory? or just default?
    basename = os.path.basename(infile)
    if args.output_directory:
        if not os.path.exists(args.output_directory):
            os.makedirs(args.output_directory)
        out1 = os.path.join(args.output_directory, basename + '.1')
        out2 = os.path.join(args.output_directory, basename + '.2')
    else:
        out1 = basename + '.1'
        out2 = basename + '.2'

    # OVERRIDE output file locations with -1, -2
    if args.output_first:
        out1 = args.output_first
    if args.output_second:
        out2 = args.output_second

    counter1 = 0
    counter2 = 0

    # The with-block guarantees both outputs are flushed and closed, also on
    # the sys.exit() paths below.
    with open(out1, 'w') as fp_out1, open(out2, 'w') as fp_out2:
        screed_iter = screed.open(infile, parse_description=False)

        # walk through all the reads in broken-paired mode.
        for index, is_pair, record1, record2 in \
                broken_paired_reader(screed_iter):
            if index % 100000 == 0 and index:
                print('...', index, file=sys.stderr)

            # are we requiring pairs?
            if args.force_paired and not is_pair:
                print('ERROR, %s is not part of a pair' % record1.name,
                      file=sys.stderr)
                sys.exit(1)

            if is_pair:
                write_record(record1, fp_out1)
                counter1 += 1
                write_record(record2, fp_out2)
                counter2 += 1
                continue

            # orphan: decide the side from the read name
            name = record1.name
            if check_is_left(name):
                write_record(record1, fp_out1)
                counter1 += 1
            elif check_is_right(name):
                write_record(record1, fp_out2)
                counter2 += 1
            else:
                print("Unrecognized format for read pair information: %s"
                      % name, file=sys.stderr)
                print("Exiting.", file=sys.stderr)
                sys.exit(1)

    print("DONE; split %d sequences (%d left, %d right)" %
          (counter1 + counter2, counter1, counter2), file=sys.stderr)
    print("/1 reads in %s" % out1, file=sys.stderr)
    print("/2 reads in %s" % out2, file=sys.stderr)
# NOTE(review): the statements before the ``__main__`` guard read
# ``input_iter`` without defining it; they look like the tail of a test whose
# header lies outside this view -- confirm the intended enclosing scope.
script_result = screed.open('test_files/'
                            'simple-metagenome-reads.fa.keep.k20.C20')
for read_a, read_b in zip(broken_paired_to_single(input_iter), script_result):
    print(read_a.name)
    assert read_a == read_b, (read_a, read_b)


if __name__ == '__main__':
    # Run two generator pipelines over the file named on the command line,
    # each with its own fresh counting table and its own output file.
    filename = sys.argv[1]
    basename = os.path.basename(filename)

    ## khmer scripts/trim-low-abund.py -V, using generators
    graph = khmer.new_counting_hash(20, 1e7, 4)
    with open(basename + '.abundtrim', 'w') as out_fp:
        input_iter = screed.open(filename)
        input_iter = broken_paired_reader(input_iter)
        input_iter = clean_reads(input_iter)
        input_iter = streamtrim(input_iter, graph, 20, 2)
        output_reads(input_iter, out_fp)

    ## khmer scripts/normalize-by-median.py, using generators
    graph = khmer.new_counting_hash(20, 1e7, 4)
    with open(basename + '.keep', 'w') as out_fp:
        input_iter = screed.open(filename)
        input_iter = broken_paired_reader(input_iter)
        input_iter = clean_reads(input_iter)
        input_iter = diginorm(input_iter, graph, 20)
        output_reads(input_iter, out_fp)
def main():
    """Split interleaved paired reads into left, right and orphan outputs.

    Pairs reported by ``broken_paired_reader`` are written to the first and
    second outputs.  Unpaired reads are written to ``args.output_orphaned``
    when it is given; otherwise the reader is created with
    ``require_paired=True`` and an ``UnpairedReadsError`` ends the run with
    exit status 1.  When reading from stdin both output files must be
    supplied explicitly.
    """
    args = sanitize_help(get_parser()).parse_args()

    infile = args.infile

    check_input_files(infile, args.force)
    check_space([infile], args.force)

    basename = os.path.basename(infile)

    # decide where to put output files - specific directory? or just default?
    if infile in ('/dev/stdin', '-'):
        # seqan only treats '-' as "read from stdin"
        infile = '-'
        if not (args.output_first and args.output_second):
            print("Accepting input from stdin; "
                  "output filenames must be provided.", file=sys.stderr)
            sys.exit(1)
    elif args.output_directory:
        if not os.path.exists(args.output_directory):
            os.makedirs(args.output_directory)
        out1 = os.path.join(args.output_directory, basename + '.1')
        out2 = os.path.join(args.output_directory, basename + '.2')
    else:
        out1 = basename + '.1'
        out2 = basename + '.2'

    own_files = []  # plain files opened here (default output names)
    writers = []    # every writer records are written through

    counter1 = 0
    counter2 = 0
    counter3 = 0

    try:
        # OVERRIDE output file locations with -1, -2
        if args.output_first:
            fp_out1 = get_file_writer(args.output_first, args.gzip,
                                      args.bzip)
            out1 = fp_out1.name
        else:
            # Use default filename created above
            own_files.append(open(out1, 'wb'))
            fp_out1 = get_file_writer(own_files[-1], args.gzip, args.bzip)
        writers.append(fp_out1)

        if args.output_second:
            fp_out2 = get_file_writer(args.output_second, args.gzip,
                                      args.bzip)
            out2 = fp_out2.name
        else:
            # Use default filename created above
            own_files.append(open(out2, 'wb'))
            fp_out2 = get_file_writer(own_files[-1], args.gzip, args.bzip)
        writers.append(fp_out2)

        # put orphaned reads here, if -0!
        if args.output_orphaned:
            fp_out0 = get_file_writer(args.output_orphaned, args.gzip,
                                      args.bzip)
            out0 = describe_file_handle(args.output_orphaned)
            writers.append(fp_out0)

        # walk through all the reads in broken-paired mode.
        paired_iter = broken_paired_reader(
            ReadParser(infile), require_paired=not args.output_orphaned)

        try:
            for index, is_pair, record1, record2 in paired_iter:
                if index % 10000 == 0:
                    print('...', index, file=sys.stderr)

                if is_pair:
                    write_record(record1, fp_out1)
                    counter1 += 1
                    write_record(record2, fp_out2)
                    counter2 += 1
                elif args.output_orphaned:
                    write_record(record1, fp_out0)
                    counter3 += 1
        except UnpairedReadsError as e:
            print("Unpaired reads found starting at {name}; exiting".format(
                name=e.read1.name), file=sys.stderr)
            sys.exit(1)
    finally:
        # Writers first, so any compressed stream is finished before the
        # file underneath it is closed.  Runs on the sys.exit() path too.
        for handle in writers + own_files:
            handle.close()

    print("DONE; split %d sequences (%d left, %d right, %d orphans)" %
          (counter1 + counter2, counter1, counter2, counter3),
          file=sys.stderr)
    print("/1 reads in %s" % out1, file=sys.stderr)
    print("/2 reads in %s" % out2, file=sys.stderr)
    if args.output_orphaned:
        print("orphans in %s" % out0, file=sys.stderr)
def main():
    """Subsample reads from the input files with reservoir sampling.

    Keeps ``num_samples`` independent reservoirs of ``args.num_reads``
    entries each.  A single sample is written to ``args.output_file`` or to
    ``<basename of first input>.subset``; several samples are written to
    ``<output name>.N``.  Each reservoir entry is a ``(read1, read2)`` tuple
    whose second item may be ``None``.
    """
    info('sample-reads-randomly.py')
    args = get_parser().parse_args()

    for name in args.filenames:
        check_input_files(name, args.force)

    check_space(args.filenames, args.force)

    # seed the random number generator?
    if args.random_seed:
        random.seed(args.random_seed)

    # bound n_samples
    num_samples = max(args.num_samples, 1)

    #
    # Figure out what the output filename is going to be
    #
    output_file = args.output_file
    if output_file:
        if num_samples > 1:
            sys.stderr.write(
                "Error: cannot specify -o with more than one sample.\n")
            if not args.force:
                sys.exit(1)
        output_filename = output_file.name
    else:
        filename = args.filenames[0]
        output_filename = os.path.basename(filename) + '.subset'

    if num_samples == 1:
        print('Subsampling %d reads using reservoir sampling.' %
              args.num_reads, file=sys.stderr)
        print('Subsampled reads will be placed in %s' % output_filename,
              file=sys.stderr)
        print('', file=sys.stderr)
    else:  # > 1
        print('Subsampling %d reads, %d times,'
              % (args.num_reads, num_samples), ' using reservoir sampling.',
              file=sys.stderr)
        print('Subsampled reads will be placed in %s.N' % output_filename,
              file=sys.stderr)
        print('', file=sys.stderr)

    reads = [[] for _ in range(num_samples)]

    # read through all the sequences and load/resample the reservoir
    # NOTE(review): ``count`` comes from the reader and so restarts for every
    # input file; with several inputs the "first N reads" branch runs again
    # and the reservoirs grow past num_reads -- confirm whether a running
    # total across files is intended.
    for filename in args.filenames:
        print('opening', filename, 'for reading', file=sys.stderr)
        screed_iter = screed.open(filename, parse_description=False)

        for count, _ispair, rcrd1, rcrd2 in broken_paired_reader(
                screed_iter, force_single=args.force_single):
            if count % 10000 == 0:
                print('...', count, 'reads scanned', file=sys.stderr)

            # Checked on every read so the -M limit is honoured exactly.
            if count >= args.max_reads:
                print('reached upper limit of %d reads' % args.max_reads,
                      '(see -M); exiting', file=sys.stderr)
                break

            if count < args.num_reads:
                # collect first N reads
                for reservoir in reads:
                    reservoir.append((rcrd1, rcrd2))
            else:
                # use reservoir sampling to replace reads at random
                # see http://en.wikipedia.org/wiki/Reservoir_sampling
                for reservoir in reads:
                    guess = random.randint(1, count)
                    if guess <= args.num_reads:
                        reservoir[guess - 1] = (rcrd1, rcrd2)

    def write_reservoir(records, handle):
        # Write each entry, followed by its mate when there is one.
        for first, second in records:
            write_record(first, handle)
            if second is not None:
                write_record(second, handle)

    # output all the subsampled reads:
    if len(reads) == 1:
        print('Writing %d sequences to %s' %
              (len(reads[0]), output_filename), file=sys.stderr)
        if output_file:
            write_reservoir(reads[0], output_file)
        else:
            with open(output_filename, 'w') as handle:
                write_reservoir(reads[0], handle)
    else:
        for n in range(num_samples):
            n_filename = output_filename + '.%d' % n
            print('Writing %d sequences to %s' %
                  (len(reads[n]), n_filename), file=sys.stderr)
            # One file per sample; close each before opening the next.
            with open(n_filename, 'w') as handle:
                write_reservoir(reads[n], handle)
def main():
    """Split a broken-paired read file into a left and a right output file.

    Output names default to ``<basename>.1`` / ``<basename>.2`` (inside
    ``args.output_directory`` when given) and can be overridden with
    ``args.output_first`` / ``args.output_second``.  Paired records go to
    both files.  An unpaired record is routed by ``check_is_left`` /
    ``check_is_right`` on its name; the script exits with status 1 if the
    name matches neither, or if ``args.force_paired`` is set.
    """
    info('split-paired-reads.py')
    args = get_parser().parse_args()

    infile = args.infile

    check_input_files(infile, args.force)
    check_space([infile], args.force)

    # decide where to put output files - specific directory? or just default?
    basename = os.path.basename(infile)
    if args.output_directory:
        if not os.path.exists(args.output_directory):
            os.makedirs(args.output_directory)
        out1 = os.path.join(args.output_directory, basename + '.1')
        out2 = os.path.join(args.output_directory, basename + '.2')
    else:
        out1 = basename + '.1'
        out2 = basename + '.2'

    # OVERRIDE output file locations with -1, -2
    if args.output_first:
        out1 = args.output_first
    if args.output_second:
        out2 = args.output_second

    counter1 = 0
    counter2 = 0

    # The with-block guarantees both outputs are flushed and closed, also on
    # the sys.exit() paths below.
    with open(out1, 'w') as fp_out1, open(out2, 'w') as fp_out2:
        screed_iter = screed.open(infile, parse_description=False)

        # walk through all the reads in broken-paired mode.
        for index, is_pair, record1, record2 in \
                broken_paired_reader(screed_iter):
            if index % 100000 == 0 and index:
                print('...', index, file=sys.stderr)

            # are we requiring pairs?
            if args.force_paired and not is_pair:
                print('ERROR, %s is not part of a pair' % record1.name,
                      file=sys.stderr)
                sys.exit(1)

            if is_pair:
                write_record(record1, fp_out1)
                counter1 += 1
                write_record(record2, fp_out2)
                counter2 += 1
                continue

            # orphan: decide the side from the read name
            name = record1.name
            if check_is_left(name):
                write_record(record1, fp_out1)
                counter1 += 1
            elif check_is_right(name):
                write_record(record1, fp_out2)
                counter2 += 1
            else:
                print("Unrecognized format for read pair information: %s"
                      % name, file=sys.stderr)
                print("Exiting.", file=sys.stderr)
                sys.exit(1)

    print("DONE; split %d sequences (%d left, %d right)" %
          (counter1 + counter2, counter1, counter2), file=sys.stderr)
    print("/1 reads in %s" % out1, file=sys.stderr)
    print("/2 reads in %s" % out2, file=sys.stderr)
def main():
    """Trim reads at low-abundance k-mers using a streaming two-pass scheme.

    First pass: every input file is run through ``Trimmer.pass1``; reads it
    yields are written to the trimmed output, and the ones it sets aside go
    to a ``.pass2`` file in a temporary directory.  Second pass (skipped
    when ``args.single_pass`` is set): the set-aside reads are run through
    ``Trimmer.pass2`` and appended to the same output.  Output goes to
    ``args.output`` when given, otherwise to ``<basename>.abundtrim`` per
    input file.  Summary statistics are logged at the end and the
    countgraph is saved when ``args.savegraph`` is set.
    """
    parser = sanitize_help(get_parser())
    args = parser.parse_args()
    if not args.quiet:
        info('trim-low-abund.py', ['streaming'])

    configure_logging(args.quiet)

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        log_error("Error: Cannot input the same filename multiple times.")
        sys.exit(1)

    if args.trim_at_coverage != DEFAULT_TRIM_AT_COVERAGE and \
       not args.variable_coverage:
        log_error("Error: --trim-at-coverage/-Z given, but "
                  "--variable-coverage/-V not specified.")
        sys.exit(1)

    if args.diginorm_coverage != DEFAULT_DIGINORM_COVERAGE and \
       not args.diginorm:
        log_error("Error: --diginorm-coverage given, but "
                  "--diginorm not specified.")
        sys.exit(1)

    if args.diginorm and args.single_pass:
        log_error("Error: --diginorm and --single-pass are incompatible!\n"
                  "You probably want to use normalize-by-median.py instead.")
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \
       and not args.output:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    if args.loadgraph:
        log_info('loading countgraph from {graph}', graph=args.loadgraph)
        ct = khmer.load_countgraph(args.loadgraph)
    else:
        log_info('making countgraph')
        ct = khmer_args.create_countgraph(args)

    K = ct.ksize()
    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    log_info('created temporary directory {temp};\n'
             'use -T to change location', temp=tempdir)

    trimmer = Trimmer(ct, not args.variable_coverage, args.cutoff,
                      args.trim_at_coverage)
    if args.diginorm:
        trimmer.set_diginorm(args.diginorm_coverage)

    # ### FIRST PASS ###

    save_pass2_total = 0

    written_bp = 0
    written_reads = 0

    # only create the file writer once if outfp is specified; otherwise,
    # create it for each file.
    shared_trimfp = None
    if args.output:
        shared_trimfp = get_file_writer(args.output, args.gzip, args.bzip)

    # (writer, underlying file) pairs created below; closed at the very end
    # so that they are released on the single-pass path as well.
    owned_outputs = []

    pass2list = []
    for filename in args.input_filenames:
        # figure out temporary filename for 2nd pass
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        pass2fp = open(pass2filename, 'w')

        # construct output filenames
        if args.output is None:
            outfp = open(os.path.basename(filename) + '.abundtrim', 'wb')

            # get file handle w/gzip, bzip
            trimfp = get_file_writer(outfp, args.gzip, args.bzip)
            owned_outputs.append((trimfp, outfp))
        else:
            trimfp = shared_trimfp

        # record all this info
        pass2list.append((filename, pass2filename, trimfp))

        # input file stuff: get a broken_paired reader.
        screed_iter = screed.open(filename)
        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=args.ignore_pairs)

        # main loop through the file.
        n_start = trimmer.n_reads
        save_start = trimmer.n_saved

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass1(paired_iter, pass2fp):
            if (trimmer.n_reads - n_start) > watermark:
                log_info("... {filename} {n_saved} {n_reads} {n_bp} "
                         "{w_reads} {w_bp}", filename=filename,
                         n_saved=trimmer.n_saved, n_reads=trimmer.n_reads,
                         n_bp=trimmer.n_bp, w_reads=written_reads,
                         w_bp=written_bp)
                watermark += REPORT_EVERY_N_READS

            # write out the trimmed/etc sequences that AREN'T going to be
            # revisited in a 2nd pass.
            write_record(read, trimfp)
            written_bp += len(read)
            written_reads += 1
        pass2fp.close()

        log_info("{filename}: kept aside {kept} of {total} from first pass",
                 filename=filename, kept=trimmer.n_saved - save_start,
                 total=trimmer.n_reads - n_start)

    # first pass goes across all the data, so record relevant stats...
    n_reads = trimmer.n_reads
    n_bp = trimmer.n_bp
    n_skipped = trimmer.n_skipped
    bp_skipped = trimmer.bp_skipped
    save_pass2_total = trimmer.n_saved

    # ### SECOND PASS. ###

    # nothing should have been skipped yet!
    assert trimmer.n_skipped == 0
    assert trimmer.bp_skipped == 0

    if args.single_pass:
        pass2list = []

    # go back through all the files again.
    for _, pass2filename, pass2_trimfp in pass2list:
        log_info('second pass: looking at sequences kept aside in {pass2}',
                 pass2=pass2filename)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.  Hence, force_single=True below.

        screed_iter = screed.open(pass2filename, parse_description=False)
        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=True)

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass2(paired_iter):
            # n_start still holds the value set for the last first-pass file.
            if (trimmer.n_reads - n_start) > watermark:
                log_info('... x 2 {a} {b} {c} {d} {e} {f} {g}',
                         a=trimmer.n_reads - n_start,
                         b=pass2filename, c=trimmer.n_saved,
                         d=trimmer.n_reads, e=trimmer.n_bp,
                         f=written_reads, g=written_bp)
                watermark += REPORT_EVERY_N_READS

            write_record(read, pass2_trimfp)
            written_reads += 1
            written_bp += len(read)

        log_info('removing {pass2}', pass2=pass2filename)
        os.unlink(pass2filename)

    # All writing is done: close every output.  Writers go first so that a
    # compressed stream is finished before its underlying file is closed.
    if shared_trimfp is not None:
        shared_trimfp.close()
    for writer, raw in owned_outputs:
        writer.close()
        if raw is not writer:
            raw.close()

    log_info('removing temp directory & contents ({temp})', temp=tempdir)
    shutil.rmtree(tempdir)

    trimmed_reads = trimmer.trimmed_reads

    # Empty input leaves n_reads / n_bp at zero; report zeros instead of
    # dying with ZeroDivisionError.
    if n_reads:
        n_passes = 1.0 + (float(save_pass2_total) / n_reads)
        percent_reads_trimmed = \
            float(trimmed_reads + (n_reads - written_reads)) / \
            n_reads * 100.0
    else:
        n_passes = 1.0
        percent_reads_trimmed = 0.0

    if n_bp:
        percent_bp_trimmed = (1 - (written_bp / float(n_bp))) * 100.0
    else:
        percent_bp_trimmed = 0.0

    log_info('read {read} reads, {bp} bp', read=n_reads, bp=n_bp)
    log_info('wrote {wr} reads, {wbp} bp', wr=written_reads, wbp=written_bp)
    log_info('looked at {st} reads twice ({np:.2f} passes)',
             st=save_pass2_total, np=n_passes)
    log_info('removed {r} reads and trimmed {t} reads ({p:.2f}%)',
             r=n_reads - written_reads, t=trimmed_reads,
             p=percent_reads_trimmed)
    # A single '%' here, matching the message above: these templates use
    # {}-style fields, so '%%' would be emitted as two characters.
    log_info('trimmed or removed {p:.2f}% of bases ({bp} total)',
             p=percent_bp_trimmed, bp=n_bp - written_bp)

    if args.variable_coverage:
        if n_reads:
            percent_reads_hicov = \
                100.0 * float(n_reads - n_skipped) / n_reads
        else:
            percent_reads_hicov = 0.0
        log_info('{n} reads were high coverage ({p:.2f}%);',
                 n=n_reads - n_skipped, p=percent_reads_hicov)
        log_info('skipped {r} reads/{bp} bases because of low coverage',
                 r=n_skipped, bp=bp_skipped)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    log_info('output in *.abundtrim')

    if args.savegraph:
        log_info("Saving k-mer countgraph to {graph}", graph=args.savegraph)
        ct.save(args.savegraph)
def main():
    """Separate an interleaved read file into paired and orphaned reads.

    Pairs reported by ``broken_paired_reader`` are written with
    ``write_record_pair`` to the paired output (default ``<basename>.pe``);
    all other reads go to the single-read output (default
    ``<basename>.se``).  When reading from stdin both outputs must be given
    explicitly.

    Raises:
        Exception: if the input contained no pairs at all.
    """
    args = sanitize_help(get_parser()).parse_args()

    infile = args.infile
    check_input_files(infile, args.force)
    check_space([infile], args.force)

    # decide where to put output files - specific directory? or just default?
    if infile in ('/dev/stdin', '-'):
        # seqan only treats '-' as "read from stdin"
        infile = '-'
        if not (args.output_paired and args.output_single):
            print("Accepting input from stdin; output filenames must be "
                  "provided.", file=sys.stderr)
            sys.exit(1)
    elif args.output_dir:
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        basename = os.path.basename(infile)
        out1 = os.path.join(args.output_dir, basename + '.se')
        out2 = os.path.join(args.output_dir, basename + '.pe')
    else:
        out1 = os.path.basename(infile) + '.se'
        out2 = os.path.basename(infile) + '.pe'

    own_files = []  # plain files opened here (default output names)
    writers = []    # every writer records are written through

    n_pe = 0
    n_se = 0

    try:
        # OVERRIDE default output file locations with -p, -s
        if args.output_paired:
            paired_fp = get_file_writer(args.output_paired, args.gzip,
                                        args.bzip)
            out2 = paired_fp.name
        else:
            # Don't override, just open the default filename from above
            own_files.append(open(out2, 'wb'))
            paired_fp = get_file_writer(own_files[-1], args.gzip, args.bzip)
        writers.append(paired_fp)

        if args.output_single:
            single_fp = get_file_writer(args.output_single, args.gzip,
                                        args.bzip)
            out1 = args.output_single.name
        else:
            # Don't override, just open the default filename from above
            own_files.append(open(out1, 'wb'))
            single_fp = get_file_writer(own_files[-1], args.gzip, args.bzip)
        writers.append(single_fp)

        print('reading file "%s"' % infile, file=sys.stderr)
        print('outputting interleaved pairs to "%s"' % out2,
              file=sys.stderr)
        print('outputting orphans to "%s"' % out1, file=sys.stderr)

        screed_iter = ReadParser(infile)
        for index, is_pair, read1, read2 in \
                broken_paired_reader(screed_iter):
            if index % 100000 == 0 and index > 0:
                print('...', index, file=sys.stderr)

            if is_pair:
                write_record_pair(read1, read2, paired_fp)
                n_pe += 1
            else:
                write_record(read1, single_fp)
                n_se += 1
    finally:
        # Writers first so compressed streams are finished, then the plain
        # files this function opened underneath them.
        for handle in writers + own_files:
            handle.close()

    if n_pe == 0:
        raise Exception("no paired reads!? check file formats...")

    print('DONE; read %d sequences,'
          ' %d pairs and %d singletons' %
          (n_pe * 2 + n_se, n_pe, n_se), file=sys.stderr)
    print('wrote to: %s and %s' % (out2, out1), file=sys.stderr)
def main():  # pylint: disable=too-many-branches,too-many-statements
    """Run digital normalization over every input file.

    Each input is read through ``broken_paired_reader`` and passed through
    ``WithDiagnostics``/``Normalizer``; every non-``None`` record it yields
    is written out.  Output goes to ``args.single_output_file`` when given,
    otherwise to ``<basename>.keep`` per input file.  The counting table is
    loaded from ``args.loadtable`` or created, and saved to
    ``args.savetable`` when requested.
    """
    info('normalize-by-median.py', ['diginorm'])
    args = get_parser().parse_args()

    report_on_config(args)

    report_fp = args.report
    force_single = args.force_single

    # check for similar filenames
    # if we're using a single output file only check for identical filenames
    # otherwise, check for identical BASE names as well.
    filenames = []
    basenames = []
    for pathfilename in args.input_filenames:
        filenames.append(pathfilename)
        if args.single_output_file:
            continue  # nothing more to worry about

        basename = os.path.basename(pathfilename)
        if basename in basenames:
            print('ERROR: Duplicate filename--Cannot handle this!',
                  file=sys.stderr)
            print('** Exiting!', file=sys.stderr)
            sys.exit(1)

        basenames.append(basename)

    # check that files exist and there is sufficient output disk space.
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize,
                                  args.force)

    # load or create counting table.
    if args.loadtable:
        print('loading k-mer counting table from ' + args.loadtable,
              file=sys.stderr)
        htable = khmer.load_counting_hash(args.loadtable)
    else:
        print('making k-mer counting table', file=sys.stderr)
        htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                         args.n_tables)

    # create an object to handle diginorm of all files
    norm = Normalizer(args.cutoff, htable)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = [[name, args.paired] for name in filenames]
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    corrupt_files = []
    outfp = None
    output_name = None

    if args.single_output_file:
        if args.single_output_file is sys.stdout:
            output_name = '/dev/stdout'
        else:
            output_name = args.single_output_file.name
        outfp = args.single_output_file

    #
    # main loop: iterate over all files given, do diginorm.
    #
    for filename, require_paired in files:
        if not args.single_output_file:
            output_name = os.path.basename(filename) + '.keep'
            outfp = open(output_name, 'w')

        # failsafe context manager in case an input file breaks
        with CatchIOErrors(filename, outfp, args.single_output_file,
                           args.force, corrupt_files):

            screed_iter = screed.open(filename, parse_description=False)
            reader = broken_paired_reader(screed_iter,
                                          min_length=args.ksize,
                                          force_single=force_single,
                                          require_paired=require_paired)

            # actually do diginorm
            for record in WithDiagnostics(filename, norm, reader,
                                          report_fp):
                if record is not None:
                    write_record(record, outfp)

            print('output in ' + output_name, file=sys.stderr)
            # Only a per-input output is finished at this point; a shared
            # output must stay open for the remaining input files.
            if not args.single_output_file:
                outfp.close()

    # The shared output is complete once every input has been processed.
    # stdout is left open for the interpreter.
    if args.single_output_file and \
       args.single_output_file is not sys.stdout:
        outfp.close()

    # finished - print out some diagnostics.
    print('Total number of unique k-mers: {0}'.format(
        htable.n_unique_kmers()), file=sys.stderr)

    if args.savetable:
        print('...saving to ' + args.savetable, file=sys.stderr)
        htable.save(args.savetable)

    fp_rate = \
        khmer.calc_expected_collisions(htable, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975

    print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate),
          file=sys.stderr)

    if args.force and len(corrupt_files) > 0:
        print("** WARNING: Finished with errors!", file=sys.stderr)
        print("** IOErrors occurred in the following files:",
              file=sys.stderr)
        print("\t", " ".join(corrupt_files), file=sys.stderr)
def main():
    """Subsample reads from the input files with reservoir sampling.

    Keeps ``num_samples`` independent reservoirs of ``args.num_reads``
    entries each, filled from all input files taken together.  A single
    sample is written to ``args.output_file`` or to
    ``<basename of first input>.subset``; several samples are written to
    ``<output name>.N``.  Each reservoir entry is a ``(read1, read2)`` tuple
    whose second item may be ``None``.
    """
    parser = get_parser()
    parser.epilog = parser.epilog.replace(
        "`reservoir sampling\n"
        "<http://en.wikipedia.org/wiki/Reservoir_sampling>`__ algorithm.",
        "reservoir sampling algorithm. "
        "http://en.wikipedia.org/wiki/Reservoir_sampling")
    args = sanitize_help(parser).parse_args()

    for name in args.filenames:
        check_input_files(name, args.force)

    # seed the random number generator?
    if args.random_seed:
        random.seed(args.random_seed)

    # bound n_samples
    num_samples = max(args.num_samples, 1)

    #
    # Figure out what the output filename is going to be
    if args.output_file:
        output_filename = args.output_file.name
        if num_samples > 1:
            sys.stderr.write(
                "Error: cannot specify -o with more than one sample.\n")
            if not args.force:
                print("NOTE: This can be overridden using the --force"
                      " argument", file=sys.stderr)
                sys.exit(1)
    else:
        filename = args.filenames[0]
        if filename in ('/dev/stdin', '-'):
            print("Accepting input from stdin; output filename must "
                  "be provided with '-o'.", file=sys.stderr)
            sys.exit(1)
        output_filename = os.path.basename(filename) + '.subset'

    if num_samples == 1:
        print('Subsampling %d reads using reservoir sampling.' %
              args.num_reads, file=sys.stderr)
        print('Subsampled reads will be placed in %s' % output_filename,
              file=sys.stderr)
        print('', file=sys.stderr)
    else:  # > 1
        print('Subsampling %d reads, %d times,'
              % (args.num_reads, num_samples), ' using reservoir sampling.',
              file=sys.stderr)
        print('Subsampled reads will be placed in %s.N' % output_filename,
              file=sys.stderr)
        print('', file=sys.stderr)

    reads = [[] for _ in range(num_samples)]

    # read through all the sequences and load/resample the reservoir
    #
    # ``seen`` counts reads across ALL input files.  Restarting the count
    # for every file would re-enter the "first N reads" branch and let the
    # reservoirs grow past num_reads.
    seen = 0
    limit_reached = False
    for filename in args.filenames:
        print('opening', filename, 'for reading', file=sys.stderr)

        # seqan only treats '-' as "read from stdin"
        parser_name = '-' if filename in ('/dev/stdin', '-') else filename

        reader = broken_paired_reader(ReadParser(parser_name),
                                      force_single=args.force_single)
        for _, _, rcrd1, rcrd2 in reader:
            if seen % 10000 == 0:
                print('...', seen, 'reads scanned', file=sys.stderr)

            # Checked on every read so the -M limit is honoured exactly.
            if seen >= args.max_reads:
                print('reached upper limit of %d reads' % args.max_reads,
                      '(see -M); exiting', file=sys.stderr)
                limit_reached = True
                break

            if seen < args.num_reads:
                # collect first N reads
                for reservoir in reads:
                    reservoir.append((rcrd1, rcrd2))
            else:
                # use reservoir sampling to replace reads at random
                # see http://en.wikipedia.org/wiki/Reservoir_sampling
                for reservoir in reads:
                    guess = random.randint(1, seen)
                    if guess <= args.num_reads:
                        reservoir[guess - 1] = (rcrd1, rcrd2)

            seen += 1

        if limit_reached:
            break

    def write_reservoir(records, raw_handle, close_raw):
        # Write one reservoir through a (possibly compressing) writer and
        # close the writer so the stream is complete on disk.
        writer = get_file_writer(raw_handle, args.gzip, args.bzip)
        try:
            for first, second in records:
                write_record(first, writer)
                if second is not None:
                    write_record(second, writer)
        finally:
            writer.close()
            if close_raw and raw_handle is not writer:
                raw_handle.close()

    # output all the subsampled reads:
    if len(reads) == 1:
        print('Writing %d sequences to %s' %
              (len(reads[0]), output_filename), file=sys.stderr)

        if args.output_file:
            write_reservoir(reads[0], args.output_file, False)
        else:
            write_reservoir(reads[0], open(output_filename, 'wb'), True)
    else:
        for n in range(num_samples):
            n_filename = output_filename + '.%d' % n
            print('Writing %d sequences to %s' %
                  (len(reads[n]), n_filename), file=sys.stderr)
            write_reservoir(reads[n], open(n_filename, 'wb'), True)
def main():  # pylint: disable=too-many-branches,too-many-statements
    """Run digital normalization over every input file.

    Each input is read through ``broken_paired_reader`` and passed through
    ``WithDiagnostics``/``Normalizer``; every non-``None`` record it yields
    is written out.  Output goes to ``args.single_output_file`` when given,
    otherwise to ``<basename>.keep`` per input file.  The counting table is
    loaded from ``args.loadtable`` or created as a countgraph, and saved to
    ``args.savetable`` when requested.
    """
    info('normalize-by-median.py', ['diginorm'])
    args = get_parser().parse_args()

    report_on_config(args)

    report_fp = args.report
    force_single = args.force_single

    # check for similar filenames
    # if we're using a single output file only check for identical filenames
    # otherwise, check for identical BASE names as well.
    filenames = []
    basenames = []
    for pathfilename in args.input_filenames:
        filenames.append(pathfilename)
        if args.single_output_file:
            continue  # nothing more to worry about

        basename = os.path.basename(pathfilename)
        if basename in basenames:
            print('ERROR: Duplicate filename--Cannot handle this!',
                  file=sys.stderr)
            print('** Exiting!', file=sys.stderr)
            sys.exit(1)

        basenames.append(basename)

    # check that files exist and there is sufficient output disk space.
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savetable:
        check_space_for_hashtable(args, 'countgraph', args.force)

    # load or create counting table.
    if args.loadtable:
        print('loading k-mer counting table from ' + args.loadtable,
              file=sys.stderr)
        htable = khmer.load_counting_hash(args.loadtable)
    else:
        print('making countgraph', file=sys.stderr)
        htable = khmer_args.create_countgraph(args)

    # create an object to handle diginorm of all files
    norm = Normalizer(args.cutoff, htable)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = [[name, args.paired] for name in filenames]
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    corrupt_files = []
    outfp = None
    output_name = None

    if args.single_output_file:
        if args.single_output_file is sys.stdout:
            output_name = '/dev/stdout'
        else:
            output_name = args.single_output_file.name
        outfp = args.single_output_file

    #
    # main loop: iterate over all files given, do diginorm.
    #
    for filename, require_paired in files:
        if not args.single_output_file:
            output_name = os.path.basename(filename) + '.keep'
            outfp = open(output_name, 'w')

        # failsafe context manager in case an input file breaks
        with CatchIOErrors(filename, outfp, args.single_output_file,
                           args.force, corrupt_files):

            screed_iter = screed.open(filename, parse_description=False)
            reader = broken_paired_reader(screed_iter,
                                          min_length=args.ksize,
                                          force_single=force_single,
                                          require_paired=require_paired)

            # actually do diginorm
            for record in WithDiagnostics(filename, norm, reader,
                                          report_fp):
                if record is not None:
                    write_record(record, outfp)

            print('output in ' + output_name, file=sys.stderr)
            # Only a per-input output is finished at this point; a shared
            # output must stay open for the remaining input files.
            if not args.single_output_file:
                outfp.close()

    # The shared output is complete once every input has been processed.
    # stdout is left open for the interpreter.
    if args.single_output_file and \
       args.single_output_file is not sys.stdout:
        outfp.close()

    # finished - print out some diagnostics.
    print('Total number of unique k-mers: {0}'
          .format(htable.n_unique_kmers()),
          file=sys.stderr)

    if args.savetable:
        print('...saving to ' + args.savetable, file=sys.stderr)
        htable.save(args.savetable)

    fp_rate = \
        khmer.calc_expected_collisions(htable, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975

    print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate),
          file=sys.stderr)

    if args.force and len(corrupt_files) > 0:
        print("** WARNING: Finished with errors!", file=sys.stderr)
        print("** I/O Errors occurred in the following files:",
              file=sys.stderr)
        print("\t", " ".join(corrupt_files), file=sys.stderr)
def main():  # pylint: disable=too-many-branches,too-many-statements
    """Filter reads from every input file against two loaded countgraphs.

    Both countgraphs are loaded from ``args.loadgraph`` and
    ``args.loadgraph2``.  Each input is cleaned with ``clean_input_reads``,
    read through ``broken_paired_reader`` and handed to ``snarf`` together
    with both graphs; every non-``None`` record ``snarf`` yields is written
    out.  Output goes to ``args.single_output_file`` when given, otherwise
    to ``<basename>.keep`` per input file.  The elapsed wall-clock time is
    printed to stdout at the end.
    """
    start_time = time.time()
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    configure_logging(args.quiet)
    report_on_config(args)

    force_single = args.force_single

    # check for similar filenames
    # if we're using a single output file only check for identical filenames
    # otherwise, check for identical BASE names as well.
    filenames = []
    basenames = []
    for pathfilename in args.input_filenames:
        filenames.append(pathfilename)
        if args.single_output_file:
            continue  # nothing more to worry about

        basename = os.path.basename(pathfilename)
        if basename in basenames:
            log_error('ERROR: Duplicate filename--Cannot handle this!')
            log_error('** Exiting!')
            sys.exit(1)

        basenames.append(basename)

    # check that files exist and there is sufficient output disk space.
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph is not None:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    # Both graphs are passed to snarf() for every read, so fail up front
    # instead of hitting an unbound name in the middle of the run.
    if not (args.loadgraph and args.loadgraph2):
        log_error('ERROR: two countgraphs are required '
                  '(loadgraph and loadgraph2).')
        sys.exit(1)

    # load or create counting table.
    log_info('loading k-mer countgraph from {graph}', graph=args.loadgraph)
    countgraph1 = Countgraph.load(args.loadgraph)

    # load second counting table.
    log_info('loading k-mer countgraph from {graph}', graph=args.loadgraph2)
    countgraph2 = Countgraph.load(args.loadgraph2)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = [[element, args.paired] for element in filenames]
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    # A shared output is wrapped once and reused for every input file.
    single_writer = None
    if args.single_output_file:
        single_writer = get_file_writer(args.single_output_file, args.gzip,
                                        args.bzip)

    #
    # main loop: iterate over all files given, do diginorm.
    #
    for filename, require_paired in files:
        raw_fp = None
        if single_writer is not None:
            outfp = single_writer
        else:
            output_name = os.path.basename(filename) + '.keep'
            raw_fp = open(output_name, 'wb')
            outfp = get_file_writer(raw_fp, args.gzip, args.bzip)

        try:
            screed_iter = clean_input_reads(screed.open(filename))
            reader = broken_paired_reader(screed_iter,
                                          min_length=args.ksize,
                                          force_single=force_single,
                                          require_paired=require_paired)

            # actually do diginorm
            for _, is_paired, read0, read1 in reader:
                for record in snarf(is_paired, read0, read1,
                                    countgraph1, countgraph2):
                    if record is not None:
                        write_record(record, outfp)
        finally:
            # Per-input outputs are complete here; close the writer before
            # the file underneath it.
            if raw_fp is not None:
                outfp.close()
                if raw_fp is not outfp:
                    raw_fp.close()

    print("--- %s seconds ---" % (time.time() - start_time))

    # Finish the shared output last: the timing line above goes to stdout,
    # which may be that very output.
    if single_writer is not None and single_writer is not sys.stdout:
        single_writer.close()
def main():
    """Two-pass, streaming abundance trimming of the input read files.

    First pass: every read (or read pair) is checked against the k-mer
    counting table.  Reads whose median k-mer count is below
    ``args.normalize_to`` are consumed into the table and set aside in a
    temporary ``.pass2`` file; all other reads are trimmed at the first k-mer
    below ``args.cutoff`` and written to ``<basename>.abundtrim``.

    Second pass: the set-aside reads are re-examined against the now fuller
    table and appended to the same ``.abundtrim`` file, either unchanged
    (low coverage with ``args.variable_coverage``) or trimmed.

    Progress and diagnostics go to stderr, the summary statistics to stdout.
    Exits with status 1 on duplicate input filenames or when the estimated
    false-positive rate exceeds ``MAX_FALSE_POSITIVE_RATE``.
    """
    info('trim-low-abund.py', ['streaming'])
    parser = get_parser()
    args = parser.parse_args()

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        print("Error: Cannot input the same filename multiple times.",
              file=sys.stderr)
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savetable:
        check_space_for_hashtable(
            args.n_tables * args.min_tablesize, args.force)

    K = args.ksize
    CUTOFF = args.cutoff
    NORMALIZE_LIMIT = args.normalize_to

    if args.loadtable:
        print('loading k-mer counting table from', args.loadtable,
              file=sys.stderr)
        ct = khmer.load_counting_hash(args.loadtable)
    else:
        print('making k-mer counting table', file=sys.stderr)
        ct = khmer.new_counting_hash(K, args.min_tablesize, args.n_tables)

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print('created temporary directory %s; '
          'use -T to change location' % tempdir, file=sys.stderr)

    # ### FIRST PASS ###

    save_pass2_total = 0

    n_bp = 0
    n_reads = 0
    written_bp = 0
    written_reads = 0
    trimmed_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        trimfilename = os.path.basename(filename) + '.abundtrim'

        pass2list.append((filename, pass2filename, trimfilename))

        screed_iter = screed.open(filename, parse_description=False)

        save_pass2 = 0
        n = 0

        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=args.ignore_pairs)
        # 'with' guarantees both outputs are flushed and closed even when a
        # read fails to parse part-way through the file.
        with open(pass2filename, 'w') as pass2fp, \
                open(trimfilename, 'w') as trimfp:
            for n, is_pair, read1, read2 in paired_iter:
                if n % 10000 == 0:
                    print('...', n, filename, save_pass2,
                          n_reads, n_bp, written_reads, written_bp,
                          file=sys.stderr)

                # we want to track paired reads here, to make sure that pairs
                # are not split between first pass and second pass.

                if is_pair:
                    n_reads += 2
                    n_bp += len(read1.sequence) + len(read2.sequence)

                    seq1 = read1.sequence.replace('N', 'A')
                    seq2 = read2.sequence.replace('N', 'A')

                    med1, _, _ = ct.get_median_count(seq1)
                    med2, _, _ = ct.get_median_count(seq2)

                    if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT:
                        ct.consume(seq1)
                        ct.consume(seq2)
                        write_record_pair(read1, read2, pass2fp)
                        save_pass2 += 2
                    else:
                        _, trim_at1 = ct.trim_on_abundance(seq1, CUTOFF)
                        _, trim_at2 = ct.trim_on_abundance(seq2, CUTOFF)

                        # NOTE(review): a mate with trim_at < K is written
                        # untrimmed, yet written_bp below adds trim_at, so
                        # the base count can be understated -- confirm this
                        # is intended.
                        if trim_at1 >= K:
                            read1 = trim_record(read1, trim_at1)

                        if trim_at2 >= K:
                            read2 = trim_record(read2, trim_at2)

                        if trim_at1 != len(seq1):
                            trimmed_reads += 1
                        if trim_at2 != len(seq2):
                            trimmed_reads += 1

                        write_record_pair(read1, read2, trimfp)
                        written_reads += 2
                        written_bp += trim_at1 + trim_at2
                else:
                    n_reads += 1
                    n_bp += len(read1.sequence)

                    seq = read1.sequence.replace('N', 'A')

                    med, _, _ = ct.get_median_count(seq)

                    # has this portion of the graph saturated? if not,
                    # consume & save => pass2.
                    if med < NORMALIZE_LIMIT:
                        ct.consume(seq)
                        write_record(read1, pass2fp)
                        save_pass2 += 1
                    else:                       # trim!!
                        _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
                        if trim_at >= K:
                            new_read = trim_record(read1, trim_at)
                            write_record(new_read, trimfp)

                            written_reads += 1
                            written_bp += trim_at

                            if trim_at != len(read1.sequence):
                                trimmed_reads += 1

        print('%s: kept aside %d of %d from first pass, in %s' %
              (filename, save_pass2, n, filename))
        save_pass2_total += save_pass2

    # ### SECOND PASS. ###

    skipped_n = 0
    skipped_bp = 0
    for _, pass2filename, trimfilename in pass2list:
        print('second pass: looking at sequences kept aside in %s' %
              pass2filename)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.

        # Append to the first-pass output; the context manager closes the
        # handle that was previously left open.
        with open(trimfilename, 'a') as trimfp:
            for n, read in enumerate(screed.open(pass2filename,
                                                 parse_description=False)):
                if n % 10000 == 0:
                    print('... x 2', n, pass2filename,
                          written_reads, written_bp, file=sys.stderr)

                seq = read.sequence.replace('N', 'A')
                med, _, _ = ct.get_median_count(seq)

                # do we retain low-abundance components unchanged?
                if med < NORMALIZE_LIMIT and args.variable_coverage:
                    write_record(read, trimfp)

                    written_reads += 1
                    written_bp += len(read.sequence)
                    skipped_n += 1
                    skipped_bp += len(read.sequence)

                # otherwise, examine/trim/truncate.
                else:    # med >= NORMALIZE LIMIT or not args.variable_coverage
                    _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
                    if trim_at >= K:
                        new_read = trim_record(read, trim_at)
                        write_record(new_read, trimfp)

                        written_reads += 1
                        written_bp += trim_at

                        if trim_at != len(read.sequence):
                            trimmed_reads += 1

        print('removing %s' % pass2filename, file=sys.stderr)
        os.unlink(pass2filename)

    print('removing temp directory & contents (%s)' % tempdir,
          file=sys.stderr)
    shutil.rmtree(tempdir)

    # Guard every ratio: input with no usable reads must yield a zeroed
    # report, not a ZeroDivisionError.
    if n_reads:
        n_passes = 1.0 + (float(save_pass2_total) / n_reads)
        percent_reads_trimmed = \
            float(trimmed_reads + (n_reads - written_reads)) / \
            n_reads * 100.0
    else:
        n_passes = 1.0
        percent_reads_trimmed = 0.0

    if n_bp:
        percent_bp_removed = (1 - (written_bp / float(n_bp))) * 100.0
    else:
        percent_bp_removed = 0.0

    print('read %d reads, %d bp' % (n_reads, n_bp,))
    print('wrote %d reads, %d bp' % (written_reads, written_bp,))
    print('looked at %d reads twice (%.2f passes)' % (save_pass2_total,
                                                      n_passes))
    print('removed %d reads and trimmed %d reads (%.2f%%)' %
          (n_reads - written_reads, trimmed_reads, percent_reads_trimmed))
    print('trimmed or removed %.2f%% of bases (%d total)' %
          (percent_bp_removed, n_bp - written_bp))

    if args.variable_coverage:
        if n_reads:
            percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
        else:
            percent_reads_hicov = 0.0
        print('%d reads were high coverage (%.2f%%);' % (n_reads - skipped_n,
                                                         percent_reads_hicov))
        print('skipped %d reads/%d bases because of low coverage' %
              (skipped_n, skipped_bp))

    fp_rate = khmer.calc_expected_collisions(ct)
    print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate),
          file=sys.stderr)

    if fp_rate > MAX_FALSE_POSITIVE_RATE:
        print("**", file=sys.stderr)
        print("** ERROR: the k-mer counting table is too small"
              " for this data set. Increase tablesize/# "
              "tables.", file=sys.stderr)
        print("**", file=sys.stderr)
        print("** Do not use these results!!", file=sys.stderr)
        sys.exit(1)

    print('output in *.abundtrim')

    if args.savetable:
        print("Saving k-mer counting table to", args.savetable,
              file=sys.stderr)
        ct.save(args.savetable)
def main():
    """Two-pass, streaming abundance trimming of the input read files.

    First pass: every read (or read pair) is checked against the countgraph.
    Reads whose median k-mer count is below ``args.normalize_to`` are
    consumed into the graph and set aside in a temporary ``.pass2`` file; all
    other reads are trimmed at the first k-mer below ``args.cutoff`` and
    written to the output.

    Second pass: the set-aside reads are re-examined against the now fuller
    graph and written to the same output, either unchanged (low coverage
    with ``args.variable_coverage``) or trimmed.

    Output goes to ``args.output`` when given, otherwise to one
    ``<basename>.abundtrim`` file per input.  All messages go to stderr.
    Exits with status 1 on duplicate input filenames or when reading from
    stdin without an explicit output.
    """
    info('trim-low-abund.py', ['streaming'])
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        print("Error: Cannot input the same filename multiple times.",
              file=sys.stderr)
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \
       and not args.output:
        print("Accepting input from stdin; output filename must "
              "be provided with -o.", file=sys.stderr)
        sys.exit(1)

    if args.loadgraph:
        print('loading countgraph from', args.loadgraph, file=sys.stderr)
        ct = khmer.load_countgraph(args.loadgraph)
    else:
        print('making countgraph', file=sys.stderr)
        ct = khmer_args.create_countgraph(args)

    K = ct.ksize()
    CUTOFF = args.cutoff
    NORMALIZE_LIMIT = args.normalize_to

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print('created temporary directory %s; '
          'use -T to change location' % tempdir, file=sys.stderr)

    # Create the writer for a user-supplied output exactly once.  Calling
    # get_file_writer per input file would stack several independent
    # compression wrappers on the same handle and interleave their output.
    shared_trimfp = None
    if args.output is not None:
        shared_trimfp = get_file_writer(args.output, args.gzip, args.bzip)

    # ### FIRST PASS ###

    save_pass2_total = 0

    n_bp = 0
    n_reads = 0
    written_bp = 0
    written_reads = 0
    trimmed_reads = 0

    # entries: (input filename, pass2 filename, writer, raw handle or None);
    # the raw handle is set only for per-file outputs that this function
    # opened and therefore has to close.
    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)

        if shared_trimfp is None:
            raw_trimfp = open(os.path.basename(filename) + '.abundtrim', 'wb')
            trimfp = get_file_writer(raw_trimfp, args.gzip, args.bzip)
        else:
            raw_trimfp = None
            trimfp = shared_trimfp

        pass2list.append((filename, pass2filename, trimfp, raw_trimfp))

        screed_iter = screed.open(filename)

        save_pass2 = 0
        n = 0

        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=args.ignore_pairs)
        # 'with' makes sure the set-aside file is flushed and closed before
        # the second pass reads it, even if a read fails to parse.
        with open(pass2filename, 'w') as pass2fp:
            for n, is_pair, read1, read2 in paired_iter:
                if n % 10000 == 0:
                    print('...', n, filename, save_pass2, n_reads, n_bp,
                          written_reads, written_bp, file=sys.stderr)

                # we want to track paired reads here, to make sure that pairs
                # are not split between first pass and second pass.

                if is_pair:
                    n_reads += 2
                    n_bp += len(read1.sequence) + len(read2.sequence)

                    seq1 = read1.sequence.replace('N', 'A')
                    seq2 = read2.sequence.replace('N', 'A')

                    med1, _, _ = ct.get_median_count(seq1)
                    med2, _, _ = ct.get_median_count(seq2)

                    if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT:
                        ct.consume(seq1)
                        ct.consume(seq2)
                        write_record_pair(read1, read2, pass2fp)
                        save_pass2 += 2
                    else:
                        _, trim_at1 = ct.trim_on_abundance(seq1, CUTOFF)
                        _, trim_at2 = ct.trim_on_abundance(seq2, CUTOFF)

                        # NOTE(review): a mate with trim_at < K is written
                        # untrimmed, yet written_bp below adds trim_at, so
                        # the base count can be understated -- confirm this
                        # is intended.
                        if trim_at1 >= K:
                            read1 = trim_record(read1, trim_at1)

                        if trim_at2 >= K:
                            read2 = trim_record(read2, trim_at2)

                        if trim_at1 != len(seq1):
                            trimmed_reads += 1
                        if trim_at2 != len(seq2):
                            trimmed_reads += 1

                        write_record_pair(read1, read2, trimfp)
                        written_reads += 2
                        written_bp += trim_at1 + trim_at2
                else:
                    n_reads += 1
                    n_bp += len(read1.sequence)

                    seq = read1.sequence.replace('N', 'A')

                    med, _, _ = ct.get_median_count(seq)

                    # has this portion of the graph saturated? if not,
                    # consume & save => pass2.
                    if med < NORMALIZE_LIMIT:
                        ct.consume(seq)
                        write_record(read1, pass2fp)
                        save_pass2 += 1
                    else:                       # trim!!
                        _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
                        if trim_at >= K:
                            new_read = trim_record(read1, trim_at)
                            write_record(new_read, trimfp)

                            written_reads += 1
                            written_bp += trim_at

                            if trim_at != len(read1.sequence):
                                trimmed_reads += 1

        print('%s: kept aside %d of %d from first pass, in %s' %
              (filename, save_pass2, n, filename),
              file=sys.stderr)
        save_pass2_total += save_pass2

    # ### SECOND PASS. ###

    skipped_n = 0
    skipped_bp = 0
    for _, pass2filename, trimfp, raw_trimfp in pass2list:
        print('second pass: looking at sequences kept aside in %s' %
              pass2filename,
              file=sys.stderr)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.

        for n, read in enumerate(screed.open(pass2filename)):
            if n % 10000 == 0:
                print('... x 2', n, pass2filename,
                      written_reads, written_bp, file=sys.stderr)

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ct.get_median_count(seq)

            # do we retain low-abundance components unchanged?
            if med < NORMALIZE_LIMIT and args.variable_coverage:
                write_record(read, trimfp)

                written_reads += 1
                written_bp += len(read.sequence)
                skipped_n += 1
                skipped_bp += len(read.sequence)

            # otherwise, examine/trim/truncate.
            else:    # med >= NORMALIZE LIMIT or not args.variable_coverage
                _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
                if trim_at >= K:
                    new_read = trim_record(read, trim_at)
                    write_record(new_read, trimfp)

                    written_reads += 1
                    written_bp += trim_at

                    if trim_at != len(read.sequence):
                        trimmed_reads += 1

        print('removing %s' % pass2filename, file=sys.stderr)
        os.unlink(pass2filename)

        # if we created our own output for this input, close it: writer
        # first (so a compression wrapper can flush), then the raw file.
        if raw_trimfp is not None:
            trimfp.close()
            raw_trimfp.close()

    # Finalize a compression wrapper around the user-supplied output; the
    # bare handle itself is left for its owner to close.
    if shared_trimfp is not None and shared_trimfp is not args.output:
        shared_trimfp.close()

    print('removing temp directory & contents (%s)' % tempdir,
          file=sys.stderr)
    shutil.rmtree(tempdir)

    # Guard every ratio: input with no usable reads must yield a zeroed
    # report, not a ZeroDivisionError.
    if n_reads:
        n_passes = 1.0 + (float(save_pass2_total) / n_reads)
        percent_reads_trimmed = \
            float(trimmed_reads + (n_reads - written_reads)) / \
            n_reads * 100.0
    else:
        n_passes = 1.0
        percent_reads_trimmed = 0.0

    if n_bp:
        percent_bp_removed = (1 - (written_bp / float(n_bp))) * 100.0
    else:
        percent_bp_removed = 0.0

    print('read %d reads, %d bp' % (n_reads, n_bp,), file=sys.stderr)
    print('wrote %d reads, %d bp' % (written_reads, written_bp,),
          file=sys.stderr)
    print('looked at %d reads twice (%.2f passes)' % (save_pass2_total,
                                                      n_passes),
          file=sys.stderr)
    print('removed %d reads and trimmed %d reads (%.2f%%)' %
          (n_reads - written_reads, trimmed_reads, percent_reads_trimmed),
          file=sys.stderr)
    print('trimmed or removed %.2f%% of bases (%d total)' %
          (percent_bp_removed, n_bp - written_bp),
          file=sys.stderr)

    if args.variable_coverage:
        if n_reads:
            percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
        else:
            percent_reads_hicov = 0.0
        print('%d reads were high coverage (%.2f%%);' % (n_reads - skipped_n,
                                                         percent_reads_hicov),
              file=sys.stderr)
        print('skipped %d reads/%d bases because of low coverage' %
              (skipped_n, skipped_bp),
              file=sys.stderr)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate),
          file=sys.stderr)

    print('output in *.abundtrim', file=sys.stderr)

    if args.savegraph:
        print("Saving k-mer countgraph to",
              args.savegraph, file=sys.stderr)
        ct.save(args.savegraph)
def main():
    """Report which mates of each query read pair share a k-mer with the refs.

    Loads every reference file into a ``khmer.Nodetable``, then reads the
    query as strictly paired reads.  For each mate a flag is computed: 1 when
    at least one of its k-mers (after replacing ``N`` with ``A``) is present
    in the table, 0 otherwise.  Pairs where at least one mate is flagged 1
    are printed to stdout in FASTA form with ``||uniq_<flag>`` appended to
    each header; pairs flagged ``[0, 0]`` and pairs containing a mate shorter
    than k are counted but not printed.  Summary counts go to stderr.

    Exits with status -1 when both query and refs are ``-`` (stdin) or when
    the estimated false-positive rate of the table exceeds 0.01.
    """
    parser = build_nodegraph_args("find uniq kmer in query compard to refs")
    parser.add_argument('query',
                        help=('fasta readfile to query against'
                              'hashtable, use "-" if from stdin'))
    parser.add_argument('ref', nargs='+',
                        help='fasta sequence file to be loaded in hashtable')
    # NOTE(review): --x2/--N2 are still accepted for command-line
    # compatibility but nothing in this function reads them.
    parser.add_argument('--x2', default='1e8',
                        help='max_table size for readfile2')
    parser.add_argument('--N2', default='4',
                        help='# of table (N) for readfile2')

    args = parser.parse_args()

    K = args.ksize
    HT_SIZE = args.max_tablesize
    N_HT = args.n_tables

    # positional
    query = args.query
    refs = args.ref
    print('{} refs to be loaded'.format(len(refs)), file=sys.stderr)
    if query == '-' and refs == ['-']:
        print('*** query and ref can not both be "-" (read from stdin)',
              file=sys.stderr)
        # stdin cannot feed both roles, so stop instead of carrying on.
        sys.exit(-1)

    # create a hashbits data structure
    start_time = time.time()
    ht = khmer.Nodetable(K, HT_SIZE, N_HT)
    end_time = time.time()
    secs = end_time - start_time
    mes = 'initiation of bloom filter took {:.2f} hours..'
    print(mes.format(secs / 3600.0), file=sys.stderr)

    for index, filename in enumerate(refs):
        if index != 0 and index % 100 == 0:
            end_time = time.time()
            secs = end_time - start_time
            mes = '{} refs have been loaded with in {:.2f} hours ..'
            print(mes.format(index, secs / 3600.0), file=sys.stderr)
        try:
            ht.consume_seqfile(filename)
        except OSError as e:
            # Best effort: report the unreadable reference and move on.
            mes = ('*** Skipping due to OSError (machine or system problem):'
                   ' {}\n'
                   '*** Detailed error message:\n'
                   '*** {}')
            print(mes.format(os.path.basename(filename), str(e)),
                  file=sys.stderr)

    # Change the 0.01 threshold only if you really grok it.
    # HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    mes = 'fp rate estimated to be {:1.3f}'
    print(mes.format(fp_rate), file=sys.stderr)

    if fp_rate > 0.01:
        mes = ('**\n'
               '** ERROR: the counting hash is too small for\n'
               '** refs. Increase hashsize/num ht.\n'
               '**\n'
               '** Do not use these results!!')
        # Emit the explanation before exiting; it used to be built and
        # then silently dropped.
        print(mes, file=sys.stderr)
        sys.exit(-1)

    n_unique1 = ht.n_unique_kmers()

    def _name_and_description(header):
        """Split a read header into (name, description); either may be ''."""
        fields = header.split(None, 1)
        fields += [''] * (2 - len(fields))
        return fields[0], fields[1]

    pair = 0
    forward = 0
    reverse = 0
    other = 0
    total_pair = 0
    # require_paired is an option of broken_paired_reader (not of the
    # parser); the loop below relies on both mates always being present.
    reader = broken_paired_reader(khmer.ReadParser(query),
                                  require_paired=True)
    for _, _, r1, r2 in reader:
        total_pair += 1
        share_list = []
        for record in (r1, r2):
            sequence = record.sequence.replace('N', 'A')
            seq_len = len(sequence)
            if seq_len < K:
                print('*** {} is shorter than {}..'.format(record.name, K),
                      file=sys.stderr)
                # No flag is recorded for a too-short mate, so the pair
                # falls through to the 'other' bucket below.
                continue
            shares_kmer = any(ht.get(sequence[i:i + K])
                              for i in range(seq_len + 1 - K))
            share_list.append(1 if shares_kmer else 0)

        if share_list == [1, 1]:
            pair += 1
        elif share_list == [1, 0]:
            forward += 1
        elif share_list == [0, 1]:
            reverse += 1
        else:  # [0, 0], or a mate was too short
            other += 1
            # do not print
            continue

        mes = ('>{} {}||uniq_{}\n{}\n'
               '>{} {}||uniq_{}\n{}')
        name1, desc1 = _name_and_description(r1.name)
        name2, desc2 = _name_and_description(r2.name)
        print(mes.format(name1, desc1, share_list[0], r1.sequence,
                         name2, desc2, share_list[1], r2.sequence))

    mes = ('Unique kmer in ref:\t{}\n'
           'Total pair:\t{}\n'
           'Both primers uniq:\t{}\n'
           'Pair with forward uniq:\t{}\n'
           'Pair with reverse uniq:\t{}')

    print(mes.format(n_unique1, total_pair, pair, forward, reverse),
          file=sys.stderr)