def normalize_by_median_and_check(input_filename, htable, single_output_file,
                                  fail_save, paired, force, norm,
                                  report_fp=None):
    total = 0
    discarded = 0

    total_acc = None
    discarded_acc = None

    if single_output_file:
        if single_output_file is sys.stdout:
            output_name = '/dev/stdout'
        else:
            output_name = single_output_file.name
        outfp = single_output_file
    else:
        output_name = os.path.basename(input_filename) + '.keep'
        outfp = open(output_name, 'w')

    with CatchIOErrors(input_filename, outfp, fail_save, htable, force, norm):
        for record in norm(input_filename, paired):
            write_record(record, outfp)

        total = norm.total
        discarded = norm.discarded
        if report_fp:
            print(str(total) + " " + str(total - discarded) + " " +
                  str(1. - (discarded / float(total))), file=report_fp)
            report_fp.flush()

    return norm.total, norm.discarded, norm.corrupt_files

def do_write(self, outfp):
    outq = self.outqueue
    while self.worker_count > 0 or not outq.empty():
        try:
            g = outq.get(True, 1)
        except queue.Empty:
            continue

        for name, seq, qual in g.seqlist:
            if qual:
                record = screed.Record(name=name, sequence=seq, quality=qual)
            else:
                record = screed.Record(name=name, sequence=seq)
            write_record(record, outfp)

    if self.verbose:
        print("DONE writing.\nprocessed %d / wrote %d / removed %d" %
              (self.n_processed, self.n_written,
               self.n_processed - self.n_written), file=sys.stderr)
        print("processed %d bp / wrote %d bp / removed %d bp" %
              (self.bp_processed, self.bp_written,
               self.bp_processed - self.bp_written), file=sys.stderr)
        discarded = self.bp_processed - self.bp_written
        f = float(discarded) / float(self.bp_processed) * 100
        print("discarded %.1f%%" % f, file=sys.stderr)

def main():
    counting_ht = sys.argv[1]
    infiles = sys.argv[2:]

    print('file with ht: %s' % counting_ht)
    print('making hashtable')
    ht = Countgraph.load(counting_ht)
    K = ht.ksize()

    for infile in infiles:
        print('filtering', infile)
        outfile = os.path.basename(infile) + '.below'
        outfp = open(outfile, 'w')

        paired_iter = broken_paired_reader(ReadParser(infile), min_length=K,
                                           force_single=True)
        for n, is_pair, read1, read2 in paired_iter:
            name = read1.name
            seq = read1.sequence
            if 'N' in seq:
                # skip reads with ambiguous bases
                continue

            trim_seq, trim_at = ht.trim_below_abundance(seq, CUTOFF)

            if trim_at >= K:
                write_record(screed.Record(name=name, sequence=trim_seq),
                             outfp)

def do_write(self, outfp):
    outq = self.outqueue
    while self.worker_count > 0 or not outq.empty():
        try:
            grouping = outq.get(True, 1)
        except queue.Empty:
            continue

        for name, seq, qual in grouping.seqlist:
            if qual:
                record = screed.Record(name=name, sequence=seq, quality=qual)
            else:
                record = screed.Record(name=name, sequence=seq)
            write_record(record, outfp)

    if self.verbose:
        print("DONE writing.\nprocessed %d / wrote %d / removed %d" %
              (self.n_processed, self.n_written,
               self.n_processed - self.n_written), file=sys.stderr)
        print("processed %d bp / wrote %d bp / removed %d bp" %
              (self.bp_processed, self.bp_written,
               self.bp_processed - self.bp_written), file=sys.stderr)
        discarded = self.bp_processed - self.bp_written
        percent = float(discarded) / float(self.bp_processed) * 100
        print("discarded %.1f%%" % percent, file=sys.stderr)

def main():
    args = sanitize_help(get_parser()).parse_args()
    print('fastq from ', args.input_sequence, file=sys.stderr)
    outfp = get_file_writer(args.output, args.gzip, args.bzip)

    n_count = 0
    for n, record in enumerate(screed.open(args.input_sequence)):
        if n % 10000 == 0:
            print('...', n, file=sys.stderr)

        sequence = record['sequence']

        if 'N' in sequence:
            if not args.n_keep:
                n_count += 1
                continue

        del record['quality']
        write_record(record, outfp)

    print('\n' + 'lines from ' + args.input_sequence, file=sys.stderr)

    if not args.n_keep:
        print(str(n_count) + ' lines dropped.', file=sys.stderr)
    else:
        print('No lines dropped from file.', file=sys.stderr)

    print('Wrote output to', describe_file_handle(args.output),
          file=sys.stderr)

def main(args):
    print('[kevlar::mutate] loading mutations', file=args.logfile)
    mutations = load_mutations(kevlar.open(args.mutations, 'r'), args.logfile)

    print('[kevlar::mutate] mutating genome', file=args.logfile)
    # open the output stream once, outside the loop
    outstream = kevlar.open(args.out, 'w')
    for record in mutate_genome(args.genome, mutations):
        write_record(record, outstream)

def main():
    args = sanitize_help(get_parser()).parse_args()
    outfp = get_file_writer(args.output, args.gzip, args.bzip)
    for filename in args.input_filenames:
        for record in screed.open(filename):
            if len(record['sequence']) >= args.length:
                write_record(record, outfp)
    print('wrote to: ' + args.output.name, file=sys.stderr)

def main():
    args = get_parser().parse_args()
    outfp = open(args.output, 'w')
    for filename in args.input_filenames:
        for record in screed.open(filename, parse_description=False):
            if len(record['sequence']) >= args.length:
                write_record(record, outfp)
    print >> sys.stderr, 'wrote to: ' + args.output

def main():
    args = sanitize_help(get_parser()).parse_args()
    configure_logging(args.quiet)

    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)
    if args.savegraph:
        tablesize = calculate_graphsize(args, "countgraph")
        check_space_for_graph(args.savegraph, tablesize, args.force)
    report_on_config(args)

    log_info("making countgraph")
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    log_info("consuming input, round 1 -- {datafile}", datafile=args.datafile)
    for _ in range(args.threads):
        cur_thread = threading.Thread(
            target=graph.consume_fasta_with_reads_parser, args=(rparser,))
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    log_info("Total number of unique k-mers: {nk}",
             nk=graph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    log_info("fp rate estimated to be {fpr:1.3f}", fpr=fp_rate)

    # the filtering loop
    log_info("filtering {datafile}", datafile=args.datafile)
    if args.outfile is None:
        outfile = os.path.basename(args.datafile) + ".abundfilt"
    else:
        outfile = args.outfile
    outfp = open(outfile, "wb")
    outfp = get_file_writer(outfp, args.gzip, args.bzip)

    paired_iter = broken_paired_reader(ReadParser(args.datafile),
                                       min_length=graph.ksize(),
                                       force_single=True)

    for n, is_pair, read1, read2 in paired_iter:
        assert not is_pair
        assert read2 is None

        trimmed_record, _ = trim_record(graph, read1, args.cutoff,
                                        args.variable_coverage,
                                        args.normalize_to)
        if trimmed_record:
            write_record(trimmed_record, outfp)

    log_info("output in {outfile}", outfile=outfile)

    if args.savegraph:
        log_info("Saving k-mer countgraph filename {graph}",
                 graph=args.savegraph)
        graph.save(args.savegraph)

def main(args):
    fastq = kevlar.open(args.out, 'w')
    refr = None
    if args.refr:
        print('[kevlar::dump] Loading reference sequence', file=args.logfile)
        refrstream = kevlar.open(args.refr, 'r')
        refr = kevlar.seqio.parse_seq_dict(refrstream)
    for read in dump(args.reads, refr, logstream=args.logfile):
        write_record(read, fastq)

def main():
    info('interleave-reads.py')
    args = get_parser().parse_args()

    for _ in args.infiles:
        check_file_status(_, args.force)

    check_space(args.infiles, args.force)

    s1_file = args.infiles[0]
    if len(args.infiles) == 2:
        s2_file = args.infiles[1]
    else:
        s2_file = s1_file.replace('_R1_', '_R2_')
        print >> sys.stderr, ("given only one file; "
                              "guessing that R2 file is %s" % s2_file)

    fail = False
    if not os.path.exists(s1_file):
        print >> sys.stderr, "Error! R1 file %s does not exist" % s1_file
        fail = True

    if not os.path.exists(s2_file):
        print >> sys.stderr, "Error! R2 file %s does not exist" % s2_file
        fail = True

    if fail and not args.force:
        sys.exit(1)

    print >> sys.stderr, "Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file)

    counter = 0
    for read1, read2 in itertools.izip(screed.open(s1_file),
                                       screed.open(s2_file)):
        if counter % 100000 == 0:
            print >> sys.stderr, '...', counter, 'pairs'
        counter += 1

        name1 = read1.name
        if not name1.endswith('/1'):
            name1 += '/1'

        name2 = read2.name
        if not name2.endswith('/2'):
            name2 += '/2'

        assert name1[:-2] == name2[:-2], \
            "This doesn't look like paired data! %s %s" % (name1, name2)

        read1.name = name1
        read2.name = name2
        write_record(read1, args.output)
        write_record(read2, args.output)

    print >> sys.stderr, 'final: interleaved %d pairs' % counter
    print >> sys.stderr, 'output written to', args.output

def main():
    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('filter-abund.py', ['counting'])

    configure_logging(args.quiet)

    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
       args.single_output_file:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    log_info('loading countgraph: {graph}', graph=args.input_graph)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    log_info("K: {ksize}", ksize=ksize)

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        log_info('filtering {infile}', infile=infile)
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        paired_iter = broken_paired_reader(ReadParser(infile),
                                           min_length=ksize,
                                           force_single=True)

        for n, is_pair, read1, read2 in paired_iter:
            assert not is_pair
            assert read2 is None

            trimmed_record, _ = trim_record(countgraph, read1, args.cutoff,
                                            args.variable_coverage,
                                            args.normalize_to)
            if trimmed_record:
                write_record(trimmed_record, outfp)

        log_info('output in {outfile}', outfile=outfile)

def pass1(self, reader, saver):
    """
    The first pass across the read data.

    It does the following:

    1. If do_normalize is set, discard all read pairs with coverage
       above DIGINORM_COVERAGE.

    2. For each remaining read pair, check if the read pair is above
       the coverage necessary for trimming (TRIM_AT_COVERAGE).  If so,
       k-mer trim the reads at CUTOFF, and yield them.

    3. If the read pair is not at the coverage necessary for trimming,
       consume the read pair with the graph and save the read pair for
       the second pass.
    """
    graph = self.graph
    TRIM_AT_COVERAGE = self.trim_at_coverage
    CUTOFF = self.cutoff
    DIGINORM_COVERAGE = self.diginorm_coverage
    K = graph.ksize()

    for n, is_pair, read1, read2 in reader:
        bundle = ReadBundle(read1, read2)

        # clean up the sequences for examination.
        self.n_reads += bundle.n_reads
        self.n_bp += bundle.n_bp

        min_coverage = min(bundle.coverages(graph))

        if self.do_normalize and min_coverage >= DIGINORM_COVERAGE:
            # skip reads if normalizing
            continue

        # trim?
        if min_coverage >= TRIM_AT_COVERAGE:
            for read, cleaned_read in bundle.both():
                record, did_trim = do_trim_read(graph, read, cleaned_read,
                                                CUTOFF)
                if did_trim:
                    self.trimmed_reads += 1
                if record:
                    yield record
        # no, too low coverage to trim; consume & set aside for 2nd pass.
        else:
            for read, cleaned_read in bundle.both():
                graph.consume(cleaned_read)
                write_record(read, saver)
                self.n_saved += 1

def pass1(self, reader, saver):
    """
    The first pass across the read data.

    It does the following:

    1. If do_normalize is set, discard all read pairs with coverage
       above DIGINORM_COVERAGE.

    2. For each remaining read pair, check if the read pair is above
       the coverage necessary for trimming (TRIM_AT_COVERAGE).  If so,
       k-mer trim the reads at CUTOFF, and yield them.

    3. If the read pair is not at the coverage necessary for trimming,
       consume the read pair with the graph and save the read pair for
       the second pass.
    """
    graph = self.graph
    TRIM_AT_COVERAGE = self.trim_at_coverage
    CUTOFF = self.cutoff
    DIGINORM_COVERAGE = self.diginorm_coverage
    K = graph.ksize()

    for n, is_pair, read1, read2 in reader:
        bundle = ReadBundle(read1, read2)

        # clean up the sequences for examination.
        self.n_reads += bundle.num_reads
        self.n_bp += bundle.total_length

        min_coverage = min(bundle.coverages(graph))

        if self.do_normalize and min_coverage >= DIGINORM_COVERAGE:
            # skip reads if normalizing
            continue

        # trim?
        if min_coverage >= TRIM_AT_COVERAGE:
            for read in bundle.reads:
                record, did_trim = trim_record(graph, read, CUTOFF)
                if did_trim:
                    self.trimmed_reads += 1
                if record:
                    yield record
        # no, too low coverage to trim; consume & set aside for 2nd pass.
        else:
            for read in bundle.reads:
                graph.consume(read.cleaned_seq)
                write_record(read, saver)
                self.n_saved += 1

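# The two pass1 variants above share one two-pass streaming design: reads
# whose estimated coverage is already high are abundance-trimmed and emitted
# immediately, while low-coverage reads are counted and set aside so they can
# be re-examined once the counter has seen the whole data set. Below is a
# minimal, self-contained sketch of that control flow, using an exact Counter
# in place of khmer's approximate countgraph; the diginorm branch is omitted,
# and every name and constant here is illustrative, not khmer's API.
from collections import Counter

K, CUTOFF, TRIM_AT_COVERAGE = 4, 1, 3
counts = Counter()

def kmers(seq):
    return [seq[i:i + K] for i in range(len(seq) - K + 1)]

def median_coverage(seq):
    cov = sorted(counts[k] for k in kmers(seq))
    return cov[len(cov) // 2]

def trim(seq):
    # keep the longest prefix whose k-mers all have count > CUTOFF
    for i, k in enumerate(kmers(seq)):
        if counts[k] <= CUTOFF:
            return seq[:i + K - 1]
    return seq

def pass1(reads, saved):
    for seq in reads:
        if median_coverage(seq) >= TRIM_AT_COVERAGE:
            yield trim(seq)            # high coverage: trim and emit now
        else:
            counts.update(kmers(seq))  # not saturated: count it...
            saved.append(seq)          # ...and defer to the second pass

def pass2(saved):
    for seq in saved:
        yield trim(seq)                # counts are now complete

saved = []
reads = ['ACGTACGTAC'] * 4 + ['ACGTTTTTTT']
out = list(pass1(reads, saved)) + list(pass2(saved))
print(out)  # the erroneous 'ACGTTTTTTT' tail is trimmed in pass 2
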
def main():
    args = sanitize_help(get_parser()).parse_args()
    configure_logging(args.quiet)

    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
       args.single_output_file:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    log_info('loading countgraph: {graph}', graph=args.input_graph)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    log_info("K: {ksize}", ksize=ksize)

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        log_info('filtering {infile}', infile=infile)
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        paired_iter = broken_paired_reader(ReadParser(infile),
                                           min_length=ksize,
                                           force_single=True)

        for n, is_pair, read1, read2 in paired_iter:
            assert not is_pair
            assert read2 is None

            trimmed_record, _ = trim_record(countgraph, read1, args.cutoff,
                                            args.variable_coverage,
                                            args.normalize_to)
            if trimmed_record:
                write_record(trimmed_record, outfp)

        log_info('output in {outfile}', outfile=outfile)

def process_unassigned(self, outfp=None):
    """
    Process unassigned reads, optionally writing them out if outfp is given.

    Also tallies counts per partition ID, which is needed for further
    processing.
    """
    with PartitionedReader(self.file_list) as reader:
        for read, pid in reader:
            self.count[pid] = self.count.get(pid, 0) + 1
            if pid == 0:
                self.n_unassigned += 1
                if outfp:
                    write_record(read, outfp)

def main():
    info('split-paired-reads.py')
    args = get_parser().parse_args()

    infile = args.infile
    check_file_status(infile, args.force)
    filenames = [infile]
    check_space(filenames, args.force)

    if args.output_directory:
        if not os.path.exists(args.output_directory):
            os.makedirs(args.output_directory)
        out1 = args.output_directory + '/' + os.path.basename(infile) + '.1'
        out2 = args.output_directory + '/' + os.path.basename(infile) + '.2'
    else:
        out1 = os.path.basename(infile) + '.1'
        out2 = os.path.basename(infile) + '.2'

    # OVERRIDE defaults with -1, -2
    if args.output_first:
        out1 = args.output_first
    if args.output_second:
        out2 = args.output_second

    fp_out1 = open(out1, 'w')
    fp_out2 = open(out2, 'w')

    counter1 = 0
    counter2 = 0
    index = None

    for index, record in enumerate(screed.open(infile)):
        if index % 100000 == 0 and index:
            print >> sys.stderr, '...', index

        name = record.name
        if name.endswith('/1'):
            write_record(record, fp_out1)
            counter1 += 1
        elif name.endswith('/2'):
            write_record(record, fp_out2)
            counter2 += 1

    print >> sys.stderr, "DONE; split %d sequences (%d left, %d right)" % \
        (index + 1, counter1, counter2)
    print >> sys.stderr, "/1 reads in %s" % out1
    print >> sys.stderr, "/2 reads in %s" % out2

def main():
    info('extract-paired-reads.py')
    args = get_parser().parse_args()

    check_input_files(args.infile, args.force)
    infiles = [args.infile]
    check_space(infiles, args.force)

    outfile = os.path.basename(args.infile)
    if len(sys.argv) > 2:
        outfile = sys.argv[2]

    single_fp = open(outfile + '.se', 'w')
    paired_fp = open(outfile + '.pe', 'w')

    print >>sys.stderr, 'reading file "%s"' % args.infile
    print >>sys.stderr, 'outputting interleaved pairs to "%s.pe"' % outfile
    print >>sys.stderr, 'outputting orphans to "%s.se"' % outfile

    n_pe = 0
    n_se = 0

    screed_iter = screed.open(args.infile, parse_description=False)
    for index, is_pair, read1, read2 in broken_paired_reader(screed_iter):
        if index % 100000 == 0 and index > 0:
            print >>sys.stderr, '...', index

        if is_pair:
            write_record_pair(read1, read2, paired_fp)
            n_pe += 1
        else:
            write_record(read1, single_fp)
            n_se += 1

    single_fp.close()
    paired_fp.close()

    if n_pe == 0:
        raise Exception("no paired reads!? check file formats...")

    print >>sys.stderr, 'DONE; read %d sequences,' \
        ' %d pairs and %d singletons' % \
        (n_pe * 2 + n_se, n_pe, n_se)

    print >> sys.stderr, 'wrote to: ' + outfile \
        + '.se' + ' and ' + outfile + '.pe'

def main():
    info('extract-paired-reads.py')
    args = get_parser().parse_args()

    check_file_status(args.infile, args.force)
    infiles = [args.infile]
    check_space(infiles, args.force)

    outfile = os.path.basename(args.infile)
    if len(sys.argv) > 2:
        outfile = sys.argv[2]

    single_fp = open(outfile + '.se', 'w')
    paired_fp = open(outfile + '.pe', 'w')

    print >> sys.stderr, 'reading file "%s"' % args.infile
    print >> sys.stderr, 'outputting interleaved pairs to "%s.pe"' % outfile
    print >> sys.stderr, 'outputting orphans to "%s.se"' % outfile

    n_pe = 0
    n_se = 0

    screed_iter = screed.open(args.infile, parse_description=False)
    for index, is_pair, read1, read2 in broken_paired_reader(screed_iter):
        if index % 100000 == 0 and index > 0:
            print >> sys.stderr, '...', index

        if is_pair:
            write_record_pair(read1, read2, paired_fp)
            n_pe += 1
        else:
            write_record(read1, single_fp)
            n_se += 1

    single_fp.close()
    paired_fp.close()

    if n_pe == 0:
        raise Exception("no paired reads!? check file formats...")

    print >>sys.stderr, 'DONE; read %d sequences,' \
        ' %d pairs and %d singletons' % \
        (n_pe * 2 + n_se, n_pe, n_se)

    print >> sys.stderr, 'wrote to: ' + outfile \
        + '.se' + ' and ' + outfile + '.pe'

def main():
    parser = build_nodegraph_args()
    parser.add_argument('-o', '--outfile',
                        help='output file; default is "infile".sweep2')
    parser.add_argument('-q', '--quiet')
    parser.add_argument('input_filename')
    parser.add_argument('read_filename')

    args = parser.parse_args()

    inp = args.input_filename
    readsfile = args.read_filename

    outfile = os.path.basename(readsfile) + '.sweep2'
    if args.outfile:
        outfile = args.outfile
    outfp = open(outfile, 'w')

    # create a countgraph data structure
    ht = khmer_args.create_countgraph(args)

    # load contigs, connect into N partitions
    print('loading input reads from', inp)
    ht.consume_seqfile(inp)

    print('starting sweep.')

    m = 0
    K = ht.ksize()
    instream = screed.open(readsfile)
    for n, is_pair, read1, read2 in broken_paired_reader(instream):
        if n % 10000 == 0:
            print('...', n, m)

        if is_pair:
            count1 = ht.get_median_count(read1.sequence)[0]
            count2 = ht.get_median_count(read2.sequence)[0]
            if count1 or count2:
                m += 1
                write_record_pair(read1, read2, outfp)
        else:
            count = ht.get_median_count(read1.sequence)[0]
            if count:
                m += 1
                write_record(read1, outfp)

def main():
    parser = build_nodegraph_args()
    parser.add_argument('-o', '--outfile',
                        help='output file; default is "infile".sweep2')
    parser.add_argument('-q', '--quiet')
    parser.add_argument('input_filename')
    parser.add_argument('read_filename')

    args = parser.parse_args()

    inp = args.input_filename
    readsfile = args.read_filename

    outfile = os.path.basename(readsfile) + '.sweep2'
    if args.outfile:
        outfile = args.outfile
    outfp = open(outfile, 'w')

    # create a countgraph data structure
    ht = khmer_args.create_countgraph(args)

    # load contigs, connect into N partitions
    print('loading input reads from', inp)
    ht.consume_fasta(inp)

    print('starting sweep.')

    m = 0
    K = ht.ksize()
    instream = screed.open(readsfile)
    for n, is_pair, read1, read2 in broken_paired_reader(instream):
        if n % 10000 == 0:
            print('...', n, m)

        if is_pair:
            count1 = ht.get_median_count(read1.sequence)[0]
            count2 = ht.get_median_count(read2.sequence)[0]
            if count1 or count2:
                m += 1
                write_record_pair(read1, read2, outfp)
        else:
            count = ht.get_median_count(read1.sequence)[0]
            if count:
                m += 1
                write_record(read1, outfp)

def main():
    info('unique-kmers.py', ['SeqAn', 'hll'])
    args = get_parser().parse_args()

    total_hll = khmer.HLLCounter(args.error_rate, args.ksize)

    report_fp = args.report
    input_filename = None
    for index, input_filename in enumerate(args.input_filenames):
        hllcpp = khmer.HLLCounter(args.error_rate, args.ksize)
        for record in screed.open(input_filename):
            seq = record.sequence.upper().replace('N', 'A')
            hllcpp.consume_string(seq)
            if args.stream_out:
                write_record(record, sys.stdout)

        cardinality = hllcpp.estimate_cardinality()
        print('Estimated number of unique {0}-mers in {1}: {2}'.format(
            args.ksize, input_filename, cardinality), file=sys.stderr)

        if report_fp:
            print(cardinality, args.ksize, '(total)', file=report_fp)
            report_fp.flush()
        total_hll.merge(hllcpp)

    cardinality = total_hll.estimate_cardinality()
    print('Total estimated number of unique {0}-mers: {1}'.format(
        args.ksize, cardinality), file=sys.stderr)

    to_print = graphsize_args_report(cardinality, args.error_rate)
    if args.diagnostics:
        print(to_print, file=sys.stderr)

    if report_fp:
        print(cardinality, args.ksize, 'total', file=report_fp)
        print(to_print, file=report_fp)
        report_fp.flush()

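# A hedged usage sketch of the HLLCounter calls exercised in main() above
# (a constructor taking an error rate and k, consume_string, merge, and
# estimate_cardinality). The parameters and toy sequences here are made up
# for illustration; they are not from any khmer example.
import khmer

hll_a = khmer.HLLCounter(0.01, 20)   # error rate, ksize
hll_b = khmer.HLLCounter(0.01, 20)
hll_a.consume_string('GATTACA' * 10)
hll_b.consume_string('ACGTACGTACGTACGTACGTACGT')
hll_a.merge(hll_b)                   # combine per-file estimates
print(hll_a.estimate_cardinality())  # approximate distinct 20-mer count
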
def main():
    info('correct-reads.py', ['streaming'])
    args = sanitize_help(get_parser()).parse_args()

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        print("Error: Cannot input the same filename multiple times.",
              file=sys.stderr)
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    tablesize = calculate_graphsize(args, 'countgraph')
    if args.savegraph:
        check_space_for_graph(args.savegraph, tablesize, args.force)

    K = args.ksize
    CUTOFF = args.cutoff
    NORMALIZE_LIMIT = args.normalize_to

    if args.loadgraph:
        print('loading k-mer countgraph from', args.loadgraph,
              file=sys.stderr)
        ct = Countgraph.load(args.loadgraph)
    else:
        print('making k-mer countgraph', file=sys.stderr)
        ct = create_countgraph(args, multiplier=8 / (9. + 0.3))

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print('created temporary directory %s; use -T to change location' %
          tempdir, file=sys.stderr)

    aligner = khmer.ReadAligner(ct, args.cutoff, args.bits_theta)

    # ### FIRST PASS ###

    save_pass2_total = 0

    n_bp = 0
    n_reads = 0
    written_bp = 0
    written_reads = 0
    corrected_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        if args.out is None:
            corrfp = open(os.path.basename(filename) + '.corr', 'w')
        else:
            corrfp = args.out

        pass2list.append((filename, pass2filename, corrfp))

        screed_iter = screed.open(filename, parse_description=False)
        pass2fp = open(pass2filename, 'w')

        save_pass2 = 0
        n = 0

        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=args.ignore_pairs)
        for n, is_pair, read1, read2 in paired_iter:
            if n % 10000 == 0:
                print('...', n, filename, save_pass2, n_reads, n_bp,
                      written_reads, written_bp, file=sys.stderr)

            # we want to track paired reads here, to make sure that pairs
            # are not split between first pass and second pass.

            if is_pair:
                n_reads += 2
                n_bp += len(read1.sequence) + len(read2.sequence)

                seq1 = read1.sequence.replace('N', 'A')
                seq2 = read2.sequence.replace('N', 'A')

                med1, _, _ = ct.get_median_count(seq1)
                med2, _, _ = ct.get_median_count(seq2)

                if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT:
                    ct.consume(seq1)
                    ct.consume(seq2)
                    write_record_pair(read1, read2, pass2fp)
                    save_pass2 += 2
                else:
                    is_aligned, new_seq1 = correct_sequence(aligner, seq1)
                    if is_aligned:
                        if new_seq1 != read1.sequence:
                            corrected_reads += 1
                        read1.sequence = new_seq1
                        if hasattr(read1, 'quality'):
                            fix_quality(read1)

                    is_aligned, new_seq2 = correct_sequence(aligner, seq2)
                    if is_aligned:
                        if new_seq2 != read2.sequence:
                            corrected_reads += 1
                        read2.sequence = new_seq2
                        if hasattr(read2, 'quality'):
                            fix_quality(read2)

                    write_record_pair(read1, read2, corrfp)
                    written_reads += 2
                    written_bp += len(read1)
                    written_bp += len(read2)
            else:
                n_reads += 1
                n_bp += len(read1.sequence)

                seq = read1.sequence.replace('N', 'A')

                med, _, _ = ct.get_median_count(seq)

                # has this portion of the graph saturated? if not,
                # consume & save => pass2.
                if med < NORMALIZE_LIMIT:
                    ct.consume(seq)
                    write_record(read1, pass2fp)
                    save_pass2 += 1
                else:                       # trim!!
                    is_aligned, new_seq = correct_sequence(aligner, seq)
                    if is_aligned:
                        if new_seq != read1.sequence:
                            corrected_reads += 1
                        read1.sequence = new_seq
                        if hasattr(read1, 'quality'):
                            fix_quality(read1)

                        write_record(read1, corrfp)

                        written_reads += 1
                        written_bp += len(new_seq)

        pass2fp.close()

        print('%s: kept aside %d of %d from first pass, in %s' %
              (filename, save_pass2, n, filename), file=sys.stderr)
        save_pass2_total += save_pass2

    # ### SECOND PASS. ###

    skipped_n = 0
    skipped_bp = 0
    for _, pass2filename, corrfp in pass2list:
        print('second pass: looking at sequences kept aside in %s' %
              pass2filename, file=sys.stderr)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.

        for n, read in enumerate(
                screed.open(pass2filename, parse_description=False)):
            if n % 10000 == 0:
                print('... x 2', n, pass2filename, written_reads,
                      written_bp, file=sys.stderr)

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ct.get_median_count(seq)

            # do we retain low-abundance components unchanged?
            if med < NORMALIZE_LIMIT and args.variable_coverage:
                write_record(read, corrfp)
                written_reads += 1
                written_bp += len(read.sequence)
                skipped_n += 1
                skipped_bp += len(read.sequence)

            # otherwise, examine/correct.
            else:    # med >= NORMALIZE LIMIT or not args.variable_coverage
                is_aligned, new_seq = correct_sequence(aligner, seq)
                if is_aligned:
                    if new_seq != read.sequence:
                        corrected_reads += 1
                    read.sequence = new_seq
                    if hasattr(read, 'quality'):
                        fix_quality(read)

                    write_record(read, corrfp)

                    written_reads += 1
                    written_bp += len(new_seq)

        print('removing %s' % pass2filename, file=sys.stderr)
        os.unlink(pass2filename)

    print('removing temp directory & contents (%s)' % tempdir,
          file=sys.stderr)
    shutil.rmtree(tempdir)

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_corrected = float(corrected_reads +
                                    (n_reads - written_reads)) /\
        n_reads * 100.0

    print('read %d reads, %d bp' % (n_reads, n_bp,), file=sys.stderr)
    print('wrote %d reads, %d bp' % (written_reads, written_bp,),
          file=sys.stderr)
    print('looked at %d reads twice (%.2f passes)' %
          (save_pass2_total, n_passes), file=sys.stderr)
    print('removed %d reads and corrected %d reads (%.2f%%)' %
          (n_reads - written_reads, corrected_reads,
           percent_reads_corrected), file=sys.stderr)
    print('removed %.2f%% of bases (%d total)' %
          ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp),
          file=sys.stderr)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
        print('%d reads were high coverage (%.2f%%);' %
              (n_reads - skipped_n, percent_reads_hicov), file=sys.stderr)
        print('skipped %d reads/%d bases because of low coverage' %
              (skipped_n, skipped_bp), file=sys.stderr)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975

    print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate),
          file=sys.stderr)

    print('output in *.corr', file=sys.stderr)

    if args.savegraph:
        print("Saving k-mer countgraph to", args.savegraph, file=sys.stderr)
        ct.save(args.savegraph)

def main():
    args = sanitize_help(get_parser()).parse_args()

    infile = args.infile
    filenames = [infile]
    check_input_files(infile, args.force)
    check_space(filenames, args.force)

    basename = os.path.basename(infile)

    # decide where to put output files - specific directory? or just default?
    if infile in ('/dev/stdin', '-'):
        # seqan only treats '-' as "read from stdin"
        infile = '-'
        if not (args.output_first and args.output_second):
            print("Accepting input from stdin; "
                  "output filenames must be provided.", file=sys.stderr)
            sys.exit(1)
    elif args.output_directory:
        if not os.path.exists(args.output_directory):
            os.makedirs(args.output_directory)
        out1 = os.path.join(args.output_directory, basename + '.1')
        out2 = os.path.join(args.output_directory, basename + '.2')
    else:
        out1 = basename + '.1'
        out2 = basename + '.2'

    # OVERRIDE output file locations with -1, -2
    if args.output_first:
        fp_out1 = get_file_writer(args.output_first, args.gzip, args.bzip)
        out1 = fp_out1.name
    else:
        # Use default filename created above
        fp_out1 = get_file_writer(open(out1, 'wb'), args.gzip, args.bzip)
    if args.output_second:
        fp_out2 = get_file_writer(args.output_second, args.gzip, args.bzip)
        out2 = fp_out2.name
    else:
        # Use default filename created above
        fp_out2 = get_file_writer(open(out2, 'wb'), args.gzip, args.bzip)

    # put orphaned reads here, if -0!
    if args.output_orphaned:
        fp_out0 = get_file_writer(args.output_orphaned, args.gzip, args.bzip)
        out0 = describe_file_handle(args.output_orphaned)

    counter1 = 0
    counter2 = 0
    counter3 = 0
    index = None

    # walk through all the reads in broken-paired mode.
    paired_iter = broken_paired_reader(
        ReadParser(infile), require_paired=not args.output_orphaned)

    try:
        for index, is_pair, record1, record2 in paired_iter:
            if index % 10000 == 0:
                print('...', index, file=sys.stderr)

            if is_pair:
                write_record(record1, fp_out1)
                counter1 += 1
                write_record(record2, fp_out2)
                counter2 += 1
            elif args.output_orphaned:
                write_record(record1, fp_out0)
                counter3 += 1
    except UnpairedReadsError as e:
        print("Unpaired reads found starting at {name}; exiting".format(
            name=e.read1.name), file=sys.stderr)
        sys.exit(1)

    print("DONE; split %d sequences (%d left, %d right, %d orphans)" %
          (counter1 + counter2, counter1, counter2, counter3),
          file=sys.stderr)
    print("/1 reads in %s" % out1, file=sys.stderr)
    print("/2 reads in %s" % out2, file=sys.stderr)
    if args.output_orphaned:
        print("orphans in %s" % out0, file=sys.stderr)

def main():
    info('trim-low-abund.py', ['streaming'])
    parser = get_parser()
    args = parser.parse_args()

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        print >>sys.stderr, \
            "Error: Cannot input the same filename multiple times."
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savetable:
        check_space_for_hashtable(
            args.n_tables * args.min_tablesize, args.force)

    K = args.ksize
    CUTOFF = args.cutoff
    NORMALIZE_LIMIT = args.normalize_to

    if args.loadtable:
        print >>sys.stderr, 'loading k-mer counting table from', \
            args.loadtable
        ct = khmer.load_counting_hash(args.loadtable)
    else:
        print >>sys.stderr, 'making k-mer counting table'
        ct = khmer.new_counting_hash(K, args.min_tablesize, args.n_tables)

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print >>sys.stderr, 'created temporary directory %s; ' \
        'use -T to change location' % tempdir

    # ### FIRST PASS ###

    save_pass2_total = 0

    n_bp = 0
    n_reads = 0
    written_bp = 0
    written_reads = 0
    trimmed_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        trimfilename = os.path.basename(filename) + '.abundtrim'

        pass2list.append((filename, pass2filename, trimfilename))

        screed_iter = screed.open(filename, parse_description=False)
        pass2fp = open(pass2filename, 'w')
        trimfp = open(trimfilename, 'w')

        save_pass2 = 0
        n = 0

        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=args.ignore_pairs)
        for n, is_pair, read1, read2 in paired_iter:
            if n % 10000 == 0:
                print >>sys.stderr, '...', n, filename, save_pass2, \
                    n_reads, n_bp, written_reads, written_bp

            # we want to track paired reads here, to make sure that pairs
            # are not split between first pass and second pass.

            if is_pair:
                n_reads += 2
                n_bp += len(read1.sequence) + len(read2.sequence)

                seq1 = read1.sequence.replace('N', 'A')
                seq2 = read2.sequence.replace('N', 'A')

                med1, _, _ = ct.get_median_count(seq1)
                med2, _, _ = ct.get_median_count(seq2)

                if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT:
                    ct.consume(seq1)
                    ct.consume(seq2)
                    write_record_pair(read1, read2, pass2fp)
                    save_pass2 += 2
                else:
                    _, trim_at1 = ct.trim_on_abundance(seq1, CUTOFF)
                    _, trim_at2 = ct.trim_on_abundance(seq2, CUTOFF)

                    if trim_at1 >= K:
                        read1 = trim_record(read1, trim_at1)

                    if trim_at2 >= K:
                        read2 = trim_record(read2, trim_at2)

                    if trim_at1 != len(seq1):
                        trimmed_reads += 1
                    if trim_at2 != len(seq2):
                        trimmed_reads += 1

                    write_record_pair(read1, read2, trimfp)
                    written_reads += 2
                    written_bp += trim_at1 + trim_at2
            else:
                n_reads += 1
                n_bp += len(read1.sequence)

                seq = read1.sequence.replace('N', 'A')

                med, _, _ = ct.get_median_count(seq)

                # has this portion of the graph saturated? if not,
                # consume & save => pass2.
                if med < NORMALIZE_LIMIT:
                    ct.consume(seq)
                    write_record(read1, pass2fp)
                    save_pass2 += 1
                else:                       # trim!!
                    _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
                    if trim_at >= K:
                        new_read = trim_record(read1, trim_at)
                        write_record(new_read, trimfp)

                        written_reads += 1
                        written_bp += trim_at

                        if trim_at != len(read1.sequence):
                            trimmed_reads += 1

        pass2fp.close()
        trimfp.close()

        print '%s: kept aside %d of %d from first pass, in %s' % \
            (filename, save_pass2, n, filename)
        save_pass2_total += save_pass2

    # ### SECOND PASS. ###

    skipped_n = 0
    skipped_bp = 0
    for _, pass2filename, trimfilename in pass2list:
        print 'second pass: looking at sequences kept aside in %s' % \
            pass2filename

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.

        trimfp = open(trimfilename, 'a')
        for n, read in enumerate(screed.open(pass2filename,
                                             parse_description=False)):
            if n % 10000 == 0:
                print >>sys.stderr, '... x 2', n, pass2filename, \
                    written_reads, written_bp

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ct.get_median_count(seq)

            # do we retain low-abundance components unchanged?
            if med < NORMALIZE_LIMIT and args.variable_coverage:
                write_record(read, trimfp)
                written_reads += 1
                written_bp += len(read.sequence)
                skipped_n += 1
                skipped_bp += len(read.sequence)

            # otherwise, examine/trim/truncate.
            else:    # med >= NORMALIZE LIMIT or not args.variable_coverage
                _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
                if trim_at >= K:
                    new_read = trim_record(read, trim_at)
                    write_record(new_read, trimfp)

                    written_reads += 1
                    written_bp += trim_at

                    if trim_at != len(read.sequence):
                        trimmed_reads += 1

        print >>sys.stderr, 'removing %s' % pass2filename
        os.unlink(pass2filename)

    print >>sys.stderr, 'removing temp directory & contents (%s)' % tempdir
    shutil.rmtree(tempdir)

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\
        n_reads * 100.0

    print 'read %d reads, %d bp' % (n_reads, n_bp,)
    print 'wrote %d reads, %d bp' % (written_reads, written_bp,)
    print 'looked at %d reads twice (%.2f passes)' % (save_pass2_total,
                                                      n_passes)
    print 'removed %d reads and trimmed %d reads (%.2f%%)' % \
        (n_reads - written_reads, trimmed_reads, percent_reads_trimmed)
    print 'trimmed or removed %.2f%% of bases (%d total)' % \
        ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
        print '%d reads were high coverage (%.2f%%);' % (n_reads - skipped_n,
                                                         percent_reads_hicov)
        print 'skipped %d reads/%d bases because of low coverage' % \
            (skipped_n, skipped_bp)

    fp_rate = khmer.calc_expected_collisions(ct)
    print >>sys.stderr, \
        'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    if fp_rate > MAX_FALSE_POSITIVE_RATE:
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the k-mer counting table is too"
                              " small for this data set.  Increase"
                              " tablesize/# tables.")
        print >> sys.stderr, "**"
        print >> sys.stderr, "** Do not use these results!!"
        sys.exit(1)

    print 'output in *.abundtrim'

    if args.savetable:
        print >>sys.stderr, "Saving k-mer counting table to", args.savetable
        ct.save(args.savetable)

def main():
    info('sample-reads-randomly.py')
    args = get_parser().parse_args()

    for _ in args.filenames:
        check_input_files(_, args.force)

    # seed the random number generator?
    if args.random_seed:
        random.seed(args.random_seed)

    # bound n_samples
    num_samples = max(args.num_samples, 1)

    #
    # Figure out what the output filename is going to be
    #

    if args.output_file:
        output_filename = args.output_file.name
        if num_samples > 1:
            sys.stderr.write(
                "Error: cannot specify -o with more than one sample.")
            if not args.force:
                print("NOTE: This can be overridden using the --force"
                      " argument", file=sys.stderr)
                sys.exit(1)
    else:
        filename = args.filenames[0]
        if filename in ('/dev/stdin', '-'):
            print("Accepting input from stdin; output filename must "
                  "be provided with '-o'.", file=sys.stderr)
            sys.exit(1)
        output_filename = os.path.basename(filename) + '.subset'

    if num_samples == 1:
        print('Subsampling %d reads using reservoir sampling.' %
              args.num_reads, file=sys.stderr)
        print('Subsampled reads will be placed in %s' %
              output_filename, file=sys.stderr)
        print('', file=sys.stderr)
    else:  # > 1
        print('Subsampling %d reads, %d times,' %
              (args.num_reads, num_samples), ' using reservoir sampling.',
              file=sys.stderr)
        print('Subsampled reads will be placed in %s.N' %
              output_filename, file=sys.stderr)
        print('', file=sys.stderr)

    reads = []
    for n in range(num_samples):
        reads.append([])

    # read through all the sequences and load/resample the reservoir
    for filename in args.filenames:
        print('opening', filename, 'for reading', file=sys.stderr)

        screed_iter = screed.open(filename)

        for count, (_, ispair, rcrd1, rcrd2) in enumerate(
                broken_paired_reader(screed_iter,
                                     force_single=args.force_single)):
            if count % 10000 == 0:
                print('...', count, 'reads scanned', file=sys.stderr)
                if count >= args.max_reads:
                    print('reached upper limit of %d reads' %
                          args.max_reads, '(see -M); exiting',
                          file=sys.stderr)
                    break

            # collect first N reads
            if count < args.num_reads:
                for n in range(num_samples):
                    reads[n].append((rcrd1, rcrd2))
            else:
                assert len(reads[n]) <= count

                # use reservoir sampling to replace reads at random
                # see http://en.wikipedia.org/wiki/Reservoir_sampling
                for n in range(num_samples):
                    guess = random.randint(1, count)
                    if guess <= args.num_reads:
                        reads[n][guess - 1] = (rcrd1, rcrd2)

    # output all the subsampled reads:
    if len(reads) == 1:
        print('Writing %d sequences to %s' %
              (len(reads[0]), output_filename), file=sys.stderr)

        output_file = args.output_file
        if not output_file:
            output_file = open(output_filename, 'wb')

        output_file = get_file_writer(output_file, args.gzip, args.bzip)

        for records in reads[0]:
            write_record(records[0], output_file)
            if records[1] is not None:
                write_record(records[1], output_file)
    else:
        for n in range(num_samples):
            n_filename = output_filename + '.%d' % n
            print('Writing %d sequences to %s' %
                  (len(reads[n]), n_filename), file=sys.stderr)
            output_file = get_file_writer(open(n_filename, 'wb'),
                                          args.gzip, args.bzip)
            for records in reads[n]:
                write_record(records[0], output_file)
                if records[1] is not None:
                    write_record(records[1], output_file)

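# The sampling loop above is reservoir sampling (Algorithm R), run once per
# sample. A minimal standalone version for reference, with hypothetical
# names: keep the first num_reads items, then replace a random reservoir
# slot with probability num_reads / count.
import random

def reservoir_sample(stream, num_reads, seed=None):
    rng = random.Random(seed)
    reservoir = []
    for count, item in enumerate(stream, start=1):
        if count <= num_reads:
            reservoir.append(item)        # fill the reservoir first
        else:
            guess = rng.randint(1, count)
            if guess <= num_reads:        # probability num_reads / count
                reservoir[guess - 1] = item
    return reservoir

print(reservoir_sample(range(100000), 5, seed=42))
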
def main():
    args = sanitize_help(get_parser()).parse_args()
    infile = args.infile
    check_input_files(infile, args.force)
    check_space([infile], args.force)

    # decide where to put output files - specific directory? or just default?
    if infile in ('/dev/stdin', '-'):
        # seqan only treats '-' as "read from stdin"
        infile = '-'
        if not (args.output_paired and args.output_single):
            print("Accepting input from stdin; output filenames must be "
                  "provided.", file=sys.stderr)
            sys.exit(1)
    elif args.output_dir:
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        out1 = args.output_dir + '/' + os.path.basename(infile) + '.se'
        out2 = args.output_dir + '/' + os.path.basename(infile) + '.pe'
    else:
        out1 = os.path.basename(infile) + '.se'
        out2 = os.path.basename(infile) + '.pe'

    # OVERRIDE default output file locations with -p, -s
    if args.output_paired:
        paired_fp = get_file_writer(args.output_paired, args.gzip, args.bzip)
        out2 = paired_fp.name
    else:
        # Don't override, just open the default filename from above
        paired_fp = get_file_writer(open(out2, 'wb'), args.gzip, args.bzip)
    if args.output_single:
        single_fp = get_file_writer(args.output_single, args.gzip, args.bzip)
        out1 = args.output_single.name
    else:
        # Don't override, just open the default filename from above
        single_fp = get_file_writer(open(out1, 'wb'), args.gzip, args.bzip)

    print('reading file "%s"' % infile, file=sys.stderr)
    print('outputting interleaved pairs to "%s"' % out2, file=sys.stderr)
    print('outputting orphans to "%s"' % out1, file=sys.stderr)

    n_pe = 0
    n_se = 0

    screed_iter = ReadParser(infile)
    for index, is_pair, read1, read2 in broken_paired_reader(screed_iter):
        if index % 100000 == 0 and index > 0:
            print('...', index, file=sys.stderr)
        if is_pair:
            write_record_pair(read1, read2, paired_fp)
            n_pe += 1
        else:
            write_record(read1, single_fp)
            n_se += 1

    single_fp.close()
    paired_fp.close()

    if n_pe == 0:
        raise Exception("no paired reads!? check file formats...")

    print('DONE; read %d sequences,'
          ' %d pairs and %d singletons' %
          (n_pe * 2 + n_se, n_pe, n_se), file=sys.stderr)

    print('wrote to: %s and %s' % (out2, out1), file=sys.stderr)

def main():  # pylint: disable=too-many-branches,too-many-statements
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    configure_logging(args.quiet)
    report_on_config(args)

    report_fp = args.report
    force_single = args.force_single

    # check for similar filenames
    # if we're using a single output file only check for identical filenames
    # otherwise, check for identical BASE names as well.
    filenames = []
    basenames = []
    for pathfilename in args.input_filenames:
        filenames.append(pathfilename)
        if args.single_output_file:
            continue  # nothing more to worry about

        basename = os.path.basename(pathfilename)
        if basename in basenames:
            log_error('ERROR: Duplicate filename--Cannot handle this!')
            log_error('** Exiting!')
            sys.exit(1)

        basenames.append(basename)

    # check that files exist and there is sufficient output disk space.
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph is not None:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    # load or create counting table.
    if args.loadgraph:
        log_info('loading k-mer countgraph from {graph}',
                 graph=args.loadgraph)
        countgraph = khmer.load_countgraph(args.loadgraph)
    else:
        log_info('making countgraph')
        countgraph = khmer_args.create_countgraph(args)

    # create an object to handle diginorm of all files
    norm = Normalizer(args.cutoff, countgraph)
    with_diagnostics = WithDiagnostics(norm, report_fp, args.report_frequency)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = []
    for element in filenames:
        files.append([element, args.paired])
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    corrupt_files = []
    outfp = None
    output_name = None

    if args.single_output_file:
        outfp = get_file_writer(args.single_output_file, args.gzip,
                                args.bzip)
    else:
        if '-' in filenames or '/dev/stdin' in filenames:
            print("Accepting input from stdin; output filename must "
                  "be provided with '-o'.", file=sys.stderr)
            sys.exit(1)

    #
    # main loop: iterate over all files given, do diginorm.
    #

    for filename, require_paired in files:
        if not args.single_output_file:
            output_name = os.path.basename(filename) + '.keep'
            outfp = open(output_name, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        # failsafe context manager in case an input file breaks
        with catch_io_errors(filename, outfp, args.single_output_file,
                             args.force, corrupt_files):
            screed_iter = clean_input_reads(screed.open(filename))
            reader = broken_paired_reader(screed_iter, min_length=args.ksize,
                                          force_single=force_single,
                                          require_paired=require_paired)

            # actually do diginorm
            for record in with_diagnostics(reader, filename):
                if record is not None:
                    write_record(record, outfp)

            log_info('output in {name}', name=describe_file_handle(outfp))
            if not args.single_output_file:
                outfp.close()

    # finished - print out some diagnostics.

    log_info('Total number of unique k-mers: {umers}',
             umers=countgraph.n_unique_kmers())

    if args.savegraph is not None:
        log_info('...saving to {name}', name=args.savegraph)
        countgraph.save(args.savegraph)

    fp_rate = \
        khmer.calc_expected_collisions(countgraph, False, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975

    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    if args.force and len(corrupt_files) > 0:
        log_error("** WARNING: Finished with errors!")
        log_error("** I/O Errors occurred in the following files:")
        log_error("\t" + " ".join(corrupt_files))

def normalize_by_median(input_filename, outfp, htable, args, report_fp=None):
    desired_coverage = args.cutoff
    ksize = htable.ksize()

    # In paired mode we read two records at a time
    batch_size = 1
    if args.paired:
        batch_size = 2

    index = -1
    total = 0
    discarded = 0
    for index, batch in enumerate(
            batchwise(screed.open(input_filename, parse_description=False),
                      batch_size)):
        if index > 0 and index % 100000 == 0:
            print >>sys.stderr, '... kept {kept} of {total} or'\
                ' {perc:2}%'.format(kept=total - discarded, total=total,
                                    perc=int(100. - discarded /
                                             float(total) * 100.))
            print >> sys.stderr, '... in file', input_filename

            if report_fp:
                print >> report_fp, total, total - discarded, \
                    1. - (discarded / float(total))
                report_fp.flush()

        total += batch_size

        # If in paired mode, check that the reads are properly interleaved
        if args.paired:
            if not check_is_pair(batch[0], batch[1]):
                raise IOError('Error: Improperly interleaved pairs \
{b0} {b1}'.format(b0=batch[0].name, b1=batch[1].name))

        # Emit the batch of reads if any read passes the filter
        # and all reads are longer than K
        passed_filter = False
        passed_length = True
        for record in batch:
            if len(record.sequence) < ksize:
                passed_length = False
                continue

            seq = record.sequence.replace('N', 'A')
            med, _, _ = htable.get_median_count(seq)

            if med < desired_coverage:
                htable.consume(seq)
                passed_filter = True

        # Emit records if any passed
        if passed_length and passed_filter:
            for record in batch:
                write_record(record, outfp)
        else:
            discarded += batch_size

    if report_fp:
        print >> report_fp, total, total - discarded, \
            1. - (discarded / float(total))
        report_fp.flush()

    return total, discarded

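# The keep/discard rule implemented by normalize_by_median() above, reduced
# to its core with an exact Counter standing in for khmer's approximate
# counting table: keep a read only while the median count of its k-mers is
# below the desired coverage, and only kept reads feed the counter. K,
# DESIRED_COVERAGE, and the toy reads here are illustrative.
from collections import Counter

K, DESIRED_COVERAGE = 4, 3
counts = Counter()

def keep(seq):
    kmers = [seq[i:i + K] for i in range(len(seq) - K + 1)]
    med = sorted(counts[k] for k in kmers)[len(kmers) // 2]
    if med < DESIRED_COVERAGE:
        counts.update(kmers)    # only kept reads feed the counter
        return True
    return False

reads = ['ACGTACGTAC'] * 6
print([keep(r) for r in reads])  # first two kept, the rest discarded
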
#!/usr/bin/env python
from __future__ import print_function
from khmer.utils import write_record
import screed
import sys

mutations = {
    0: (42681, 10),
}

for n, record in enumerate(screed.open(sys.argv[1])):
    if n in mutations:
        start, dellength = mutations[n]
        seqlength = len(record.sequence)
        piece1 = record.sequence[:start]
        piece2 = record.sequence[start + dellength:]
        record.sequence = piece1 + piece2
        print('DEBUG ', piece1[-9:], '|', piece2[:9], sep='',
              file=sys.stderr)
        assert len(record.sequence) == seqlength - dellength
    write_record(record, sys.stdout)

def main():
    parser = sanitize_help(get_parser())
    args = parser.parse_args()
    if not args.quiet:
        info('trim-low-abund.py', ['streaming'])

    configure_logging(args.quiet)

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        log_error("Error: Cannot input the same filename multiple times.")
        sys.exit(1)

    if args.trim_at_coverage != DEFAULT_TRIM_AT_COVERAGE and \
       not args.variable_coverage:
        log_error("Error: --trim-at-coverage/-Z given, but "
                  "--variable-coverage/-V not specified.")
        sys.exit(1)

    if args.diginorm_coverage != DEFAULT_DIGINORM_COVERAGE and \
       not args.diginorm:
        log_error("Error: --diginorm-coverage given, but "
                  "--diginorm not specified.")
        sys.exit(1)

    if args.diginorm and args.single_pass:
        log_error("Error: --diginorm and --single-pass are incompatible!\n"
                  "You probably want to use normalize-by-median.py instead.")
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    if ('-' in args.input_filenames or
            '/dev/stdin' in args.input_filenames) and not args.output:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    if args.loadgraph:
        log_info('loading countgraph from {graph}', graph=args.loadgraph)
        ct = khmer.load_countgraph(args.loadgraph)
    else:
        log_info('making countgraph')
        ct = khmer_args.create_countgraph(args)

    K = ct.ksize()

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    log_info('created temporary directory {temp};\n'
             'use -T to change location', temp=tempdir)

    trimmer = Trimmer(ct, not args.variable_coverage, args.cutoff,
                      args.trim_at_coverage)
    if args.diginorm:
        trimmer.set_diginorm(args.diginorm_coverage)

    # ### FIRST PASS ###

    save_pass2_total = 0

    written_bp = 0
    written_reads = 0

    # only create the file writer once if outfp is specified; otherwise,
    # create it for each file.
    if args.output:
        trimfp = get_file_writer(args.output, args.gzip, args.bzip)

    pass2list = []
    for filename in args.input_filenames:
        # figure out temporary filename for 2nd pass
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        pass2fp = open(pass2filename, 'w')

        # construct output filenames
        if args.output is None:
            # note: this will be saved in trimfp.
            outfp = open(os.path.basename(filename) + '.abundtrim', 'wb')

            # get file handle w/gzip, bzip
            trimfp = get_file_writer(outfp, args.gzip, args.bzip)

        # record all this info
        pass2list.append((filename, pass2filename, trimfp))

        # input file stuff: get a broken_paired reader.
        screed_iter = screed.open(filename)
        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=args.ignore_pairs)

        # main loop through the file.
        n_start = trimmer.n_reads
        save_start = trimmer.n_saved

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass1(paired_iter, pass2fp):
            if (trimmer.n_reads - n_start) > watermark:
                log_info("... {filename} {n_saved} {n_reads} {n_bp} "
                         "{w_reads} {w_bp}", filename=filename,
                         n_saved=trimmer.n_saved, n_reads=trimmer.n_reads,
                         n_bp=trimmer.n_bp, w_reads=written_reads,
                         w_bp=written_bp)
                watermark += REPORT_EVERY_N_READS

            # write out the trimmed/etc sequences that AREN'T going to be
            # revisited in a 2nd pass.
            write_record(read, trimfp)
            written_bp += len(read)
            written_reads += 1
        pass2fp.close()

        log_info("{filename}: kept aside {kept} of {total} from first pass",
                 filename=filename, kept=trimmer.n_saved - save_start,
                 total=trimmer.n_reads - n_start)

    # first pass goes across all the data, so record relevant stats...
    n_reads = trimmer.n_reads
    n_bp = trimmer.n_bp
    n_skipped = trimmer.n_skipped
    bp_skipped = trimmer.bp_skipped
    save_pass2_total = trimmer.n_saved

    # ### SECOND PASS. ###

    # nothing should have been skipped yet!
    assert trimmer.n_skipped == 0
    assert trimmer.bp_skipped == 0

    if args.single_pass:
        pass2list = []

    # go back through all the files again.
    for _, pass2filename, trimfp in pass2list:
        log_info('second pass: looking at sequences kept aside in {pass2}',
                 pass2=pass2filename)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.  Hence, force_single=True below.

        screed_iter = screed.open(pass2filename, parse_description=False)
        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=True)

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass2(paired_iter):
            if (trimmer.n_reads - n_start) > watermark:
                log_info('... x 2 {a} {b} {c} {d} {e} {f} {g}',
                         a=trimmer.n_reads - n_start,
                         b=pass2filename, c=trimmer.n_saved,
                         d=trimmer.n_reads, e=trimmer.n_bp,
                         f=written_reads, g=written_bp)
                watermark += REPORT_EVERY_N_READS

            write_record(read, trimfp)
            written_reads += 1
            written_bp += len(read)

        log_info('removing {pass2}', pass2=pass2filename)
        os.unlink(pass2filename)

        # if we created our own trimfps, close 'em.
        if not args.output:
            trimfp.close()

    log_info('removing temp directory & contents ({temp})', temp=tempdir)
    shutil.rmtree(tempdir)

    trimmed_reads = trimmer.trimmed_reads

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\
        n_reads * 100.0

    log_info('read {read} reads, {bp} bp', read=n_reads, bp=n_bp)
    log_info('wrote {wr} reads, {wbp} bp', wr=written_reads, wbp=written_bp)
    log_info('looked at {st} reads twice ({np:.2f} passes)',
             st=save_pass2_total, np=n_passes)
    log_info('removed {r} reads and trimmed {t} reads ({p:.2f}%)',
             r=n_reads - written_reads, t=trimmed_reads,
             p=percent_reads_trimmed)
    log_info('trimmed or removed {p:.2f}%% of bases ({bp} total)',
             p=(1 - (written_bp / float(n_bp))) * 100.0,
             bp=n_bp - written_bp)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - n_skipped) / n_reads
        log_info('{n} reads were high coverage ({p:.2f}%);',
                 n=n_reads - n_skipped, p=percent_reads_hicov)
        log_info('skipped {r} reads/{bp} bases because of low coverage',
                 r=n_skipped, bp=bp_skipped)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975

    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    log_info('output in *.abundtrim')

    if args.savegraph:
        log_info("Saving k-mer countgraph to {graph}", graph=args.savegraph)
        ct.save(args.savegraph)

def main():
    info('correct-reads.py', ['streaming'])
    parser = get_parser()
    args = parser.parse_args()

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        print >>sys.stderr, \
            "Error: Cannot input the same filename multiple times."
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph:
        check_space_for_graph(
            args.n_tables * args.min_tablesize, args.force)

    K = args.ksize
    CUTOFF = args.cutoff
    NORMALIZE_LIMIT = args.normalize_to

    if args.loadgraph:
        print >>sys.stderr, 'loading k-mer countgraph from', args.loadgraph
        ct = khmer.load_countgraph(args.loadgraph)
    else:
        print >>sys.stderr, 'making k-mer countgraph'
        ct = khmer.new_countgraph(K, args.min_tablesize, args.n_tables)

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print >>sys.stderr, 'created temporary directory %s; ' \
        'use -T to change location' % tempdir

    aligner = khmer.ReadAligner(ct, args.cutoff, args.bits_theta)

    # ### FIRST PASS ###

    save_pass2_total = 0

    n_bp = 0
    n_reads = 0
    written_bp = 0
    written_reads = 0
    corrected_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        if args.out is None:
            corrfp = open(os.path.basename(filename) + '.corr', 'w')
        else:
            corrfp = args.out

        pass2list.append((filename, pass2filename, corrfp))

        screed_iter = screed.open(filename, parse_description=False)
        pass2fp = open(pass2filename, 'w')

        save_pass2 = 0
        n = 0

        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=args.ignore_pairs)
        for n, is_pair, read1, read2 in paired_iter:
            if n % 10000 == 0:
                print >>sys.stderr, '...', n, filename, save_pass2, \
                    n_reads, n_bp, written_reads, written_bp

            # we want to track paired reads here, to make sure that pairs
            # are not split between first pass and second pass.

            if is_pair:
                n_reads += 2
                n_bp += len(read1.sequence) + len(read2.sequence)

                seq1 = read1.sequence.replace('N', 'A')
                seq2 = read2.sequence.replace('N', 'A')

                med1, _, _ = ct.get_median_count(seq1)
                med2, _, _ = ct.get_median_count(seq2)

                if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT:
                    ct.consume(seq1)
                    ct.consume(seq2)
                    write_record_pair(read1, read2, pass2fp)
                    save_pass2 += 2
                else:
                    is_aligned, new_seq1 = correct_sequence(aligner, seq1)
                    if is_aligned:
                        if new_seq1 != read1.sequence:
                            corrected_reads += 1
                        read1.sequence = new_seq1
                        if hasattr(read1, 'quality'):
                            fix_quality(read1)

                    is_aligned, new_seq2 = correct_sequence(aligner, seq2)
                    if is_aligned:
                        if new_seq2 != read2.sequence:
                            corrected_reads += 1
                        read2.sequence = new_seq2
                        if hasattr(read2, 'quality'):
                            fix_quality(read2)

                    write_record_pair(read1, read2, corrfp)
                    written_reads += 2
                    written_bp += len(read1)
                    written_bp += len(read2)
            else:
                n_reads += 1
                n_bp += len(read1.sequence)

                seq = read1.sequence.replace('N', 'A')

                med, _, _ = ct.get_median_count(seq)

                # has this portion of the graph saturated? if not,
                # consume & save => pass2.
                if med < NORMALIZE_LIMIT:
                    ct.consume(seq)
                    write_record(read1, pass2fp)
                    save_pass2 += 1
                else:                       # trim!!
                    is_aligned, new_seq = correct_sequence(aligner, seq)
                    if is_aligned:
                        if new_seq != read1.sequence:
                            corrected_reads += 1
                        read1.sequence = new_seq
                        if hasattr(read1, 'quality'):
                            fix_quality(read1)

                        write_record(read1, corrfp)

                        written_reads += 1
                        written_bp += len(new_seq)

        pass2fp.close()

        print >>sys.stderr, '%s: kept aside %d of %d from first pass, in %s' \
            % (filename, save_pass2, n, filename)
        save_pass2_total += save_pass2

    # ### SECOND PASS. ###

    skipped_n = 0
    skipped_bp = 0
    for _, pass2filename, corrfp in pass2list:
        print >>sys.stderr, ('second pass: looking at sequences kept aside '
                             'in %s') % pass2filename

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.

        for n, read in enumerate(screed.open(pass2filename,
                                             parse_description=False)):
            if n % 10000 == 0:
                print >>sys.stderr, '... x 2', n, pass2filename, \
                    written_reads, written_bp

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ct.get_median_count(seq)

            # do we retain low-abundance components unchanged?
            if med < NORMALIZE_LIMIT and args.variable_coverage:
                write_record(read, corrfp)
                written_reads += 1
                written_bp += len(read.sequence)
                skipped_n += 1
                skipped_bp += len(read.sequence)

            # otherwise, examine/correct.
            else:    # med >= NORMALIZE LIMIT or not args.variable_coverage
                is_aligned, new_seq = correct_sequence(aligner, seq)
                if is_aligned:
                    if new_seq != read.sequence:
                        corrected_reads += 1
                    read.sequence = new_seq
                    if hasattr(read, 'quality'):
                        fix_quality(read)

                    write_record(read, corrfp)

                    written_reads += 1
                    written_bp += len(new_seq)

        print >>sys.stderr, 'removing %s' % pass2filename
        os.unlink(pass2filename)

    print >>sys.stderr, 'removing temp directory & contents (%s)' % tempdir
    shutil.rmtree(tempdir)

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_corrected = float(corrected_reads +
                                    (n_reads - written_reads)) /\
        n_reads * 100.0

    print >>sys.stderr, 'read %d reads, %d bp' % (n_reads, n_bp,)
    print >>sys.stderr, 'wrote %d reads, %d bp' % (written_reads,
                                                   written_bp,)
    print >>sys.stderr, 'looked at %d reads twice (%.2f passes)' % \
        (save_pass2_total, n_passes)
    print >>sys.stderr, 'removed %d reads and corrected %d reads (%.2f%%)' % \
        (n_reads - written_reads, corrected_reads, percent_reads_corrected)
    print >>sys.stderr, 'removed %.2f%% of bases (%d total)' % \
        ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
        print >>sys.stderr, '%d reads were high coverage (%.2f%%);' % \
            (n_reads - skipped_n, percent_reads_hicov)
        print >>sys.stderr, ('skipped %d reads/%d bases because of low'
                             ' coverage') % (skipped_n, skipped_bp)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975

    print >>sys.stderr, \
        'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    print >>sys.stderr, 'output in *.corr'

    if args.savegraph:
        print >>sys.stderr, "Saving k-mer countgraph to", args.savegraph
        ct.save(args.savegraph)

def main(): info('split-paired-reads.py') args = get_parser().parse_args() infile = args.infile check_input_files(infile, args.force) filenames = [infile] check_space(filenames, args.force) # decide where to put output files - specific directory? or just default? if args.output_directory: if not os.path.exists(args.output_directory): os.makedirs(args.output_directory) out1 = args.output_directory + '/' + os.path.basename(infile) + '.1' out2 = args.output_directory + '/' + os.path.basename(infile) + '.2' else: out1 = os.path.basename(infile) + '.1' out2 = os.path.basename(infile) + '.2' # OVERRIDE output file locations with -1, -2 if args.output_first: out1 = args.output_first if args.output_second: out2 = args.output_second fp_out1 = open(out1, 'w') fp_out2 = open(out2, 'w') counter1 = 0 counter2 = 0 index = None screed_iter = screed.open(infile, parse_description=False) # walk through all the reads in broken-paired mode. for index, is_pair, record1, record2 in broken_paired_reader(screed_iter): if index % 100000 == 0 and index: print >> sys.stderr, '...', index # are we requiring pairs? if args.force_paired and not is_pair: print >>sys.stderr, 'ERROR, %s is not part of a pair' % \ record1.name sys.exit(1) if is_pair: write_record(record1, fp_out1) counter1 += 1 write_record(record2, fp_out2) counter2 += 1 else: name = record1.name if check_is_left(name): write_record(record1, fp_out1) counter1 += 1 elif check_is_right(name): write_record(record1, fp_out2) counter2 += 1 else: print >>sys.stderr, \ "Unrecognized format for read pair information: %s" % name print >>sys.stderr, "Exiting." sys.exit(1) print >> sys.stderr, "DONE; split %d sequences (%d left, %d right)" % \ (counter1 + counter2, counter1, counter2) print >> sys.stderr, "/1 reads in %s" % out1 print >> sys.stderr, "/2 reads in %s" % out2
def main(): # pylint: disable=too-many-branches,too-many-statements info('normalize-by-median.py', ['diginorm']) args = get_parser().parse_args() report_on_config(args) report_fp = args.report force_single = args.force_single # check for similar filenames # if we're using a single output file only check for identical filenames # otherwise, check for identical BASE names as well. filenames = [] basenames = [] for pathfilename in args.input_filenames: filenames.append(pathfilename) if args.single_output_file: continue # nothing more to worry about basename = os.path.basename(pathfilename) if basename in basenames: print('ERROR: Duplicate filename--Cannot handle this!', file=sys.stderr) print('** Exiting!', file=sys.stderr) sys.exit(1) basenames.append(basename) # check that files exist and there is sufficient output disk space. check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savetable: check_space_for_hashtable(args, 'countgraph', args.force) # load or create counting table. if args.loadtable: print('loading k-mer counting table from ' + args.loadtable, file=sys.stderr) htable = khmer.load_counting_hash(args.loadtable) else: print('making countgraph', file=sys.stderr) htable = khmer_args.create_countgraph(args) input_filename = None # create an object to handle diginorm of all files norm = Normalizer(args.cutoff, htable) # make a list of all filenames and if they're paired or not; # if we don't know if they're paired, default to allowing but not # forcing pairing. files = [] for e in filenames: files.append([e, args.paired]) if args.unpaired_reads: files.append([args.unpaired_reads, False]) corrupt_files = [] outfp = None output_name = None if args.single_output_file: if args.single_output_file is sys.stdout: output_name = '/dev/stdout' else: output_name = args.single_output_file.name outfp = args.single_output_file # # main loop: iterate over all files given, do diginorm. # for filename, require_paired in files: if not args.single_output_file: output_name = os.path.basename(filename) + '.keep' outfp = open(output_name, 'w') # failsafe context manager in case an input file breaks with CatchIOErrors(filename, outfp, args.single_output_file, args.force, corrupt_files): screed_iter = screed.open(filename, parse_description=False) reader = broken_paired_reader(screed_iter, min_length=args.ksize, force_single=force_single, require_paired=require_paired) # actually do diginorm for record in WithDiagnostics(filename, norm, reader, report_fp): if record is not None: write_record(record, outfp) print('output in ' + output_name, file=sys.stderr) if output_name != '/dev/stdout': outfp.close() # finished - print out some diagnostics. print('Total number of unique k-mers: {0}' .format(htable.n_unique_kmers()), file=sys.stderr) if args.savetable: print('...saving to ' + args.savetable, file=sys.stderr) htable.save(args.savetable) fp_rate = \ khmer.calc_expected_collisions(htable, args.force, max_false_pos=.8) # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975 print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate), file=sys.stderr) if args.force and len(corrupt_files) > 0: print("** WARNING: Finished with errors!", file=sys.stderr) print("** I/O Errors occurred in the following files:", file=sys.stderr) print("\t", " ".join(corrupt_files), file=sys.stderr)
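# Minimal sketch of what a CatchIOErrors-style failsafe context manager has
# to do, inferred from the call sites above; the real implementation lives
# elsewhere in the codebase, so the signature and cleanup details here are
# assumptions rather than the actual code.
import sys


class CatchIOErrorsSketch(object):

    def __init__(self, filename, outfp, single_outfp, force, corrupt_files):
        self.filename = filename
        self.outfp = outfp
        self.single_outfp = single_outfp
        self.force = force
        self.corrupt_files = corrupt_files

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        if exc_type is not None and issubclass(exc_type, IOError):
            sys.stderr.write('** I/O error while reading %s\n' % self.filename)
            if not self.force:
                return False  # propagate the error: stop the whole run
            self.corrupt_files.append(self.filename)
            if not self.single_outfp:
                self.outfp.close()  # abandon this file's partial output
            return True  # swallow the error and move on to the next file
        return False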
def main(): args = sanitize_help(get_parser()).parse_args() configure_logging(args.quiet) check_input_files(args.datafile, args.force) check_space([args.datafile], args.force) if args.savegraph: tablesize = calculate_graphsize(args, 'countgraph') check_space_for_graph(args.savegraph, tablesize, args.force) report_on_config(args) log_info('making countgraph') graph = khmer_args.create_countgraph(args) # first, load reads into graph rparser = khmer.ReadParser(args.datafile) threads = [] log_info('consuming input, round 1 -- {datafile}', datafile=args.datafile) for _ in range(args.threads): cur_thread = \ threading.Thread( target=graph.consume_fasta_with_reads_parser, args=(rparser, ) ) threads.append(cur_thread) cur_thread.start() for _ in threads: _.join() log_info('Total number of unique k-mers: {nk}', nk=graph.n_unique_kmers()) fp_rate = khmer.calc_expected_collisions(graph, args.force) log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate) # the filtering loop log_info('filtering {datafile}', datafile=args.datafile) if args.outfile is None: outfile = os.path.basename(args.datafile) + '.abundfilt' else: outfile = args.outfile outfp = open(outfile, 'wb') outfp = get_file_writer(outfp, args.gzip, args.bzip) paired_iter = broken_paired_reader(ReadParser(args.datafile), min_length=graph.ksize(), force_single=True) for n, is_pair, read1, read2 in paired_iter: assert not is_pair assert read2 is None trimmed_record, _ = trim_record(graph, read1, args.cutoff, args.variable_coverage, args.normalize_to) if trimmed_record: write_record(trimmed_record, outfp) log_info('output in {outfile}', outfile=outfile) if args.savegraph: log_info('Saving k-mer countgraph to {graph}', graph=args.savegraph) graph.save(args.savegraph)
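# trim_record() is not defined in this excerpt and its signature varies
# between revisions (the graph-aware form above vs. the plain
# trim_record(read, trim_at) form used by the older trim-low-abund.py
# further down). A sketch of the simple form -- truncate the sequence, and
# the quality string if present, at the first low-abundance k-mer; the
# details are assumptions:
import screed


def trim_record_sketch(read, trim_at):
    fields = dict(name=read.name, sequence=read.sequence[:trim_at])
    if hasattr(read, 'quality'):
        fields['quality'] = read.quality[:trim_at]  # keep FASTQ in sync
    return screed.Record(**fields)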
def main(): parser = sanitize_help(get_parser()) args = parser.parse_args() if not args.quiet: info('trim-low-abund.py', ['streaming']) configure_logging(args.quiet) ### if len(set(args.input_filenames)) != len(args.input_filenames): log_error("Error: Cannot input the same filename multiple times.") sys.exit(1) if args.trim_at_coverage != DEFAULT_TRIM_AT_COVERAGE and \ not args.variable_coverage: log_error("Error: --trim-at-coverage/-Z given, but " "--variable-coverage/-V not specified.") sys.exit(1) if args.diginorm_coverage != DEFAULT_DIGINORM_COVERAGE and \ not args.diginorm: log_error("Error: --diginorm-coverage given, but " "--diginorm not specified.") sys.exit(1) if args.diginorm and args.single_pass: log_error("Error: --diginorm and --single-pass are incompatible!\n" "You probably want to use normalize-by-median.py instead.") sys.exit(1) ### report_on_config(args) check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savegraph: graphsize = calculate_graphsize(args, 'countgraph') check_space_for_graph(args.savegraph, graphsize, args.force) if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \ and not args.output: log_error("Accepting input from stdin; output filename must " "be provided with -o.") sys.exit(1) if args.loadgraph: log_info('loading countgraph from {graph}', graph=args.loadgraph) ct = khmer.load_countgraph(args.loadgraph) else: log_info('making countgraph') ct = khmer_args.create_countgraph(args) K = ct.ksize() tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir) log_info('created temporary directory {temp};\n' 'use -T to change location', temp=tempdir) trimmer = Trimmer(ct, not args.variable_coverage, args.cutoff, args.trim_at_coverage) if args.diginorm: trimmer.set_diginorm(args.diginorm_coverage) # ### FIRST PASS ### save_pass2_total = 0 written_bp = 0 written_reads = 0 # only create the file writer once if outfp is specified; otherwise, # create it for each file. if args.output: trimfp = get_file_writer(args.output, args.gzip, args.bzip) pass2list = [] for filename in args.input_filenames: # figure out temporary filename for 2nd pass pass2filename = os.path.basename(filename) + '.pass2' pass2filename = os.path.join(tempdir, pass2filename) pass2fp = open(pass2filename, 'w') # construct output filenames if args.output is None: # note: this will be saved in trimfp. outfp = open(os.path.basename(filename) + '.abundtrim', 'wb') # get file handle w/gzip, bzip trimfp = get_file_writer(outfp, args.gzip, args.bzip) # record all this info pass2list.append((filename, pass2filename, trimfp)) # input file stuff: get a broken_paired reader. screed_iter = screed.open(filename) paired_iter = broken_paired_reader(screed_iter, min_length=K, force_single=args.ignore_pairs) # main loop through the file. n_start = trimmer.n_reads save_start = trimmer.n_saved watermark = REPORT_EVERY_N_READS for read in trimmer.pass1(paired_iter, pass2fp): if (trimmer.n_reads - n_start) > watermark: log_info("... {filename} {n_saved} {n_reads} {n_bp} " "{w_reads} {w_bp}", filename=filename, n_saved=trimmer.n_saved, n_reads=trimmer.n_reads, n_bp=trimmer.n_bp, w_reads=written_reads, w_bp=written_bp) watermark += REPORT_EVERY_N_READS # write out the trimmed/etc sequences that AREN'T going to be # revisited in a 2nd pass. 
write_record(read, trimfp) written_bp += len(read) written_reads += 1 pass2fp.close() log_info("{filename}: kept aside {kept} of {total} from first pass", filename=filename, kept=trimmer.n_saved - save_start, total=trimmer.n_reads - n_start) # first pass goes across all the data, so record relevant stats... n_reads = trimmer.n_reads n_bp = trimmer.n_bp n_skipped = trimmer.n_skipped bp_skipped = trimmer.bp_skipped save_pass2_total = trimmer.n_saved # ### SECOND PASS. ### # nothing should have been skipped yet! assert trimmer.n_skipped == 0 assert trimmer.bp_skipped == 0 if args.single_pass: pass2list = [] # go back through all the files again. for _, pass2filename, trimfp in pass2list: log_info('second pass: looking at sequences kept aside in {pass2}', pass2=pass2filename) # note that for this second pass, we don't care about paired # reads - they will be output in the same order they're read in, # so pairs will stay together if not orphaned. This is in contrast # to the first loop. Hence, force_single=True below. screed_iter = screed.open(pass2filename, parse_description=False) paired_iter = broken_paired_reader(screed_iter, min_length=K, force_single=True) watermark = REPORT_EVERY_N_READS for read in trimmer.pass2(paired_iter): if (trimmer.n_reads - n_start) > watermark: log_info('... x 2 {a} {b} {c} {d} {e} {f} {g}', a=trimmer.n_reads - n_start, b=pass2filename, c=trimmer.n_saved, d=trimmer.n_reads, e=trimmer.n_bp, f=written_reads, g=written_bp) watermark += REPORT_EVERY_N_READS write_record(read, trimfp) written_reads += 1 written_bp += len(read) log_info('removing {pass2}', pass2=pass2filename) os.unlink(pass2filename) # if we created our own trimfps, close 'em. if not args.output: trimfp.close() log_info('removing temp directory & contents ({temp})', temp=tempdir) shutil.rmtree(tempdir) trimmed_reads = trimmer.trimmed_reads n_passes = 1.0 + (float(save_pass2_total) / n_reads) percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\ n_reads * 100.0 log_info('read {read} reads, {bp} bp', read=n_reads, bp=n_bp) log_info('wrote {wr} reads, {wbp} bp', wr=written_reads, wbp=written_bp) log_info('looked at {st} reads twice ({np:.2f} passes)', st=save_pass2_total, np=n_passes) log_info('removed {r} reads and trimmed {t} reads ({p:.2f}%)', r=n_reads - written_reads, t=trimmed_reads, p=percent_reads_trimmed) log_info('trimmed or removed {p:.2f}% of bases ({bp} total)', p=(1 - (written_bp / float(n_bp))) * 100.0, bp=n_bp - written_bp) if args.variable_coverage: percent_reads_hicov = 100.0 * float(n_reads - n_skipped) / n_reads log_info('{n} reads were high coverage ({p:.2f}%);', n=n_reads - n_skipped, p=percent_reads_hicov) log_info('skipped {r} reads/{bp} bases because of low coverage', r=n_skipped, bp=bp_skipped) fp_rate = \ khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8) # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975 log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate) log_info('output in *.abundtrim') if args.savegraph: log_info("Saving k-mer countgraph to {graph}", graph=args.savegraph) ct.save(args.savegraph)
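# Every pair-aware loop in these scripts consumes broken_paired_reader();
# a compact sketch of its contract, inferred from the call sites (the real
# one also enforces min_length and require_paired): it yields
# (index, is_pair, read1, read2), with read2 = None for singletons, pairing
# consecutive records whose names match. The pair-name test is an assumption.
def base_name_sketch(name):
    # assumption: strip an old-style '/1' or '/2' suffix for pair matching
    return name[:-2] if name[-2:] in ('/1', '/2') else name


def broken_paired_reader_sketch(reads, force_single=False):
    prev = None
    n = 0
    for read in reads:
        if force_single:
            yield n, False, read, None
            n += 1
        elif prev is None:
            prev = read
        elif base_name_sketch(prev.name) == base_name_sketch(read.name):
            yield n, True, prev, read  # a proper pair: emit together
            prev = None
            n += 1
        else:
            yield n, False, prev, None  # orphan: emit alone, keep looking
            prev = read
            n += 1
    if prev is not None:
        yield n, False, prev, None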
def main(): info('sample-reads-randomly.py') args = get_parser().parse_args() for _ in args.filenames: check_file_status(_, args.force) check_space(args.filenames, args.force) # seed the random number generator? if args.random_seed: random.seed(args.random_seed) # bound n_samples num_samples = max(args.num_samples, 1) # # Figure out what the output filename is going to be # output_file = args.output_file if output_file: if num_samples > 1: sys.stderr.write( "Error: cannot specify -o with more than one sample.") if not args.force: sys.exit(1) output_filename = output_file.name else: filename = args.filenames[0] output_filename = os.path.basename(filename) + '.subset' if num_samples == 1: print >>sys.stderr, 'Subsampling %d reads using reservoir sampling.' %\ args.num_reads print >>sys.stderr, 'Subsampled reads will be placed in %s' % \ output_filename print >> sys.stderr, '' else: # > 1 print >>sys.stderr, 'Subsampling %d reads, %d times,' \ % (args.num_reads, num_samples), ' using reservoir sampling.' print >>sys.stderr, 'Subsampled reads will be placed in %s.N' \ % output_filename print >> sys.stderr, '' reads = [] for n in range(num_samples): reads.append([]) total = 0 # read through all the sequences and load/resample the reservoir for filename in args.filenames: print >> sys.stderr, 'opening', filename, 'for reading' for record in screed.open(filename, parse_description=False): total += 1 if total % 10000 == 0: print >> sys.stderr, '...', total, 'reads scanned' if total >= args.max_reads: print >>sys.stderr, 'reached upper limit of %d reads' % \ args.max_reads, '(see -M); exiting' break # collect first N reads if total <= args.num_reads: for n in range(num_samples): reads[n].append(record) else: # use reservoir sampling to replace reads at random # see http://en.wikipedia.org/wiki/Reservoir_sampling for n in range(num_samples): guess = random.randint(1, total) if guess <= args.num_reads: reads[n][guess - 1] = record # output all the subsampled reads: if len(reads) == 1: print >>sys.stderr, 'Writing %d sequences to %s' % \ (len(reads[0]), output_filename) if not output_file: output_file = open(output_filename, 'w') for record in reads[0]: write_record(record, output_file) else: for n in range(num_samples): n_filename = output_filename + '.%d' % n print >>sys.stderr, 'Writing %d sequences to %s' % \ (len(reads[n]), n_filename) output_file = open(n_filename, 'w') for record in reads[n]: write_record(record, output_file)
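# Stand-alone demonstration of the reservoir-sampling update used above
# (Algorithm R): keep the first N items; after that, each new item replaces
# a kept one with probability N/total. Pure Python, no khmer required.
import random


def reservoir_sample(iterable, n, rng=random):
    reservoir = []
    for total, item in enumerate(iterable, start=1):
        if total <= n:
            reservoir.append(item)
        else:
            guess = rng.randint(1, total)  # uniform over 1..total
            if guess <= n:                 # true with probability n/total
                reservoir[guess - 1] = item
    return reservoir

# e.g. reservoir_sample(range(100000), 100) returns 100 items, each input
# item ending up in the sample with equal probability.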
def main(): # pylint: disable=too-many-locals,too-many-branches info('extract-partitions.py', ['graph']) args = get_parser().parse_args() distfilename = args.prefix + '.dist' n_unassigned = 0 for infile in args.part_filenames: check_file_status(infile, args.force) check_space(args.part_filenames, args.force) print >>sys.stderr, '---' print >>sys.stderr, 'reading partitioned files:', repr(args.part_filenames) if args.output_groups: print >>sys.stderr, 'outputting to files named "%s.groupN.fa"' % \ args.prefix print >>sys.stderr, 'min reads to keep a partition:', \ args.min_part_size print >>sys.stderr, 'max size of a group file:', args.max_size else: print >>sys.stderr, 'NOT outputting groups! Beware!' if args.output_unassigned: print >>sys.stderr, \ 'outputting unassigned reads to "%s.unassigned.fa"' % \ args.prefix print >>sys.stderr, 'partition size distribution will go to %s' \ % distfilename print >>sys.stderr, '---' # suffix = 'fa' is_fastq = False for index, read, pid in read_partition_file(args.part_filenames[0]): if hasattr(read, 'accuracy'): suffix = 'fq' is_fastq = True break for filename in args.part_filenames: for index, read, pid in read_partition_file(filename): if is_fastq: assert hasattr(read, 'accuracy'), \ "all input files must be FASTQ if the first one is" else: assert not hasattr(read, 'accuracy'), \ "all input files must be FASTA if the first one is" break if args.output_unassigned: unassigned_fp = open('%s.unassigned.%s' % (args.prefix, suffix), 'w') count = {} for filename in args.part_filenames: for index, read, pid in read_partition_file(filename): if index % 100000 == 0: print >>sys.stderr, '...', index count[pid] = count.get(pid, 0) + 1 if pid == 0: n_unassigned += 1 if args.output_unassigned: write_record(read, unassigned_fp) if args.output_unassigned: unassigned_fp.close() if 0 in count: # eliminate unpartitioned sequences del count[0] # develop histogram of partition sizes dist = {} for pid, size in count.items(): dist[size] = dist.get(size, 0) + 1 # output histogram distfp = open(distfilename, 'w') total = 0 wtotal = 0 for counter, index in sorted(dist.items()): total += index wtotal += counter * index distfp.write('%d %d %d %d\n' % (counter, index, total, wtotal)) distfp.close() if not args.output_groups: sys.exit(0) # sort groups by size divvy = sorted(count.items(), key=lambda y: y[1]) divvy = [y for y in divvy if y[1] > args.min_part_size] # divvy up into different groups, based on having max_size sequences # in each group. total = 0 group = set() group_n = 0 group_d = {} for partition_id, n_reads in divvy: group.add(partition_id) total += n_reads if total > args.max_size: for partition_id in group: group_d[partition_id] = group_n # print 'group_d', partition_id, group_n group_n += 1 group = set() total = 0 if group: for partition_id in group: group_d[partition_id] = group_n # print 'group_d', partition_id, group_n group_n += 1 print >>sys.stderr, '%d groups' % group_n if group_n == 0: print >>sys.stderr, 'nothing to output; exiting!' return # open a bunch of output files for the different groups group_fps = {} for _ in range(group_n): group_fp = open('%s.group%04d.%s' % (args.prefix, _, suffix), 'w') group_fps[_] = group_fp # write 'em all out! 
total_seqs = 0 part_seqs = 0 toosmall_parts = 0 for filename in args.part_filenames: for index, read, partition_id in read_partition_file(filename): total_seqs += 1 if index % 100000 == 0: print >>sys.stderr, '...x2', index if partition_id == 0: continue try: group_n = group_d[partition_id] except KeyError: assert count[partition_id] <= args.min_part_size toosmall_parts += 1 continue outfp = group_fps[group_n] write_record(read, outfp) part_seqs += 1 print >>sys.stderr, '---' print >>sys.stderr, 'Of %d total seqs,' % total_seqs print >>sys.stderr, 'extracted %d partitioned seqs into group files,' % \ part_seqs print >>sys.stderr, \ 'discarded %d sequences from small partitions (see -m),' % \ toosmall_parts print >>sys.stderr, 'and found %d unpartitioned sequences (see -U).' % \ n_unassigned print >>sys.stderr, '' print >>sys.stderr, 'Created %d group files named %s.groupXXXX.%s' % \ (len(group_fps), args.prefix, suffix)
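# The grouping logic above, restated as two small stand-alone helpers for
# clarity: build the partition-size histogram written to the .dist file,
# and pack partitions (smallest first) into groups of at most max_size
# reads, dropping partitions at or below min_part_size.
def partition_histogram(count):
    dist = {}
    for size in count.values():
        dist[size] = dist.get(size, 0) + 1
    return dist  # {partition size: number of partitions of that size}


def assign_groups(count, min_part_size, max_size):
    group_d = {}
    group, total, group_n = set(), 0, 0
    for pid, n_reads in sorted(count.items(), key=lambda kv: kv[1]):
        if n_reads <= min_part_size:  # too small -- the '-m' filter
            continue
        group.add(pid)
        total += n_reads
        if total > max_size:          # close this group, start the next
            group_d.update((p, group_n) for p in group)
            group_n += 1
            group, total = set(), 0
    if group:
        group_d.update((p, group_n) for p in group)
        group_n += 1
    return group_d, group_n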
def main(): parser = get_parser() parser.epilog = parser.epilog.replace( "`reservoir sampling\n" "<http://en.wikipedia.org/wiki/Reservoir_sampling>`__ algorithm.", "reservoir sampling algorithm. " "http://en.wikipedia.org/wiki/Reservoir_sampling") args = sanitize_help(parser).parse_args() for name in args.filenames: check_input_files(name, args.force) # seed the random number generator? if args.random_seed: random.seed(args.random_seed) # bound n_samples num_samples = max(args.num_samples, 1) # # Figure out what the output filename is going to be if args.output_file: output_filename = args.output_file.name if num_samples > 1: sys.stderr.write( "Error: cannot specify -o with more than one sample.") if not args.force: print( "NOTE: This can be overridden using the --force" " argument", file=sys.stderr) sys.exit(1) else: filename = args.filenames[0] if filename in ('/dev/stdin', '-'): print( "Accepting input from stdin; output filename must " "be provided with '-o'.", file=sys.stderr) sys.exit(1) output_filename = os.path.basename(filename) + '.subset' filename = args.filenames[0] if filename in ('/dev/stdin', '-'): # seqan only treats '-' as "read from stdin" filename = '-' if num_samples == 1: print('Subsampling %d reads using reservoir sampling.' % args.num_reads, file=sys.stderr) print('Subsampled reads will be placed in %s' % output_filename, file=sys.stderr) print('', file=sys.stderr) else: # > 1 print('Subsampling %d reads, %d times,' % (args.num_reads, num_samples), ' using reservoir sampling.', file=sys.stderr) print('Subsampled reads will be placed in %s.N' % output_filename, file=sys.stderr) print('', file=sys.stderr) reads = [] for _ in range(num_samples): reads.append([]) # read through all the sequences and load/resample the reservoir for filename in args.filenames: print('opening', filename, 'for reading', file=sys.stderr) for count, (_, _, rcrd1, rcrd2) in enumerate( broken_paired_reader(ReadParser(filename), force_single=args.force_single)): if count % 10000 == 0: print('...', count, 'reads scanned', file=sys.stderr) if count >= args.max_reads: print('reached upper limit of %d reads' % args.max_reads, '(see -M); exiting', file=sys.stderr) break # collect first N reads if count < args.num_reads: for sample in range(num_samples): reads[sample].append((rcrd1, rcrd2)) else: for sample in range(num_samples): assert len(reads[sample]) <= count # use reservoir sampling to replace reads at random # see http://en.wikipedia.org/wiki/Reservoir_sampling for n in range(num_samples): guess = random.randint(1, count) if guess <= args.num_reads: reads[n][guess - 1] = (rcrd1, rcrd2) # output all the subsampled reads: if len(reads) == 1: print('Writing %d sequences to %s' % (len(reads[0]), output_filename), file=sys.stderr) output_file = args.output_file if not output_file: output_file = open(output_filename, 'wb') output_file = get_file_writer(output_file, args.gzip, args.bzip) for records in reads[0]: write_record(records[0], output_file) if records[1] is not None: write_record(records[1], output_file) else: for n in range(num_samples): n_filename = output_filename + '.%d' % n print('Writing %d sequences to %s' % (len(reads[n]), n_filename), file=sys.stderr) output_file = get_file_writer(open(n_filename, 'wb'), args.gzip, args.bzip) for records in reads[n]: write_record(records[0], output_file) if records[1] is not None: write_record(records[1], output_file)
def main(): info('sample-reads-randomly.py') args = get_parser().parse_args() for _ in args.filenames: check_file_status(_, args.force) check_space(args.filenames, args.force) # seed the random number generator? if args.random_seed: random.seed(args.random_seed) # bound n_samples num_samples = max(args.num_samples, 1) # # Figure out what the output filename is going to be # output_file = args.output_file if output_file: if num_samples > 1: sys.stderr.write( "Error: cannot specify -o with more than one sample.") if not args.force: sys.exit(1) output_filename = output_file.name else: filename = args.filenames[0] output_filename = os.path.basename(filename) + '.subset' if num_samples == 1: print >>sys.stderr, 'Subsampling %d reads using reservoir sampling.' % \ args.num_reads print >>sys.stderr, 'Subsampled reads will be placed in %s' % \ output_filename print >>sys.stderr, '' else: # > 1 print >>sys.stderr, 'Subsampling %d reads, %d times,' \ % (args.num_reads, num_samples), ' using reservoir sampling.' print >>sys.stderr, 'Subsampled reads will be placed in %s.N' \ % output_filename print >>sys.stderr, '' reads = [] for n in range(num_samples): reads.append([]) total = 0 # read through all the sequences and load/resample the reservoir for filename in args.filenames: print >>sys.stderr, 'opening', filename, 'for reading' for record in screed.open(filename): total += 1 if total % 10000 == 0: print >>sys.stderr, '...', total, 'reads scanned' if total >= args.max_reads: print >>sys.stderr, 'reached upper limit of %d reads' % \ args.max_reads, '(see -M); exiting' break # collect first N reads if total <= args.num_reads: for n in range(num_samples): reads[n].append(record) else: # use reservoir sampling to replace reads at random # see http://en.wikipedia.org/wiki/Reservoir_sampling for n in range(num_samples): guess = random.randint(1, total) if guess <= args.num_reads: reads[n][guess - 1] = record # output all the subsampled reads: if len(reads) == 1: print >>sys.stderr, 'Writing %d sequences to %s' % \ (len(reads[0]), output_filename) if not output_file: output_file = open(output_filename, 'w') for record in reads[0]: write_record(record, output_file) else: for n in range(num_samples): n_filename = output_filename + '.%d' % n print >>sys.stderr, 'Writing %d sequences to %s' % \ (len(reads[n]), n_filename) output_file = open(n_filename, 'w') for record in reads[n]: write_record(record, output_file)
def normalize_by_median(input_filename, outfp, htable, paired, cutoff, report_fp=None): desired_coverage = cutoff ksize = htable.ksize() # In paired mode we read two records at a time batch_size = 1 if paired: batch_size = 2 index = -1 total = 0 discarded = 0 for index, batch in enumerate(batchwise(screed.open( input_filename, parse_description=False), batch_size)): if index > 0 and index % 100000 == 0: print >>sys.stderr, '... kept {kept} of {total} or'\ ' {perc:2}%'.format(kept=total - discarded, total=total, perc=int(100. - discarded / float(total) * 100.)) print >>sys.stderr, '... in file', input_filename if report_fp: print >> report_fp, total, total - discarded, \ 1. - (discarded / float(total)) report_fp.flush() total += batch_size # If in paired mode, check that the reads are properly interleaved if paired: if not check_is_pair(batch[0], batch[1]): raise IOError('Error: Improperly interleaved pairs \ {b0} {b1}'.format(b0=batch[0].name, b1=batch[1].name)) # Emit the batch of reads if any read passes the filter # and all reads are longer than K passed_filter = False passed_length = True for record in batch: if len(record.sequence) < ksize: passed_length = False continue seq = record.sequence.replace('N', 'A') med, _, _ = htable.get_median_count(seq) if med < desired_coverage: htable.consume(seq) passed_filter = True # Emit records if any passed if passed_length and passed_filter: for record in batch: write_record(record, outfp) else: discarded += batch_size if report_fp: print >> report_fp, total, total - discarded, \ 1. - (discarded / float(total)) report_fp.flush() return total, discarded
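# batchwise() is not defined in this excerpt; a minimal equivalent that
# yields non-overlapping tuples of `size` consecutive records. Paired mode
# above uses size=2, so interleaved mates always travel together:
def batchwise_sketch(iterable, size):
    it = iter(iterable)
    return zip(*[it] * size)

# e.g. list(batchwise_sketch('abcdef', 2)) -> [('a','b'), ('c','d'), ('e','f')]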
def main(): # pylint: disable=too-many-branches,too-many-statements info('normalize-by-median.py', ['diginorm']) args = get_parser().parse_args() report_on_config(args) report_fp = args.report force_single = args.force_single # check for similar filenames # if we're using a single output file only check for identical filenames # otherwise, check for identical BASE names as well. filenames = [] basenames = [] for pathfilename in args.input_filenames: filenames.append(pathfilename) if args.single_output_file: continue # nothing more to worry about basename = os.path.basename(pathfilename) if basename in basenames: print('ERROR: Duplicate filename--Cannot handle this!', file=sys.stderr) print('** Exiting!', file=sys.stderr) sys.exit(1) basenames.append(basename) # check that files exist and there is sufficient output disk space. check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savetable: check_space_for_hashtable(args.n_tables * args.min_tablesize, args.force) # load or create counting table. if args.loadtable: print('loading k-mer counting table from ' + args.loadtable, file=sys.stderr) htable = khmer.load_counting_hash(args.loadtable) else: print('making k-mer counting table', file=sys.stderr) htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables) input_filename = None # create an object to handle diginorm of all files norm = Normalizer(args.cutoff, htable) # make a list of all filenames and if they're paired or not; # if we don't know if they're paired, default to allowing but not # forcing pairing. files = [] for e in filenames: files.append([e, args.paired]) if args.unpaired_reads: files.append([args.unpaired_reads, False]) corrupt_files = [] outfp = None output_name = None if args.single_output_file: if args.single_output_file is sys.stdout: output_name = '/dev/stdout' else: output_name = args.single_output_file.name outfp = args.single_output_file # # main loop: iterate over all files given, do diginorm. # for filename, require_paired in files: if not args.single_output_file: output_name = os.path.basename(filename) + '.keep' outfp = open(output_name, 'w') # failsafe context manager in case an input file breaks with CatchIOErrors(filename, outfp, args.single_output_file, args.force, corrupt_files): screed_iter = screed.open(filename, parse_description=False) reader = broken_paired_reader(screed_iter, min_length=args.ksize, force_single=force_single, require_paired=require_paired) # actually do diginorm for record in WithDiagnostics(filename, norm, reader, report_fp): if record is not None: write_record(record, outfp) print('output in ' + output_name, file=sys.stderr) if output_name != '/dev/stdout': outfp.close() # finished - print out some diagnostics. print('Total number of unique k-mers: {0}'.format(htable.n_unique_kmers()), file=sys.stderr) if args.savetable: print('...saving to ' + args.savetable, file=sys.stderr) htable.save(args.savetable) fp_rate = \ khmer.calc_expected_collisions(htable, args.force, max_false_pos=.8) # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975 print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate), file=sys.stderr) if args.force and len(corrupt_files) > 0: print("** WARNING: Finished with errors!", file=sys.stderr) print("** IOErrors occurred in the following files:", file=sys.stderr) print("\t", " ".join(corrupt_files), file=sys.stderr)
def main(): info('sweep-reads-buffered.py', ['sweep']) parser = get_parser() args = parser.parse_args() if args.min_tablesize < MIN_HSIZE: args.min_tablesize = MIN_HSIZE if args.ksize < MIN_KSIZE: args.ksize = MIN_KSIZE report_on_config(args, hashtype='hashbits') K = args.ksize HT_SIZE = args.min_tablesize N_HT = args.n_tables traversal_range = args.traversal_range input_fastp = args.input_fastp if not args.outdir: outdir = os.path.dirname(input_fastp) else: outdir = args.outdir max_buffers = args.max_buffers output_pref = args.output_prefix buf_size = args.buffer_size max_reads = args.max_reads check_input_files(args.input_fastp, args.force) check_valid_file_exists(args.input_files) all_input_files = [input_fastp] all_input_files.extend(args.input_files) # Check disk space availability check_space(all_input_files, args.force) # figure out input file type (FA/FQ) -- based on first file ix = iter(screed.open(args.input_files[0])) record = ix.next() del ix extension = 'fa' if hasattr(record, 'quality'): # fastq! extension = 'fq' output_buffer = ReadBufferManager( max_buffers, max_reads, buf_size, output_pref, outdir, extension) # consume the partitioned fasta with which to label the graph ht = khmer.LabelHash(K, HT_SIZE, N_HT) try: print >>sys.stderr, 'consuming input sequences...' if args.label_by_pid: print >>sys.stderr, '...labeling by partition id (pid)' ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp) elif args.label_by_seq: print >>sys.stderr, '...labeling by sequence' for n, record in enumerate(screed.open(input_fastp)): if n % 50000 == 0: print >>sys.stderr, \ '...consumed {n} sequences...'.format(n=n) ht.consume_sequence_and_tag_with_labels(record.sequence, n) else: print >>sys.stderr, \ '...labeling to create groups of size {s}'.format( s=args.group_size) label = -1 g = 0 try: outfp = open('{pref}_base_{g}.{ext}'.format(pref=output_pref, g=g, ext=extension ), 'wb') for n, record in enumerate(screed.open(input_fastp)): if n % args.group_size == 0: label += 1 if label > g: g = label outfp = open('{pref}_base_{g}.{ext}'.format( pref=output_pref, g=g, ext=extension), 'wb') if n % 50000 == 0: print >>sys.stderr, \ '...consumed {n} sequences...'.format(n=n) ht.consume_sequence_and_tag_with_labels(record.sequence, label) write_record(record, outfp) except IOError as e: print >>sys.stderr, '!! ERROR !!', e print >>sys.stderr, '...error splitting input. exiting...' except IOError as e: print >>sys.stderr, '!! ERROR: !!', e print >>sys.stderr, '...error consuming \ {i}. exiting...'.format(i=input_fastp) print >>sys.stderr, 'done consuming input sequence. \ added {t} tags and {l} \ labels...'.format(t=ht.n_tags(), l=ht.n_labels()) label_dict = defaultdict(int) label_number_dist = [] n_orphaned = 0 n_labeled = 0 n_mlabeled = 0 total_t = time.clock() start_t = time.clock() for read_file in args.input_files: print >>sys.stderr, '** sweeping {read_file} for labels...'.format( read_file=read_file) file_t = 0.0 try: read_fp = screed.open(read_file) except IOError as error: print >>sys.stderr, '!! 
ERROR: !!', error print >>sys.stderr, '*** Could not open {fn}, skipping...'.format( fn=read_file) else: for _, record in enumerate(read_fp): if _ % 50000 == 0: end_t = time.clock() batch_t = end_t - start_t file_t += batch_t print >>sys.stderr, '\tswept {n} reads [{nc} labeled, \ {no} orphaned] \ ** {sec}s ({sect}s total)' \ .format(n=_, nc=n_labeled, no=n_orphaned, sec=batch_t, sect=file_t) start_t = time.clock() seq = record.sequence name = record.name try: labels = ht.sweep_label_neighborhood(seq, traversal_range) except ValueError as e: pass else: if hasattr(record, 'quality'): seq_str = fmt_fastq(name, seq, record.quality, labels) else: seq_str = fmt_fasta(name, seq, labels) label_number_dist.append(len(labels)) if labels: n_labeled += 1 if len(labels) > 1: output_buffer.queue(seq_str, 'multi') n_mlabeled += 1 label_dict['multi'] += 1 else: output_buffer.queue(seq_str, labels[0]) label_dict[labels[0]] += 1 else: n_orphaned += 1 output_buffer.queue(seq_str, 'orphaned') label_dict['orphaned'] += 1 print >>sys.stderr, '** End of file {fn}...'.format(fn=read_file) output_buffer.flush_all() read_fp.close() # gotta output anything left in the buffers at the end! print >>sys.stderr, '** End of run...' output_buffer.flush_all() total_t = time.clock() - total_t if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0: print >>sys.stderr, '! WARNING: Sweep finished with errors !' print >>sys.stderr, '** {writee} reads not written'.format( writee=output_buffer.num_write_errors) print >>sys.stderr, '** {filee} errors opening files'.format( filee=output_buffer.num_file_errors) print >>sys.stderr, 'swept {n_reads} for labels...'.format( n_reads=n_labeled + n_orphaned) print >>sys.stderr, '...with {nc} labeled and {no} orphaned'.format( nc=n_labeled, no=n_orphaned) print >>sys.stderr, '...and {nmc} multilabeled'.format(nmc=n_mlabeled) print >>sys.stderr, '** outputting label number distribution...' fn = os.path.join(outdir, '{pref}.dist.txt'.format(pref=output_pref)) with open(fn, 'wb') as outfp: for nc in label_number_dist: outfp.write('{nc}\n'.format(nc=nc)) fn = os.path.join(outdir, '{pref}.counts.csv'.format(pref=output_pref)) print >>sys.stderr, '** outputting label read counts...' with open(fn, 'wb') as outfp: for k in label_dict: outfp.write('{l},{c}\n'.format(l=k, c=label_dict[k]))
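# fmt_fasta()/fmt_fastq() are called above but defined elsewhere; a
# plausible minimal form that carries the swept labels in the header line.
# The exact label encoding is an assumption, not the project's format:
def fmt_fasta_sketch(name, seq, labels):
    return '>%s\t%s\n%s\n' % (name, ','.join(str(l) for l in labels), seq)


def fmt_fastq_sketch(name, seq, quality, labels):
    return '@%s\t%s\n%s\n+\n%s\n' % (
        name, ','.join(str(l) for l in labels), seq, quality)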
def main(): # pylint: disable=too-many-branches,too-many-statements parser = sanitize_help(get_parser()) args = parser.parse_args() configure_logging(args.quiet) report_on_config(args) report_fp = args.report force_single = args.force_single # check for similar filenames # if we're using a single output file only check for identical filenames # otherwise, check for identical BASE names as well. filenames = [] basenames = [] for pathfilename in args.input_filenames: filenames.append(pathfilename) if args.single_output_file: continue # nothing more to worry about basename = os.path.basename(pathfilename) if basename in basenames: log_error('ERROR: Duplicate filename--Cannot handle this!') log_error('** Exiting!') sys.exit(1) basenames.append(basename) # check that files exist and there is sufficient output disk space. check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savegraph is not None: graphsize = calculate_graphsize(args, 'countgraph') check_space_for_graph(args.savegraph, graphsize, args.force) # load or create counting table. if args.loadgraph: log_info('loading k-mer countgraph from {graph}', graph=args.loadgraph) countgraph = Countgraph.load(args.loadgraph) else: log_info('making countgraph') countgraph = khmer_args.create_countgraph(args) # create an object to handle diginorm of all files norm = Normalizer(args.cutoff, countgraph) with_diagnostics = WithDiagnostics(norm, report_fp, args.report_frequency) # make a list of all filenames and if they're paired or not; # if we don't know if they're paired, default to allowing but not # forcing pairing. files = [] for element in filenames: files.append([element, args.paired]) if args.unpaired_reads: files.append([args.unpaired_reads, False]) corrupt_files = [] outfp = None output_name = None if args.single_output_file: outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip) else: if '-' in filenames or '/dev/stdin' in filenames: print("Accepting input from stdin; output filename must " "be provided with '-o'.", file=sys.stderr) sys.exit(1) # # main loop: iterate over all files given, do diginorm. # for filename, require_paired in files: if not args.single_output_file: output_name = os.path.basename(filename) + '.keep' outfp = open(output_name, 'wb') outfp = get_file_writer(outfp, args.gzip, args.bzip) # failsafe context manager in case an input file breaks with catch_io_errors(filename, outfp, args.single_output_file, args.force, corrupt_files): screed_iter = clean_input_reads(screed.open(filename)) reader = broken_paired_reader(screed_iter, min_length=args.ksize, force_single=force_single, require_paired=require_paired) # actually do diginorm for record in with_diagnostics(reader, filename): if record is not None: write_record(record, outfp) log_info('output in {name}', name=describe_file_handle(outfp)) if not args.single_output_file: outfp.close() # finished - print out some diagnostics. 
log_info('Total number of unique k-mers: {umers}', umers=countgraph.n_unique_kmers()) if args.savegraph is not None: log_info('...saving to {name}', name=args.savegraph) countgraph.save(args.savegraph) fp_rate = \ khmer.calc_expected_collisions(countgraph, False, max_false_pos=.8) # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975 log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate) if args.force and len(corrupt_files) > 0: log_error("** WARNING: Finished with errors!") log_error("** I/O Errors occurred in the following files:") log_error("\t" + " ".join(corrupt_files))
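# Rough sketch of the false-positive estimate reported above: for a
# Bloom-filter-style table the collision rate is roughly the occupancy
# (fraction of slots in use), and with Z independent tables the rates
# multiply (see Zhang et al., http://arxiv.org/abs/1309.2975). The formula
# below is an assumption for intuition, not khmer's exact computation:
def expected_collisions_sketch(n_occupied, tablesize, n_tables):
    occupancy = float(n_occupied) / tablesize
    return occupancy ** n_tables

# e.g. expected_collisions_sketch(5e8, 1e9, 4) -> 0.0625 (half full, 4 tables)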
def main(): info('split-paired-reads.py') args = sanitize_help(get_parser()).parse_args() infile = args.infile filenames = [infile] check_input_files(infile, args.force) check_space(filenames, args.force) basename = os.path.basename(infile) # decide where to put output files - specific directory? or just default? if infile in ('/dev/stdin', '-'): if not (args.output_first and args.output_second): print( "Accepting input from stdin; " "output filenames must be provided.", file=sys.stderr) sys.exit(1) elif args.output_directory: if not os.path.exists(args.output_directory): os.makedirs(args.output_directory) out1 = os.path.join(args.output_directory, basename + '.1') out2 = os.path.join(args.output_directory, basename + '.2') else: out1 = basename + '.1' out2 = basename + '.2' # OVERRIDE output file locations with -1, -2 if args.output_first: fp_out1 = get_file_writer(args.output_first, args.gzip, args.bzip) out1 = fp_out1.name else: # Use default filename created above fp_out1 = get_file_writer(open(out1, 'wb'), args.gzip, args.bzip) if args.output_second: fp_out2 = get_file_writer(args.output_second, args.gzip, args.bzip) out2 = fp_out2.name else: # Use default filename created above fp_out2 = get_file_writer(open(out2, 'wb'), args.gzip, args.bzip) # put orphaned reads here, if -0! if args.output_orphaned: fp_out0 = get_file_writer(args.output_orphaned, args.gzip, args.bzip) out0 = describe_file_handle(args.output_orphaned) counter1 = 0 counter2 = 0 counter3 = 0 index = None screed_iter = screed.open(infile) # walk through all the reads in broken-paired mode. paired_iter = broken_paired_reader(screed_iter, require_paired=not args.output_orphaned) try: for index, is_pair, record1, record2 in paired_iter: if index % 10000 == 0: print('...', index, file=sys.stderr) if is_pair: write_record(record1, fp_out1) counter1 += 1 write_record(record2, fp_out2) counter2 += 1 elif args.output_orphaned: write_record(record1, fp_out0) counter3 += 1 except UnpairedReadsError as e: print("Unpaired reads found starting at {name}; exiting".format( name=e.read1.name), file=sys.stderr) sys.exit(1) print("DONE; split %d sequences (%d left, %d right, %d orphans)" % (counter1 + counter2, counter1, counter2, counter3), file=sys.stderr) print("/1 reads in %s" % out1, file=sys.stderr) print("/2 reads in %s" % out2, file=sys.stderr) if args.output_orphaned: print("orphans in %s" % out0, file=sys.stderr)
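# check_is_left()/check_is_right() classify orphaned reads by name; a sketch
# covering the two conventions these scripts juggle -- old-style '/1','/2'
# suffixes and Casava 1.8 '1:N:...','2:N:...' fields. Edge-case handling in
# the real helpers is assumed, not copied:
def check_is_left_sketch(name):
    return name.endswith('/1') or ' 1:' in name


def check_is_right_sketch(name):
    return name.endswith('/2') or ' 2:' in name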
def main(): info('sample-reads-randomly.py') args = get_parser().parse_args() for _ in args.filenames: check_input_files(_, args.force) check_space(args.filenames, args.force) # seed the random number generator? if args.random_seed: random.seed(args.random_seed) # bound n_samples num_samples = max(args.num_samples, 1) # # Figure out what the output filename is going to be # output_file = args.output_file if output_file: if num_samples > 1: sys.stderr.write( "Error: cannot specify -o with more than one sample.") if not args.force: sys.exit(1) output_filename = output_file.name else: filename = args.filenames[0] output_filename = os.path.basename(filename) + '.subset' if num_samples == 1: print('Subsampling %d reads using reservoir sampling.' % args.num_reads, file=sys.stderr) print('Subsampled reads will be placed in %s' % output_filename, file=sys.stderr) print('', file=sys.stderr) else: # > 1 print('Subsampling %d reads, %d times,' % (args.num_reads, num_samples), ' using reservoir sampling.', file=sys.stderr) print('Subsampled reads will be placed in %s.N' % output_filename, file=sys.stderr) print('', file=sys.stderr) reads = [] for n in range(num_samples): reads.append([]) # read through all the sequences and load/resample the reservoir for filename in args.filenames: print('opening', filename, 'for reading', file=sys.stderr) screed_iter = screed.open(filename, parse_description=False) for count, (_, ispair, rcrd1, rcrd2) in enumerate(broken_paired_reader( screed_iter, force_single=args.force_single)): if count % 10000 == 0: print('...', count, 'reads scanned', file=sys.stderr) if count >= args.max_reads: print('reached upper limit of %d reads' % args.max_reads, '(see -M); exiting', file=sys.stderr) break # collect first N reads if count < args.num_reads: for n in range(num_samples): reads[n].append((rcrd1, rcrd2)) else: for n in range(num_samples): assert len(reads[n]) <= count # use reservoir sampling to replace reads at random # see http://en.wikipedia.org/wiki/Reservoir_sampling for n in range(num_samples): guess = random.randint(1, count) if guess <= args.num_reads: reads[n][guess - 1] = (rcrd1, rcrd2) # output all the subsampled reads: if len(reads) == 1: print('Writing %d sequences to %s' % (len(reads[0]), output_filename), file=sys.stderr) if not output_file: output_file = open(output_filename, 'w') for records in reads[0]: write_record(records[0], output_file) if records[1] is not None: write_record(records[1], output_file) else: for n in range(num_samples): n_filename = output_filename + '.%d' % n print('Writing %d sequences to %s' % (len(reads[n]), n_filename), file=sys.stderr) output_file = open(n_filename, 'w') for records in reads[n]: write_record(records[0], output_file) if records[1] is not None: write_record(records[1], output_file)
def main(): info('trim-low-abund.py', ['streaming']) parser = get_parser() args = parser.parse_args() ### if len(set(args.input_filenames)) != len(args.input_filenames): print >>sys.stderr, \ "Error: Cannot input the same filename multiple times." sys.exit(1) ### report_on_config(args) check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savetable: check_space_for_hashtable(args.n_tables * args.min_tablesize, args.force) K = args.ksize CUTOFF = args.cutoff NORMALIZE_LIMIT = args.normalize_to if args.loadtable: print >> sys.stderr, 'loading k-mer counting table from', args.loadtable ct = khmer.load_counting_hash(args.loadtable) else: print >> sys.stderr, 'making k-mer counting table' ct = khmer.new_counting_hash(K, args.min_tablesize, args.n_tables) tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir) print >>sys.stderr, 'created temporary directory %s; ' \ 'use -T to change location' % tempdir # ### FIRST PASS ### save_pass2_total = 0 n_bp = 0 n_reads = 0 written_bp = 0 written_reads = 0 trimmed_reads = 0 pass2list = [] for filename in args.input_filenames: pass2filename = os.path.basename(filename) + '.pass2' pass2filename = os.path.join(tempdir, pass2filename) if args.out is None: trimfp = open(os.path.basename(filename) + '.abundtrim', 'w') else: trimfp = args.out pass2list.append((filename, pass2filename, trimfp)) screed_iter = screed.open(filename, parse_description=False) pass2fp = open(pass2filename, 'w') save_pass2 = 0 n = 0 paired_iter = broken_paired_reader(screed_iter, min_length=K, force_single=args.ignore_pairs) for n, is_pair, read1, read2 in paired_iter: if n % 10000 == 0: print >>sys.stderr, '...', n, filename, save_pass2, \ n_reads, n_bp, written_reads, written_bp # we want to track paired reads here, to make sure that pairs # are not split between first pass and second pass. if is_pair: n_reads += 2 n_bp += len(read1.sequence) + len(read2.sequence) seq1 = read1.sequence.replace('N', 'A') seq2 = read2.sequence.replace('N', 'A') med1, _, _ = ct.get_median_count(seq1) med2, _, _ = ct.get_median_count(seq2) if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT: ct.consume(seq1) ct.consume(seq2) write_record_pair(read1, read2, pass2fp) save_pass2 += 2 else: _, trim_at1 = ct.trim_on_abundance(seq1, CUTOFF) _, trim_at2 = ct.trim_on_abundance(seq2, CUTOFF) if trim_at1 >= K: read1 = trim_record(read1, trim_at1) if trim_at2 >= K: read2 = trim_record(read2, trim_at2) if trim_at1 != len(seq1): trimmed_reads += 1 if trim_at2 != len(seq2): trimmed_reads += 1 write_record_pair(read1, read2, trimfp) written_reads += 2 written_bp += trim_at1 + trim_at2 else: n_reads += 1 n_bp += len(read1.sequence) seq = read1.sequence.replace('N', 'A') med, _, _ = ct.get_median_count(seq) # has this portion of the graph saturated? if not, # consume & save => pass2. if med < NORMALIZE_LIMIT: ct.consume(seq) write_record(read1, pass2fp) save_pass2 += 1 else: # trim!! _, trim_at = ct.trim_on_abundance(seq, CUTOFF) if trim_at >= K: new_read = trim_record(read1, trim_at) write_record(new_read, trimfp) written_reads += 1 written_bp += trim_at if trim_at != len(read1.sequence): trimmed_reads += 1 pass2fp.close() print >>sys.stderr, '%s: kept aside %d of %d from first pass, in %s' \ % (filename, save_pass2, n, filename) save_pass2_total += save_pass2 # ### SECOND PASS. 
### skipped_n = 0 skipped_bp = 0 for _, pass2filename, trimfp in pass2list: print >> sys.stderr, ('second pass: looking at sequences kept aside ' 'in %s') % pass2filename # note that for this second pass, we don't care about paired # reads - they will be output in the same order they're read in, # so pairs will stay together if not orphaned. This is in contrast # to the first loop. for n, read in enumerate( screed.open(pass2filename, parse_description=False)): if n % 10000 == 0: print >>sys.stderr, '... x 2', n, pass2filename, \ written_reads, written_bp seq = read.sequence.replace('N', 'A') med, _, _ = ct.get_median_count(seq) # do we retain low-abundance components unchanged? if med < NORMALIZE_LIMIT and args.variable_coverage: write_record(read, trimfp) written_reads += 1 written_bp += len(read.sequence) skipped_n += 1 skipped_bp += len(read.sequence) # otherwise, examine/trim/truncate. else: # med >= NORMALIZE_LIMIT or not args.variable_coverage _, trim_at = ct.trim_on_abundance(seq, CUTOFF) if trim_at >= K: new_read = trim_record(read, trim_at) write_record(new_read, trimfp) written_reads += 1 written_bp += trim_at if trim_at != len(read.sequence): trimmed_reads += 1 print >> sys.stderr, 'removing %s' % pass2filename os.unlink(pass2filename) print >> sys.stderr, 'removing temp directory & contents (%s)' % tempdir shutil.rmtree(tempdir) n_passes = 1.0 + (float(save_pass2_total) / n_reads) percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\ n_reads * 100.0 print >> sys.stderr, 'read %d reads, %d bp' % ( n_reads, n_bp, ) print >> sys.stderr, 'wrote %d reads, %d bp' % ( written_reads, written_bp, ) print >>sys.stderr, 'looked at %d reads twice (%.2f passes)' % \ (save_pass2_total, n_passes) print >>sys.stderr, 'removed %d reads and trimmed %d reads (%.2f%%)' % \ (n_reads - written_reads, trimmed_reads, percent_reads_trimmed) print >>sys.stderr, 'trimmed or removed %.2f%% of bases (%d total)' % \ ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp) if args.variable_coverage: percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads print >>sys.stderr, '%d reads were high coverage (%.2f%%);' % \ (n_reads - skipped_n, percent_reads_hicov) print >> sys.stderr, ('skipped %d reads/%d bases because of low ' 'coverage') % (skipped_n, skipped_bp) fp_rate = \ khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8) # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975 print >>sys.stderr, \ 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate) print >> sys.stderr, 'output in *.abundtrim' if args.savetable: print >> sys.stderr, "Saving k-mer counting table to", args.savetable ct.save(args.savetable)
def main(): # pylint: disable=too-many-branches,too-many-statements start_time = time.time() parser = sanitize_help(get_parser()) args = parser.parse_args() configure_logging(args.quiet) report_on_config(args) report_fp = args.report force_single = args.force_single # check for similar filenames # if we're using a single output file only check for identical filenames # otherwise, check for identical BASE names as well. filenames = [] basenames = [] for pathfilename in args.input_filenames: filenames.append(pathfilename) if args.single_output_file: continue # nothing more to worry about basename = os.path.basename(pathfilename) if basename in basenames: log_error('ERROR: Duplicate filename--Cannot handle this!') log_error('** Exiting!') sys.exit(1) basenames.append(basename) # check that files exist and there is sufficient output disk space. check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savegraph is not None: graphsize = calculate_graphsize(args, 'countgraph') check_space_for_graph(args.savegraph, graphsize, args.force) # load or create counting table. if args.loadgraph: log_info('loading k-mer countgraph from {graph}', graph=args.loadgraph) countgraph1 = Countgraph.load(args.loadgraph) # load second counting table. if args.loadgraph2: log_info('loading k-mer countgraph from {graph}', graph=args.loadgraph2) countgraph2 = Countgraph.load(args.loadgraph2) # make a list of all filenames and if they're paired or not; # if we don't know if they're paired, default to allowing but not # forcing pairing. files = [] for element in filenames: files.append([element, args.paired]) if args.unpaired_reads: files.append([args.unpaired_reads, False]) # # main loop: iterate over all files given, do diginorm. # for filename, require_paired in files: if not args.single_output_file: output_name = os.path.basename(filename) + '.keep' outfp = open(output_name, 'wb') outfp = get_file_writer(outfp, args.gzip, args.bzip) screed_iter = clean_input_reads(screed.open(filename)) reader = broken_paired_reader(screed_iter, min_length=args.ksize, force_single=force_single, require_paired=require_paired) # actually do diginorm for _, is_paired, read0, read1 in reader: for record in snarf(is_paired, read0, read1, countgraph1, countgraph2): if record is not None: write_record(record, outfp) print("--- %s seconds ---" % (time.time() - start_time))
def main(): info('split-paired-reads.py') args = get_parser().parse_args() infile = args.infile check_input_files(infile, args.force) filenames = [infile] check_space(filenames, args.force) # decide where to put output files - specific directory? or just default? if args.output_directory: if not os.path.exists(args.output_directory): os.makedirs(args.output_directory) out1 = args.output_directory + '/' + os.path.basename(infile) + '.1' out2 = args.output_directory + '/' + os.path.basename(infile) + '.2' else: out1 = os.path.basename(infile) + '.1' out2 = os.path.basename(infile) + '.2' # OVERRIDE output file locations with -1, -2 if args.output_first: out1 = args.output_first if args.output_second: out2 = args.output_second fp_out1 = open(out1, 'w') fp_out2 = open(out2, 'w') counter1 = 0 counter2 = 0 index = None screed_iter = screed.open(infile, parse_description=False) # walk through all the reads in broken-paired mode. for index, is_pair, record1, record2 in broken_paired_reader(screed_iter): if index % 100000 == 0 and index: print >> sys.stderr, '...', index # are we requiring pairs? if args.force_paired and not is_pair: print >>sys.stderr, 'ERROR, %s is not part of a pair' % \ record1.name sys.exit(1) if is_pair: write_record(record1, fp_out1) counter1 += 1 write_record(record2, fp_out2) counter2 += 1 else: name = record1.name if check_is_left(name): write_record(record1, fp_out1) counter1 += 1 elif check_is_right(name): write_record(record1, fp_out2) counter2 += 1 else: print >>sys.stderr, \ "Unrecognized format for read pair information: %s" % name print >> sys.stderr, "Exiting." sys.exit(1) print >> sys.stderr, "DONE; split %d sequences (%d left, %d right)" % \ (counter1 + counter2, counter1, counter2) print >> sys.stderr, "/1 reads in %s" % out1 print >> sys.stderr, "/2 reads in %s" % out2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('database')
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        help='Input FAST[AQ] sequence filename.', nargs='+')
    parser.add_argument('-k', '--ksize', type=int, default=31)
    parser.add_argument('-p', '--paired', action='store_true',
                        help='require that all sequences be properly paired')
    parser.add_argument('--force_single', dest='force_single',
                        action='store_true',
                        help='treat all sequences as single-ended/unpaired')
    parser.add_argument('-u', '--unpaired-reads',
                        metavar='unpaired_reads_filename',
                        help='include a file of unpaired reads to which '
                        '-p/--paired does not apply.')
    parser.add_argument('-f', '--force', dest='force', action='store_true',
                        help='continue past file reading errors')
    args = parser.parse_args()

    force_single = args.force_single

    # check that input files exist
    check_valid_file_exists(args.input_filenames)

    filenames = list(args.input_filenames)

    # make a list of all filenames and whether they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = []
    for element in filenames:
        files.append([element, args.paired])
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    # create a khmer Nodetable, used solely for its k-mer hashing
    kh = khmer.Nodetable(args.ksize, 1, 1)

    # load database
    mphf_filename = args.database + '.mphf'
    array_filename = args.database + '.arr'
    print('loading database {}'.format(args.database))

    with open(array_filename, 'rb') as fp:
        mphf_to_kmer, mphf_to_cdbg, family_ids, cdbg_to_family_id = \
            pickle.load(fp)
    mphf = bbhash.load_mphf(mphf_filename)

    print('done!')

    def get_kmer_to_family_ids(hashval):
        mphf_hash = mphf.lookup(hashval)
        if mphf_hash is None:
            return set()
        kmer_hash = mphf_to_kmer[mphf_hash]
        if kmer_hash != hashval:
            return set()
        cdbg_id = mphf_to_cdbg[mphf_hash]
        id_list = cdbg_to_family_id[cdbg_id]
        return id_list

    def readFusion(read):
        global n_unmatched, n_same, n_amb_same, n_clear_fusion, \
            n_ambig_fusion, n_multi_fusion

        flag = None
        lf_ids = set()
        rt_ids = set()
        families = []
        shared_kmers = []
        gaps = []

        hashvals = kh.get_kmer_hashes(read.sequence)

        # find a matching k-mer at the beginning of the read
        lf = hashvals[0]
        lf_ids = get_kmer_to_family_ids(lf)
        idx = 1
        while idx < len(hashvals) and len(lf_ids) == 0:
            lf = hashvals[idx]
            lf_ids = get_kmer_to_family_ids(lf)
            idx += 1

        if len(lf_ids) == 0:
            # no k-mer in the read matched the database at all
            n_unmatched += 1
            flag = "unmatched"
        elif idx == len(hashvals):
            # same reference; only the last k-mer matched
            families.append(lf_ids)
            if len(lf_ids) == 1:
                n_same += 1
                flag = "unique"
            else:
                n_amb_same += 1
                flag = "ambiguous"
        else:  # len(lf_ids) > 0 and idx < len(hashvals)
            # find a matching k-mer at the end of the read
            rt = hashvals[-1]
            rt_ids = get_kmer_to_family_ids(rt)
            idy = len(hashvals) - 2
            while idy >= idx and len(rt_ids) == 0:
                rt = hashvals[idy]
                rt_ids = get_kmer_to_family_ids(rt)
                idy -= 1

            if len(rt_ids) == 0:
                # same reference; only one non-last k-mer matched
                families.append(lf_ids)
                if len(lf_ids) == 1:
                    n_same += 1
                    flag = "unique"
                else:
                    n_amb_same += 1
                    flag = "ambiguous"
            else:
                intersect_ids = lf_ids.intersection(rt_ids)
                if len(intersect_ids) > 0:
                    families.append(intersect_ids)
                    if len(intersect_ids) == 1:
                        n_same += 1
                        flag = "unique"
                    else:
                        n_amb_same += 1
                        flag = "ambiguous"
                else:
                    # fusion to be resolved: walk the interior k-mers
                    shared_kmer = 1
                    gap_size = 0
                    while idx <= idy + 1:
                        temp = hashvals[idx]
                        temp_ids = get_kmer_to_family_ids(temp)
                        if len(temp_ids) > 0:
                            intersect_ids = lf_ids.intersection(temp_ids)
                            if len(intersect_ids) > 0:
                                lf_ids = intersect_ids
                                shared_kmer += 1
                                gap_size = 0
                            else:  # len(intersect_ids) == 0
                                families.append(lf_ids)
                                shared_kmers.append(shared_kmer)
                                lf_ids = temp_ids
                                shared_kmer = 1
                                gaps.append(gap_size)
                                gap_size = 0
                        else:
                            gap_size += 1
                        idx += 1
                    families.append(lf_ids)
                    shared_kmers.append(shared_kmer)

                    assert len(families) > 1
                    if len(families) == 2:
                        if len(families[0]) == 1 and len(families[1]) == 1:
                            n_clear_fusion += 1
                            flag = "clear_fusion"
                        else:
                            n_ambig_fusion += 1
                            flag = "ambig_fusion"
                    else:  # len(families) > 2
                        n_multi_fusion += 1
                        flag = "multi_fusion"

        return flag, families, shared_kmers, gaps

    # initialize the module-level counters readFusion updates via `global`
    # (the original excerpt never initialized them before first use)
    global n_unmatched, n_same, n_amb_same, n_clear_fusion, \
        n_ambig_fusion, n_multi_fusion
    n_unmatched = n_same = n_amb_same = 0
    n_clear_fusion = n_ambig_fusion = n_multi_fusion = 0

    fusion_filename = args.database + '_fusion.fa'
    fusion_fp = open(fusion_filename, 'w')
    fusionInfo_filename = args.database + '_fusion.info'
    fusionInfo_fp = open(fusionInfo_filename, 'w')
    print("fileName", "recordIndex", "whichInPair", "align_class",
          "gene_families", "shared_kmers", "gaps",
          file=fusionInfo_fp, sep='\t')
    fusionCalc_filename = args.database + '_fusion.calc'
    fusionCalc_fp = open(fusionCalc_filename, 'w')
    print("fileName", "recordIndex", "whichInPair", "align_class",
          "family_A", "family_B", "no_families", "len_families",
          "shared_kmers", "gaps", "sorted_keys",
          file=fusionCalc_fp, sep='\t')

    fusionPairs_filename = args.database + '_fusionPairs.fa'
    fusPair_fp = open(fusionPairs_filename, 'w')
    fusionPairsInfo_filename = args.database + '_fusionPairs.info'
    fusPairInfo_fp = open(fusionPairsInfo_filename, 'w')
    print("fileName", "recordIndex", "fusion_class", "R1_family",
          "R2_family", file=fusPairInfo_fp, sep='\t')
    fusionPairsCalc_filename = args.database + '_fusionPairs.calc'
    fusPairCalc_fp = open(fusionPairsCalc_filename, 'w')
    print("fileName", "recordIndex", "fusion_class", "family_A", "family_B",
          "len_families", "sorted_keys", file=fusPairCalc_fp, sep='\t')

    corrupt_files = []
    family_names = dict(zip(family_ids.values(), family_ids.keys()))
    n = 0
    n_paired_fusion = 0
    sameRef = ("unique", "ambiguous")
    fusion = ("clear_fusion", "ambig_fusion", "multi_fusion")

    for filename, require_paired in files:
        with catch_io_errors(filename, fusion_fp, fusionInfo_fp,
                             fusionCalc_fp, fusPair_fp, fusPairInfo_fp,
                             fusPairCalc_fp, args.force, corrupt_files):
            screed_iter = clean_input_reads(screed.open(filename))
            reader = broken_paired_reader(screed_iter, min_length=args.ksize,
                                          force_single=force_single,
                                          require_paired=require_paired)

            for r_index, is_paired, read0, read1 in reader:
                n += 1
                if n % 10000 == 0:
                    print('...', n)

                flag0, families0, shared_kmers0, gaps0 = readFusion(read0)

                if not is_paired and flag0 in fusion:
                    print(filename, r_index, "single", flag0, families0,
                          shared_kmers0, gaps0, file=fusionInfo_fp, sep='\t')
                    write_record(read0, fusion_fp)

                    i = len(families0) - 1
                    for g1 in families0[0]:
                        g1_name = family_names[g1]
                        for g2 in families0[i]:
                            g2_name = family_names[g2]
                            print(filename, r_index, "single", flag0,
                                  '{}:{}'.format(g1, g1_name),
                                  '{}:{}'.format(g2, g2_name),
                                  len(families0),
                                  [len(f) for f in families0],
                                  shared_kmers0, gaps0, sorted([g1, g2]),
                                  file=fusionCalc_fp, sep='\t')

                if is_paired:
                    flag1, families1, shared_kmers1, gaps1 = readFusion(read1)

                    if flag0 in fusion or flag1 in fusion:
                        print(filename, r_index, "Read_1", flag0, families0,
                              shared_kmers0, gaps0, file=fusionInfo_fp,
                              sep='\t')
                        write_record(read0, fusion_fp)

                        print(filename, r_index, "Read_2", flag1, families1,
                              shared_kmers1, gaps1, file=fusionInfo_fp,
                              sep='\t')
                        write_record(read1, fusion_fp)

                        if flag0 in fusion:
                            i = len(families0) - 1
                            for g1 in families0[0]:
                                g1_name = family_names[g1]
                                for g2 in families0[i]:
                                    g2_name = family_names[g2]
                                    print(filename, r_index, "Read_1", flag0,
                                          '{}:{}'.format(g1, g1_name),
                                          '{}:{}'.format(g2, g2_name),
                                          len(families0),
                                          [len(f) for f in families0],
                                          shared_kmers0, gaps0,
                                          sorted([g1, g2]),
                                          file=fusionCalc_fp, sep='\t')

                        if flag1 in fusion:
                            i = len(families1) - 1
                            for g1 in families1[0]:
                                g1_name = family_names[g1]
                                for g2 in families1[i]:
                                    g2_name = family_names[g2]
                                    print(filename, r_index, "Read_2", flag1,
                                          '{}:{}'.format(g1, g1_name),
                                          '{}:{}'.format(g2, g2_name),
                                          len(families1),
                                          [len(f) for f in families1],
                                          shared_kmers1, gaps1,
                                          sorted([g1, g2]),
                                          file=fusionCalc_fp, sep='\t')

                    elif flag0 in sameRef and flag1 in sameRef:
                        if len(families0[0].intersection(families1[0])) == 0:
                            n_paired_fusion += 1

                            if flag0 == "unique" and flag1 == "unique":
                                fusion_class = "clear_fusion"
                            else:
                                fusion_class = "ambig_fusion"

                            print(filename, r_index, fusion_class, families0,
                                  families1, file=fusPairInfo_fp, sep='\t')
                            write_record(read0, fusPair_fp)
                            write_record(read1, fusPair_fp)

                            for g1 in families0[0]:
                                g1_name = family_names[g1]
                                for g2 in families1[0]:
                                    g2_name = family_names[g2]
                                    print(filename, r_index, fusion_class,
                                          '{}:{}'.format(g1, g1_name),
                                          '{}:{}'.format(g2, g2_name),
                                          [len(f) for f in
                                           (families0[0], families1[0])],
                                          sorted([g1, g2]),
                                          file=fusPairCalc_fp, sep='\t')

    print('No of input fragments: ', n)
    print('unmatched:', n_unmatched)
    print('Unique:', n_same)
    print('Ambiguous:', n_amb_same)
    print('Single read clear fusion:', n_clear_fusion)
    print('Single read ambiguous fusion:', n_ambig_fusion)
    print('Single read multi fusion:', n_multi_fusion)
    print('paired read fusion:', n_paired_fusion)
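# Why get_kmer_to_family_ids() compares the stored k-mer hash against the
# query: a minimal perfect hash function maps *any* input to some slot, so a
# lookup alone cannot prove membership. The sketch below is an illustrative
# standalone version of that guard, reusing the 'mphf' and 'mphf_to_kmer'
# names from the script above.
def mphf_contains(mphf, mphf_to_kmer, hashval):
    slot = mphf.lookup(hashval)
    if slot is None:  # bbhash can report an outright miss
        return False
    # a foreign key can still land in a valid slot; confirm the stored
    # key matches the query before trusting the slot's payload
    return mphf_to_kmer[slot] == hashval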
def main(): info("sweep-reads-buffered.py", ["sweep"]) parser = sanitize_epilog(get_parser()) args = parser.parse_args() if args.max_tablesize < MAX_HSIZE: args.max_tablesize = MAX_HSIZE if args.ksize < MIN_KSIZE: args.ksize = MIN_KSIZE report_on_config(args, graphtype="nodegraph") K = args.ksize HT_SIZE = args.max_tablesize N_HT = args.n_tables traversal_range = args.traversal_range input_fastp = args.input_fastp if not args.outdir: outdir = os.path.dirname(input_fastp) else: outdir = args.outdir max_buffers = args.max_buffers output_pref = args.output_prefix buf_size = args.buffer_size max_reads = args.max_reads check_input_files(args.input_fastp, args.force) check_valid_file_exists(args.input_files) all_input_files = [input_fastp] all_input_files.extend(args.input_files) # Check disk space availability check_space(all_input_files, args.force) # figure out input file type (FA/FQ) -- based on first file ix = iter(screed.open(args.input_files[0])) record = next(ix) del ix extension = "fa" if hasattr(record, "quality"): # fastq! extension = "fq" output_buffer = ReadBufferManager(max_buffers, max_reads, buf_size, output_pref, outdir, extension) # consume the partitioned fasta with which to label the graph ht = khmer.GraphLabels(K, HT_SIZE, N_HT) try: print("consuming input sequences...", file=sys.stderr) if args.label_by_pid: print("...labeling by partition id (pid)", file=sys.stderr) ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp) elif args.label_by_seq: print("...labeling by sequence", file=sys.stderr) for n, record in enumerate(screed.open(input_fastp)): if n % 50000 == 0: print("...consumed {n} sequences...".format(n=n), file=sys.stderr) ht.consume_sequence_and_tag_with_labels(record.sequence, n) else: print("...labeling to create groups of size {s}".format(s=args.group_size), file=sys.stderr) label = -1 g = 0 try: outfp = open("{pref}_base_{g}.{ext}".format(pref=output_pref, g=g, ext=extension), "wb") for n, record in enumerate(screed.open(input_fastp)): if n % args.group_size == 0: label += 1 if label > g: g = label outfp = open("{pref}_base_{g}.{ext}".format(pref=output_pref, g=g, ext=extension), "wb") if n % 50000 == 0: print("...consumed {n} sequences...".format(n=n), file=sys.stderr) ht.consume_sequence_and_tag_with_labels(record.sequence, label) write_record(record, outfp) except (IOError, OSError) as e: print("!! ERROR !!", e, file=sys.stderr) print("...error splitting input. exiting...", file=sys.stderr) except (IOError, OSError) as e: print("!! ERROR: !!", e, file=sys.stderr) print( "...error consuming \ {i}. exiting...".format( i=input_fastp ), file=sys.stderr, ) print( "done consuming input sequence. \ added {t} tags and {l} \ labels...".format( t=ht.graph.n_tags(), l=ht.n_labels() ) ) label_dict = defaultdict(int) label_number_dist = [] n_orphaned = 0 n_labeled = 0 n_mlabeled = 0 total_t = time.clock() start_t = time.clock() for read_file in args.input_files: print("** sweeping {read_file} for labels...".format(read_file=read_file), file=sys.stderr) file_t = 0.0 try: read_fp = screed.open(read_file) except (IOError, OSError) as error: print("!! 
ERROR: !!", error, file=sys.stderr) print("*** Could not open {fn}, skipping...".format(fn=read_file), file=sys.stderr) else: for _, record in enumerate(read_fp): if _ % 50000 == 0: end_t = time.clock() batch_t = end_t - start_t file_t += batch_t print( "\tswept {n} reads [{nc} labeled, \ {no} orphaned] \ ** {sec}s ({sect}s total)".format( n=_, nc=n_labeled, no=n_orphaned, sec=batch_t, sect=file_t ), file=sys.stderr, ) start_t = time.clock() seq = record.sequence name = record.name try: labels = ht.sweep_label_neighborhood(seq, traversal_range) except ValueError as e: pass else: if hasattr(record, "quality"): seq_str = fmt_fastq(name, seq, record.quality, labels) else: seq_str = fmt_fasta(name, seq, labels) label_number_dist.append(len(labels)) if labels: n_labeled += 1 if len(labels) > 1: output_buffer.queue(seq_str, "multi") n_mlabeled += 1 label_dict["multi"] += 1 else: output_buffer.queue(seq_str, labels[0]) label_dict[labels[0]] += 1 else: n_orphaned += 1 output_buffer.queue(seq_str, "orphaned") label_dict["orphaned"] += 1 print("** End of file {fn}...".format(fn=read_file), file=sys.stderr) output_buffer.flush_all() read_fp.close() # gotta output anything left in the buffers at the end! print("** End of run...", file=sys.stderr) output_buffer.flush_all() total_t = time.clock() - total_t if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0: print("! WARNING: Sweep finished with errors !", file=sys.stderr) print("** {writee} reads not written".format(writee=output_buffer.num_write_errors), file=sys.stderr) print("** {filee} errors opening files".format(filee=output_buffer.num_file_errors), file=sys.stderr) print("swept {n_reads} for labels...".format(n_reads=n_labeled + n_orphaned), file=sys.stderr) print("...with {nc} labeled and {no} orphaned".format(nc=n_labeled, no=n_orphaned), file=sys.stderr) print("...and {nmc} multilabeled".format(nmc=n_mlabeled), file=sys.stderr) print("** outputting label number distribution...", file=sys.stderr) fn = os.path.join(outdir, "{pref}.dist.txt".format(pref=output_pref)) with open(fn, "w", encoding="utf-8") as outfp: for nc in label_number_dist: outfp.write("{nc}\n".format(nc=nc)) fn = os.path.join(outdir, "{pref}.counts.csv".format(pref=output_pref)) print("** outputting label read counts...", file=sys.stderr) with open(fn, "w", encoding="utf-8") as outfp: for k in label_dict: outfp.write("{l},{c}\n".format(l=k, c=label_dict[k]))
def main():
    info('trim-low-abund.py', ['streaming'])
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        print("Error: Cannot input the same filename multiple times.",
              file=sys.stderr)
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \
       and not args.output:
        print("Accepting input from stdin; output filename must "
              "be provided with -o.", file=sys.stderr)
        sys.exit(1)

    if args.loadgraph:
        print('loading countgraph from', args.loadgraph, file=sys.stderr)
        ct = khmer.load_countgraph(args.loadgraph)
    else:
        print('making countgraph', file=sys.stderr)
        ct = khmer_args.create_countgraph(args)

    K = ct.ksize()
    CUTOFF = args.cutoff
    NORMALIZE_LIMIT = args.normalize_to

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print('created temporary directory %s; '
          'use -T to change location' % tempdir, file=sys.stderr)

    # ### FIRST PASS ###

    save_pass2_total = 0

    n_bp = 0
    n_reads = 0
    written_bp = 0
    written_reads = 0
    trimmed_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        if args.output is None:
            trimfp = get_file_writer(open(os.path.basename(filename) +
                                          '.abundtrim', 'wb'),
                                     args.gzip, args.bzip)
        else:
            trimfp = get_file_writer(args.output, args.gzip, args.bzip)

        pass2list.append((filename, pass2filename, trimfp))

        screed_iter = screed.open(filename)
        pass2fp = open(pass2filename, 'w')

        save_pass2 = 0
        n = 0

        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=args.ignore_pairs)
        for n, is_pair, read1, read2 in paired_iter:
            if n % 10000 == 0:
                print('...', n, filename, save_pass2, n_reads, n_bp,
                      written_reads, written_bp, file=sys.stderr)

            # we want to track paired reads here, to make sure that pairs
            # are not split between first pass and second pass.

            if is_pair:
                n_reads += 2
                n_bp += len(read1.sequence) + len(read2.sequence)

                seq1 = read1.sequence.replace('N', 'A')
                seq2 = read2.sequence.replace('N', 'A')

                med1, _, _ = ct.get_median_count(seq1)
                med2, _, _ = ct.get_median_count(seq2)

                if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT:
                    ct.consume(seq1)
                    ct.consume(seq2)
                    write_record_pair(read1, read2, pass2fp)
                    save_pass2 += 2
                else:
                    _, trim_at1 = ct.trim_on_abundance(seq1, CUTOFF)
                    _, trim_at2 = ct.trim_on_abundance(seq2, CUTOFF)

                    if trim_at1 >= K:
                        read1 = trim_record(read1, trim_at1)

                    if trim_at2 >= K:
                        read2 = trim_record(read2, trim_at2)

                    if trim_at1 != len(seq1):
                        trimmed_reads += 1
                    if trim_at2 != len(seq2):
                        trimmed_reads += 1

                    write_record_pair(read1, read2, trimfp)
                    written_reads += 2
                    written_bp += trim_at1 + trim_at2
            else:
                n_reads += 1
                n_bp += len(read1.sequence)

                seq = read1.sequence.replace('N', 'A')

                med, _, _ = ct.get_median_count(seq)

                # has this portion of the graph saturated? if not,
                # consume & save => pass2.
                if med < NORMALIZE_LIMIT:
                    ct.consume(seq)
                    write_record(read1, pass2fp)
                    save_pass2 += 1
                else:  # trim!!
                    _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
                    if trim_at >= K:
                        new_read = trim_record(read1, trim_at)
                        write_record(new_read, trimfp)

                        written_reads += 1
                        written_bp += trim_at

                        if trim_at != len(read1.sequence):
                            trimmed_reads += 1

        pass2fp.close()

        print('%s: kept aside %d of %d from first pass' %
              (filename, save_pass2, n), file=sys.stderr)
        save_pass2_total += save_pass2

    # ### SECOND PASS. ###

    skipped_n = 0
    skipped_bp = 0
    for _, pass2filename, trimfp in pass2list:
        print('second pass: looking at sequences kept aside in %s' %
              pass2filename, file=sys.stderr)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned. This is in contrast
        # to the first loop.

        for n, read in enumerate(screed.open(pass2filename)):
            if n % 10000 == 0:
                print('... x 2', n, pass2filename, written_reads,
                      written_bp, file=sys.stderr)

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ct.get_median_count(seq)

            # do we retain low-abundance components unchanged?
            if med < NORMALIZE_LIMIT and args.variable_coverage:
                write_record(read, trimfp)
                written_reads += 1
                written_bp += len(read.sequence)
                skipped_n += 1
                skipped_bp += len(read.sequence)

            # otherwise, examine/trim/truncate.
            else:  # med >= NORMALIZE_LIMIT or not args.variable_coverage
                _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
                if trim_at >= K:
                    new_read = trim_record(read, trim_at)
                    write_record(new_read, trimfp)

                    written_reads += 1
                    written_bp += trim_at

                    if trim_at != len(read.sequence):
                        trimmed_reads += 1

        print('removing %s' % pass2filename, file=sys.stderr)
        os.unlink(pass2filename)

    print('removing temp directory & contents (%s)' % tempdir,
          file=sys.stderr)
    shutil.rmtree(tempdir)

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_trimmed = float(trimmed_reads +
                                  (n_reads - written_reads)) / \
        n_reads * 100.0

    print('read %d reads, %d bp' % (n_reads, n_bp,), file=sys.stderr)
    print('wrote %d reads, %d bp' % (written_reads, written_bp,),
          file=sys.stderr)
    print('looked at %d reads twice (%.2f passes)' % (save_pass2_total,
                                                      n_passes),
          file=sys.stderr)
    print('removed %d reads and trimmed %d reads (%.2f%%)' %
          (n_reads - written_reads, trimmed_reads, percent_reads_trimmed),
          file=sys.stderr)
    print('trimmed or removed %.2f%% of bases (%d total)' %
          ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp),
          file=sys.stderr)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
        print('%d reads were high coverage (%.2f%%);' %
              (n_reads - skipped_n, percent_reads_hicov), file=sys.stderr)
        print('skipped %d reads/%d bases because of low coverage' %
              (skipped_n, skipped_bp), file=sys.stderr)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate),
          file=sys.stderr)

    print('output in *.abundtrim', file=sys.stderr)

    if args.savegraph:
        print("Saving k-mer countgraph to", args.savegraph, file=sys.stderr)
        ct.save(args.savegraph)
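# Sketch of the trim_record helper assumed by both passes above (khmer.utils
# ships the real one): truncate the sequence, and the quality string if
# present, at the first low-abundance k-mer position.
def trim_record(read, trim_at):
    if hasattr(read, 'quality'):
        return screed.Record(name=read.name,
                             sequence=read.sequence[:trim_at],
                             quality=read.quality[:trim_at])
    return screed.Record(name=read.name, sequence=read.sequence[:trim_at])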
def main():
    info('sweep-reads-buffered.py', ['sweep'])
    parser = get_parser()
    args = parser.parse_args()

    if args.max_tablesize < MAX_HSIZE:
        args.max_tablesize = MAX_HSIZE
    if args.ksize < MIN_KSIZE:
        args.ksize = MIN_KSIZE

    report_on_config(args, hashtype='nodegraph')

    K = args.ksize
    HT_SIZE = args.max_tablesize
    N_HT = args.n_tables

    traversal_range = args.traversal_range
    input_fastp = args.input_fastp

    if not args.outdir:
        outdir = os.path.dirname(input_fastp)
    else:
        outdir = args.outdir

    max_buffers = args.max_buffers
    output_pref = args.output_prefix
    buf_size = args.buffer_size
    max_reads = args.max_reads

    check_input_files(args.input_fastp, args.force)
    check_valid_file_exists(args.input_files)
    all_input_files = [input_fastp]
    all_input_files.extend(args.input_files)

    # Check disk space availability
    check_space(all_input_files, args.force)

    # figure out input file type (FA/FQ) -- based on first file
    ix = iter(screed.open(args.input_files[0]))
    record = next(ix)
    del ix

    extension = 'fa'
    if hasattr(record, 'quality'):  # fastq!
        extension = 'fq'

    output_buffer = ReadBufferManager(max_buffers, max_reads, buf_size,
                                      output_pref, outdir, extension)

    # consume the partitioned fasta with which to label the graph
    ht = khmer.LabelHash(K, HT_SIZE, N_HT)

    try:
        print('consuming input sequences...', file=sys.stderr)
        if args.label_by_pid:
            print('...labeling by partition id (pid)', file=sys.stderr)
            ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp)
        elif args.label_by_seq:
            print('...labeling by sequence', file=sys.stderr)
            for n, record in enumerate(screed.open(input_fastp)):
                if n % 50000 == 0:
                    print('...consumed {n} sequences...'.format(n=n),
                          file=sys.stderr)
                ht.consume_sequence_and_tag_with_labels(record.sequence, n)
        else:
            print('...labeling to create groups of size {s}'.format(
                s=args.group_size), file=sys.stderr)
            label = -1
            g = 0
            try:
                outfp = open('{pref}_base_{g}.{ext}'.format(
                    pref=output_pref, g=g, ext=extension), 'wb')
                for n, record in enumerate(screed.open(input_fastp)):
                    if n % args.group_size == 0:
                        label += 1
                        if label > g:
                            g = label
                            outfp = open('{pref}_base_{g}.{ext}'.format(
                                pref=output_pref, g=g, ext=extension), 'wb')
                    if n % 50000 == 0:
                        print('...consumed {n} sequences...'.format(n=n),
                              file=sys.stderr)
                    ht.consume_sequence_and_tag_with_labels(record.sequence,
                                                            label)
                    write_record(record, outfp)
            except (IOError, OSError) as e:
                print('!! ERROR !!', e, file=sys.stderr)
                print('...error splitting input. exiting...',
                      file=sys.stderr)
    except (IOError, OSError) as e:
        print('!! ERROR: !!', e, file=sys.stderr)
        print('...error consuming {i}. exiting...'.format(i=input_fastp),
              file=sys.stderr)

    print('done consuming input sequence. '
          'added {t} tags and {l} labels...'.format(t=ht.graph.n_tags(),
                                                    l=ht.n_labels()))

    label_dict = defaultdict(int)
    label_number_dist = []

    n_orphaned = 0
    n_labeled = 0
    n_mlabeled = 0

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # recommended replacement for interval timing
    total_t = time.perf_counter()
    start_t = time.perf_counter()

    for read_file in args.input_files:
        print('** sweeping {read_file} for labels...'.format(
            read_file=read_file), file=sys.stderr)

        file_t = 0.0
        try:
            read_fp = screed.open(read_file)
        except (IOError, OSError) as error:
            print('!! ERROR: !!', error, file=sys.stderr)
            print('*** Could not open {fn}, skipping...'.format(
                fn=read_file), file=sys.stderr)
        else:
            for _, record in enumerate(read_fp):
                if _ % 50000 == 0:
                    end_t = time.perf_counter()
                    batch_t = end_t - start_t
                    file_t += batch_t
                    print('\tswept {n} reads [{nc} labeled, {no} orphaned] '
                          '** {sec}s ({sect}s total)'.format(
                              n=_, nc=n_labeled, no=n_orphaned,
                              sec=batch_t, sect=file_t), file=sys.stderr)
                    start_t = time.perf_counter()
                seq = record.sequence
                name = record.name
                try:
                    labels = ht.sweep_label_neighborhood(seq, traversal_range)
                except ValueError:
                    # read too short to sweep; skip it
                    pass
                else:
                    if hasattr(record, 'quality'):
                        seq_str = fmt_fastq(name, seq, record.quality, labels)
                    else:
                        seq_str = fmt_fasta(name, seq, labels)
                    label_number_dist.append(len(labels))
                    if labels:
                        n_labeled += 1
                        if len(labels) > 1:
                            output_buffer.queue(seq_str, 'multi')
                            n_mlabeled += 1
                            label_dict['multi'] += 1
                        else:
                            output_buffer.queue(seq_str, labels[0])
                            label_dict[labels[0]] += 1
                    else:
                        n_orphaned += 1
                        output_buffer.queue(seq_str, 'orphaned')
                        label_dict['orphaned'] += 1
            print('** End of file {fn}...'.format(fn=read_file),
                  file=sys.stderr)
            output_buffer.flush_all()
            read_fp.close()

    # gotta output anything left in the buffers at the end!
    print('** End of run...', file=sys.stderr)
    output_buffer.flush_all()
    total_t = time.perf_counter() - total_t

    if output_buffer.num_write_errors > 0 or \
       output_buffer.num_file_errors > 0:
        print('! WARNING: Sweep finished with errors !', file=sys.stderr)
        print('** {writee} reads not written'.format(
            writee=output_buffer.num_write_errors), file=sys.stderr)
        print('** {filee} errors opening files'.format(
            filee=output_buffer.num_file_errors), file=sys.stderr)

    print('swept {n_reads} for labels...'.format(
        n_reads=n_labeled + n_orphaned), file=sys.stderr)
    print('...with {nc} labeled and {no} orphaned'.format(
        nc=n_labeled, no=n_orphaned), file=sys.stderr)
    print('...and {nmc} multilabeled'.format(nmc=n_mlabeled),
          file=sys.stderr)

    print('** outputting label number distribution...', file=sys.stderr)
    fn = os.path.join(outdir, '{pref}.dist.txt'.format(pref=output_pref))
    with open(fn, 'w', encoding='utf-8') as outfp:
        for nc in label_number_dist:
            outfp.write('{nc}\n'.format(nc=nc))

    fn = os.path.join(outdir, '{pref}.counts.csv'.format(pref=output_pref))
    print('** outputting label read counts...', file=sys.stderr)
    with open(fn, 'w', encoding='utf-8') as outfp:
        for k in label_dict:
            outfp.write('{l},{c}\n'.format(l=k, c=label_dict[k]))
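# Hedged sketch of the ReadBufferManager contract both sweep scripts depend
# on (the real class is defined elsewhere): queue() accumulates formatted
# records per label, a buffer is flushed to '<prefix>_<label>.<ext>' in
# outdir once it reaches buf_size records, and flush_all() drains every
# remaining buffer. Class and attribute names here are illustrative only.
class BufferManagerSketch(object):

    def __init__(self, buf_size, output_pref, outdir, extension):
        self.buf_size = buf_size
        self.output_pref = output_pref
        self.outdir = outdir
        self.extension = extension
        self.buffers = defaultdict(list)

    def queue(self, seq_str, label):
        self.buffers[label].append(seq_str)
        if len(self.buffers[label]) >= self.buf_size:
            self._flush(label)

    def _flush(self, label):
        fn = os.path.join(self.outdir, '{pref}_{label}.{ext}'.format(
            pref=self.output_pref, label=label, ext=self.extension))
        with open(fn, 'a') as fp:  # append: a label may flush repeatedly
            fp.write(''.join(self.buffers[label]))
        self.buffers[label] = []

    def flush_all(self):
        for label in list(self.buffers):
            self._flush(label)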