def main():
    args = sanitize_help(get_parser()).parse_args()

    htfile = args.countgraph
    input_filename = args.input
    output = args.output

    infiles = [htfile, input_filename]
    for infile in infiles:
        check_input_files(infile, args.force)

    check_space(infiles, args.force)

    print('loading k-mer countgraph from', htfile, file=sys.stderr)
    countgraph = Countgraph.load(htfile)
    ksize = countgraph.ksize()

    print('writing to', output.name, file=sys.stderr)
    output = csv.writer(output)
    # write headers:
    output.writerow(['name', 'median', 'average', 'stddev', 'seqlen'])

    for record in screed.open(input_filename):
        seq = record.sequence.upper()
        if 'N' in seq:
            seq = seq.replace('N', 'A')

        if ksize <= len(seq):
            medn, ave, stdev = countgraph.get_median_count(seq)
            ave, stdev = [round(x, 9) for x in (ave, stdev)]
            output.writerow([record.name, medn, ave, stdev, len(seq)])
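# A minimal sketch (not taken from the examples in this listing) of the
# save/load round trip that the snippets below exercise: build a Countgraph,
# count k-mers, save it, reload it with Countgraph.load(), and query it.
# The k-mer size, table parameters, file name, and sequence here are
# illustrative assumptions, not values used by the original code.
import khmer
from khmer import Countgraph

kh = khmer.Countgraph(4, 4 ** 4, 4)      # ksize=4, table size, n_tables
kh.consume('AAAATTTTCCCCGGGG')           # count every 4-mer in the sequence
kh.save('example.ct')                    # write the countgraph to disk

loaded = Countgraph.load('example.ct')   # reload; raises OSError on bad files
assert loaded.ksize() == 4
print(loaded.get('AAAA'))                # count for a single k-mer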
def test_abund_dist_gz_bigcount_compressed_first():
    infile = utils.copy_test_data('test-abund-read-2.fa')
    script = 'load-into-counting.py'
    htfile = utils.get_temp_filename('test_ct.gz')
    args = ['-x', str(1e7), '-N', str(2), '-k', str(2), htfile, infile]
    utils.runscript(script, args)  # create a bigcount table
    assert os.path.exists(htfile)
    data = gzip.open(htfile, 'rb').read()  # read compressed bigcount table

    outfile = utils.get_temp_filename('test_ct')
    f_out = open(outfile, 'wb')  # output the bigcount table
    f_out.write(data)
    f_out.close()

    # load the compressed bigcount table
    try:
        countgraph = Countgraph.load(outfile)
    except OSError as err:
        assert 0, 'Should not produce OSError: ' + str(err)

    assert countgraph.n_occupied() != 0
    hashsizes = countgraph.hashsizes()
    kmer_size = countgraph.ksize()
    tracking = khmer.Nodegraph(kmer_size, 1, 1, primes=hashsizes)
    abundances = countgraph.abundance_distribution(infile, tracking)
    # calculate abundance distribution for compressed bigcount table

    flag = False
    # check if abundance is > 255
    # if ok, gzipped bigcount was loaded correctly
    for _, i in enumerate(abundances):
        print(_, i)
        if _ > 255 and i > 0:
            flag = True
            break

    assert flag
def test_load_gz():
    inpath = utils.get_test_data('random-20-a.fa')

    savepath = utils.get_temp_filename('tempcountingsave1.ht')
    loadpath = utils.get_temp_filename('tempcountingsave1.ht.gz')

    sizes = list(PRIMES_1m)
    sizes.append(1000005)

    # save uncompressed hashtable.
    hi = khmer.Countgraph(12, 1, 1, primes=sizes)
    hi.consume_seqfile(inpath)
    hi.save(savepath)

    # compress.
    in_file = open(savepath, 'rb')
    out_file = gzip.open(loadpath, 'wb')
    out_file.writelines(in_file)
    out_file.close()
    in_file.close()

    # load compressed hashtable.
    try:
        ht = Countgraph.load(loadpath)
    except OSError as err:
        assert 0, "Should not produce an OSError: " + str(err)

    tracking = khmer.Nodegraph(12, 1, 1, primes=sizes)
    x = hi.abundance_distribution(inpath, tracking)

    tracking = khmer.Nodegraph(12, 1, 1, primes=sizes)
    y = ht.abundance_distribution(inpath, tracking)

    assert sum(x) == 3966, sum(x)
    assert x == y, (x, y)
def main():
    counting_ht = sys.argv[1]
    infiles = sys.argv[2:]

    print('file with ht: %s' % counting_ht)
    print('making hashtable')
    ht = Countgraph.load(counting_ht)
    K = ht.ksize()

    for infile in infiles:
        print('filtering', infile)
        outfile = os.path.basename(infile) + '.below'
        outfp = open(outfile, 'w')

        paired_iter = broken_paired_reader(ReadParser(infile), min_length=K,
                                           force_single=True)

        for n, is_pair, read1, read2 in paired_iter:
            name = read1.name
            seq = read1.sequence
            if 'N' in seq:
                # skip reads with ambiguous bases rather than returning
                # out of main()
                continue

            trim_seq, trim_at = ht.trim_below_abundance(seq, CUTOFF)

            if trim_at >= K:
                write_record(screed.Record(name=name, sequence=trim_seq),
                             outfp)
def test_load_into_counting_1():
    in1 = utils.get_test_data('test-abund-read-2.fa')
    out1 = utils.get_temp_filename('out.ct')

    cmd = """
    cat {in1} | {scripts}/load-into-counting.py -x 1e3 -N 2 -k 20 {out1} - \
    2> /dev/null
    """
    cmd = cmd.format(scripts=scriptpath(), in1=in1, out1=out1)
    print(cmd)

    run_shell_cmd(cmd)
    assert os.path.exists(out1)
    Countgraph.load(out1)
def test_badload():
    try:
        countgraph = Countgraph.load()
        assert 0, "this should fail"
    except TypeError as err:
        print(str(err))
def main():
    info('count-kmers.py', ['counting'])
    args = get_parser().parse_args()

    print('hashtable from', args.input_count_graph_filename,
          file=sys.stderr)
    countgraph = Countgraph.load(
        args.input_count_graph_filename)

    kmer_size = countgraph.ksize()
    hashsizes = countgraph.hashsizes()
    tracking = khmer.Nodegraph(  # pylint: disable=protected-access
        kmer_size, 1, 1, primes=hashsizes)

    if args.output_file is None:
        args.output_file = sys.stdout
    writer = csv.writer(args.output_file)

    for filename in args.input_sequence_filenames:
        for record in screed.open(filename):
            seq = record.sequence.replace('N', 'A')
            for i in range(len(seq) - kmer_size + 1):
                kmer = seq[i:i + kmer_size]
                if not tracking.get(kmer):
                    tracking.count(kmer)
                    writer.writerow([kmer, str(countgraph.get(kmer))])

    print('Total number of unique k-mers: {0}'.format(
        countgraph.n_unique_kmers()), file=sys.stderr)
def test_counting_gz_file_type_check():
    inpath = utils.get_test_data('goodversion-k12.ht.gz')

    try:
        kh = Countgraph.load(inpath)
        assert 0, "this should fail"
    except OSError as e:
        print(str(e))
def test_load_gz_notexist_should_fail():
    savepath = utils.get_temp_filename('tempcountingsave0.ht.gz')

    try:
        hi = Countgraph.load(savepath)
        assert 0, "load should fail"
    except OSError as e:
        print(str(e))
def test_load_notexist_should_fail():
    savepath = utils.get_temp_filename('tempnodegraphsave0.htable')

    try:
        hi = Countgraph.load(savepath)
        assert 0, "load should fail"
    except OSError:
        pass
def test_load_truncated():
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('save.ht')
    truncpath = utils.get_temp_filename('trunc.ht')

    hi = khmer.Countgraph(12, 200, 3)
    hi.consume_seqfile(inpath)
    hi.save(savepath)

    data = open(savepath, 'rb').read()
    for i in range(len(data)):
        fp = open(truncpath, 'wb')
        fp.write(data[:i])
        fp.close()

        try:
            Countgraph.load(truncpath)
            assert 0, "this should not be reached!"
        except OSError as err:
            print(str(err))
def main():
    parser = khmer_args.build_counting_args(
        "Correct reads against an already-computed table",
        citations=['counting', 'SeqAn'])
    parser.add_argument("--trusted-cov", dest="trusted_cov", type=int,
                        default=DEFAULT_CUTOFF)
    parser.add_argument("--theta", dest="bits_theta", type=float,
                        default=1.0)
    parser.add_argument('-o', '--output', dest='output_file',
                        help="output file for histogram; defaults to "
                             "<first filename>.corr in cwd.",
                        type=khFileType('w'), default=None)
    parser.add_argument('counts_table')
    parser.add_argument('readfile')

    args = parser.parse_args()

    print('loading counts')
    ht = Countgraph.load(args.counts_table)

    aligner = khmer.ReadAligner(ht, args.trusted_cov, args.bits_theta)

    print("trusted:", args.trusted_cov)

    corrfp = args.output_file
    if not corrfp:
        outfile = os.path.basename(args.readfile) + '.corr'
        corrfp = open(outfile, 'w')

    n_corrected = 0
    for n, read in enumerate(screed.open(args.readfile)):
        if n % 10000 == 0:
            print('...', n, n_corrected, file=sys.stderr)
        seq = read.sequence.replace('N', 'A')

        # build the alignment...
        score, graph_alignment, read_alignment, truncated = \
            aligner.align(seq)

        if not truncated:
            graph_seq = graph_alignment.replace("-", "")
            if graph_seq != seq:
                n_corrected += 1
                seq = graph_seq

        corrfp.write(output_single(read, seq))
def test_save_load_large(ctfile):
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename(ctfile)

    orig = khmer.Countgraph(12, 2**31, 1)
    orig.consume_seqfile(inpath)
    orig.save(savepath)

    loaded = Countgraph.load(savepath)

    orig_count = orig.n_occupied()
    loaded_count = loaded.n_occupied()
    assert orig_count == 3966, orig_count
    assert loaded_count == orig_count, loaded_count
def test_save_load_occupied(ctfile):
    print('working with', ctfile)
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename(ctfile)

    orig = khmer.Countgraph(12, 1e5, 4)
    orig.consume_seqfile(inpath)
    orig.save(savepath)

    loaded = Countgraph.load(savepath)

    orig_count = orig.n_occupied()
    loaded_count = loaded.n_occupied()
    assert orig_count == 3886, orig_count
    assert loaded_count == orig_count, loaded_count
def test_maxcount_with_bigcount_save():
    # hashtable should not saturate, if use_bigcount is set.
    kh = khmer.Countgraph(4, 4 ** 4, 4)
    kh.set_use_bigcount(True)

    for _ in range(0, 1000):
        kh.count('AAAA')
        c = kh.get('AAAA')

    savepath = utils.get_temp_filename('tempcountingsave.ht')
    kh.save(savepath)

    try:
        kh = Countgraph.load(savepath)
    except OSError as err:
        assert 0, "Should not produce an OSError: " + str(err)

    c = kh.get('AAAA')
    assert c == 1000, "should be able to count to 1000: %d" % c
    assert c != MAX_COUNT, c
def test_load_gz_truncated_should_fail():
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('tempcountingsave0.ht.gz')

    hi = khmer.Countgraph(12, 1000, 2)
    hi.consume_seqfile(inpath)
    hi.save(savepath)

    fp = open(savepath, 'rb')
    data = fp.read()
    fp.close()

    fp = open(savepath, 'wb')
    fp.write(data[:1000])
    fp.close()

    try:
        hi = Countgraph.load(savepath)
        assert 0, "load should fail"
    except OSError as e:
        print(str(e))
def test_nobigcount_save():
    kh = khmer.Countgraph(4, 4 ** 4, 4)
    # kh.set_use_bigcount(False) <-- this is the default

    savepath = utils.get_temp_filename('tempcountingsave.ht')
    kh.save(savepath)

    try:
        kh = Countgraph.load(savepath)
    except OSError as err:
        assert 0, 'Should not produce an OSError: ' + str(err)

    # set_use_bigcount should still be False after load (i.e. should be saved)

    assert kh.get('AAAA') == 0

    for _ in range(0, 1000):
        kh.count('AAAA')
        kh.get('AAAA')

    assert kh.get('AAAA') == MAX_COUNT
def test_bigcount_save():
    # hashtable should not saturate, if use_bigcount is set.
    kh = khmer.Countgraph(4, 4 ** 4, 4)
    kh.set_use_bigcount(True)

    savepath = utils.get_temp_filename('tempcountingsave.ht')
    kh.save(savepath)

    try:
        kh = Countgraph.load(savepath)
    except OSError as err:
        assert 0, "Should not produce an OSError: " + str(err)

    # set_use_bigcount should still be True after load (i.e. should be saved)

    assert kh.get('AAAA') == 0

    for _ in range(0, 1000):
        kh.count('AAAA')
        kh.get('AAAA')

    assert kh.get('AAAA') == 1000
def test_save_load_gz():
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('tempcountingsave2.ht.gz')

    sizes = list(PRIMES_1m)
    sizes.append(1000005)

    hi = khmer.Countgraph(12, 1, 1, primes=sizes)
    hi.consume_seqfile(inpath)
    hi.save(savepath)

    try:
        ht = Countgraph.load(savepath)
    except OSError as err:
        assert 0, 'Should not produce an OSError: ' + str(err)

    tracking = khmer.Nodegraph(12, 1, 1, primes=sizes)
    x = hi.abundance_distribution(inpath, tracking)

    tracking = khmer.Nodegraph(12, 1, 1, primes=sizes)
    y = ht.abundance_distribution(inpath, tracking)

    assert sum(x) == 3966, sum(x)
    assert x == y, (x, y)
def main():  # pylint: disable=too-many-branches,too-many-statements
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    configure_logging(args.quiet)
    report_on_config(args)

    report_fp = args.report
    force_single = args.force_single

    # check for similar filenames
    # if we're using a single output file only check for identical filenames
    # otherwise, check for identical BASE names as well.
    filenames = []
    basenames = []
    for pathfilename in args.input_filenames:
        filenames.append(pathfilename)
        if args.single_output_file:
            continue  # nothing more to worry about

        basename = os.path.basename(pathfilename)
        if basename in basenames:
            log_error('ERROR: Duplicate filename--Cannot handle this!')
            log_error('** Exiting!')
            sys.exit(1)

        basenames.append(basename)

    # check that files exist and there is sufficient output disk space.
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph is not None:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    # load or create counting table.
    if args.loadgraph:
        log_info('loading k-mer countgraph from {graph}',
                 graph=args.loadgraph)
        countgraph = Countgraph.load(args.loadgraph)
    else:
        log_info('making countgraph')
        countgraph = khmer_args.create_countgraph(args)

    # create an object to handle diginorm of all files
    norm = Normalizer(args.cutoff, countgraph)
    with_diagnostics = WithDiagnostics(norm, report_fp, args.report_frequency)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = []
    for element in filenames:
        files.append([element, args.paired])
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    corrupt_files = []
    outfp = None
    output_name = None

    if args.single_output_file:
        outfp = get_file_writer(args.single_output_file, args.gzip,
                                args.bzip)
    else:
        if '-' in filenames or '/dev/stdin' in filenames:
            print("Accepting input from stdin; output filename must "
                  "be provided with '-o'.", file=sys.stderr)
            sys.exit(1)

    #
    # main loop: iterate over all files given, do diginorm.
    #

    for filename, require_paired in files:
        if not args.single_output_file:
            output_name = os.path.basename(filename) + '.keep'
            outfp = open(output_name, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        # failsafe context manager in case an input file breaks
        with catch_io_errors(filename, outfp, args.single_output_file,
                             args.force, corrupt_files):
            screed_iter = clean_input_reads(screed.open(filename))
            reader = broken_paired_reader(screed_iter, min_length=args.ksize,
                                          force_single=force_single,
                                          require_paired=require_paired)

            # actually do diginorm
            for record in with_diagnostics(reader, filename):
                if record is not None:
                    write_record(record, outfp)

            log_info('output in {name}', name=describe_file_handle(outfp))
            if not args.single_output_file:
                outfp.close()

    # finished - print out some diagnostics.
    log_info('Total number of unique k-mers: {umers}',
             umers=countgraph.n_unique_kmers())

    if args.savegraph is not None:
        log_info('...saving to {name}', name=args.savegraph)
        countgraph.save(args.savegraph)

    fp_rate = \
        khmer.calc_expected_collisions(countgraph, False, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    if args.force and len(corrupt_files) > 0:
        log_error("** WARNING: Finished with errors!")
        log_error("** I/O Errors occurred in the following files:")
        log_error("\t" + " ".join(corrupt_files))
def main():
    info('correct-reads.py', ['streaming'])
    args = sanitize_help(get_parser()).parse_args()

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        print("Error: Cannot input the same filename multiple times.",
              file=sys.stderr)
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    tablesize = calculate_graphsize(args, 'countgraph')

    if args.savegraph:
        check_space_for_graph(args.savegraph, tablesize, args.force)

    K = args.ksize

    CUTOFF = args.cutoff
    NORMALIZE_LIMIT = args.normalize_to

    if args.loadgraph:
        print('loading k-mer countgraph from', args.loadgraph,
              file=sys.stderr)
        ct = Countgraph.load(args.loadgraph)
    else:
        print('making k-mer countgraph', file=sys.stderr)
        ct = create_countgraph(args, multiplier=8 / (9. + 0.3))

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print('created temporary directory %s; use -T to change location' %
          tempdir, file=sys.stderr)

    aligner = khmer.ReadAligner(ct, args.cutoff, args.bits_theta)

    # ### FIRST PASS ###

    save_pass2_total = 0

    n_bp = 0
    n_reads = 0
    written_bp = 0
    written_reads = 0
    corrected_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        if args.out is None:
            corrfp = open(os.path.basename(filename) + '.corr', 'w')
        else:
            corrfp = args.out

        pass2list.append((filename, pass2filename, corrfp))

        screed_iter = screed.open(filename, parse_description=False)
        pass2fp = open(pass2filename, 'w')

        save_pass2 = 0
        n = 0

        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=args.ignore_pairs)
        for n, is_pair, read1, read2 in paired_iter:
            if n % 10000 == 0:
                print('...', n, filename, save_pass2, n_reads, n_bp,
                      written_reads, written_bp, file=sys.stderr)

            # we want to track paired reads here, to make sure that pairs
            # are not split between first pass and second pass.

            if is_pair:
                n_reads += 2
                n_bp += len(read1.sequence) + len(read2.sequence)

                seq1 = read1.sequence.replace('N', 'A')
                seq2 = read2.sequence.replace('N', 'A')

                med1, _, _ = ct.get_median_count(seq1)
                med2, _, _ = ct.get_median_count(seq2)

                if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT:
                    ct.consume(seq1)
                    ct.consume(seq2)
                    write_record_pair(read1, read2, pass2fp)
                    save_pass2 += 2
                else:
                    is_aligned, new_seq1 = correct_sequence(aligner, seq1)
                    if is_aligned:
                        if new_seq1 != read1.sequence:
                            corrected_reads += 1
                        read1.sequence = new_seq1
                        if hasattr(read1, 'quality'):
                            fix_quality(read1)

                    is_aligned, new_seq2 = correct_sequence(aligner, seq2)
                    if is_aligned:
                        if new_seq2 != read2.sequence:
                            corrected_reads += 1
                        read2.sequence = new_seq2
                        if hasattr(read2, 'quality'):
                            fix_quality(read2)

                    write_record_pair(read1, read2, corrfp)
                    written_reads += 2
                    written_bp += len(read1)
                    written_bp += len(read2)
            else:
                n_reads += 1
                n_bp += len(read1.sequence)

                seq = read1.sequence.replace('N', 'A')

                med, _, _ = ct.get_median_count(seq)

                # has this portion of the graph saturated? if not,
                # consume & save => pass2.
                if med < NORMALIZE_LIMIT:
                    ct.consume(seq)
                    write_record(read1, pass2fp)
                    save_pass2 += 1
                else:                       # trim!!
                    is_aligned, new_seq = correct_sequence(aligner, seq)
                    if is_aligned:
                        if new_seq != read1.sequence:
                            corrected_reads += 1
                        read1.sequence = new_seq
                        if hasattr(read1, 'quality'):
                            fix_quality(read1)

                        write_record(read1, corrfp)

                        written_reads += 1
                        written_bp += len(new_seq)

        pass2fp.close()

        print('%s: kept aside %d of %d from first pass, in %s' %
              (filename, save_pass2, n, filename),
              file=sys.stderr)
        save_pass2_total += save_pass2

    # ### SECOND PASS. ###

    skipped_n = 0
    skipped_bp = 0
    for _, pass2filename, corrfp in pass2list:
        print('second pass: looking at sequences kept aside in %s' %
              pass2filename, file=sys.stderr)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.

        for n, read in enumerate(
                screed.open(pass2filename, parse_description=False)):
            if n % 10000 == 0:
                print('... x 2', n, pass2filename,
                      written_reads, written_bp, file=sys.stderr)

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ct.get_median_count(seq)

            # do we retain low-abundance components unchanged?
            if med < NORMALIZE_LIMIT and args.variable_coverage:
                write_record(read, corrfp)

                written_reads += 1
                written_bp += len(read.sequence)
                skipped_n += 1
                skipped_bp += len(read.sequence)

            # otherwise, examine/correct.
            else:    # med >= NORMALIZE LIMIT or not args.variable_coverage
                is_aligned, new_seq = correct_sequence(aligner, seq)
                if is_aligned:
                    if new_seq != read.sequence:
                        corrected_reads += 1
                    read.sequence = new_seq
                    if hasattr(read, 'quality'):
                        fix_quality(read)
                    write_record(read, corrfp)

                    written_reads += 1
                    written_bp += len(new_seq)

        print('removing %s' % pass2filename, file=sys.stderr)
        os.unlink(pass2filename)

    print('removing temp directory & contents (%s)' % tempdir,
          file=sys.stderr)
    shutil.rmtree(tempdir)

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_corrected = float(corrected_reads +
                                    (n_reads - written_reads)) /\
        n_reads * 100.0

    print('read %d reads, %d bp' % (n_reads, n_bp,), file=sys.stderr)
    print('wrote %d reads, %d bp' % (written_reads, written_bp,),
          file=sys.stderr)
    print('looked at %d reads twice (%.2f passes)' %
          (save_pass2_total, n_passes), file=sys.stderr)
    print('removed %d reads and corrected %d reads (%.2f%%)' %
          (n_reads - written_reads, corrected_reads,
           percent_reads_corrected), file=sys.stderr)
    print('removed %.2f%% of bases (%d total)' %
          ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp),
          file=sys.stderr)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
        print('%d reads were high coverage (%.2f%%);' %
              (n_reads - skipped_n, percent_reads_hicov), file=sys.stderr)
        print('skipped %d reads/%d bases because of low coverage' %
              (skipped_n, skipped_bp), file=sys.stderr)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate),
          file=sys.stderr)

    print('output in *.corr', file=sys.stderr)

    if args.savegraph:
        print("Saving k-mer countgraph to", args.savegraph, file=sys.stderr)
        ct.save(args.savegraph)
def main():
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    configure_logging(args.quiet)

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        log_error("Error: Cannot input the same filename multiple times.")
        sys.exit(1)

    if args.trim_at_coverage != DEFAULT_TRIM_AT_COVERAGE and \
       not args.variable_coverage:
        log_error("Error: --trim-at-coverage/-Z given, but "
                  "--variable-coverage/-V not specified.")
        sys.exit(1)

    if args.diginorm_coverage != DEFAULT_DIGINORM_COVERAGE and \
       not args.diginorm:
        log_error("Error: --diginorm-coverage given, but "
                  "--diginorm not specified.")
        sys.exit(1)

    if args.diginorm and args.single_pass:
        log_error("Error: --diginorm and --single-pass are incompatible!\n"
                  "You probably want to use normalize-by-median.py instead.")
        sys.exit(1)

    ###

    graphtype = 'countgraph' if not args.small_count else 'smallcountgraph'
    report_on_config(args, graphtype=graphtype)

    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph:
        graphsize = calculate_graphsize(args, graphtype)
        check_space_for_graph(args.savegraph, graphsize, args.force)

    if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \
       and not args.output:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    if args.loadgraph:
        log_info('loading countgraph from {graph}', graph=args.loadgraph)
        if args.small_count:
            ct = SmallCountgraph.load(args.loadgraph)
        else:
            ct = Countgraph.load(args.loadgraph)
    else:
        log_info('making countgraph')
        ct = khmer_args.create_countgraph(args)

    K = ct.ksize()

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    log_info('created temporary directory {temp};\n'
             'use -T to change location', temp=tempdir)

    trimmer = Trimmer(ct, not args.variable_coverage, args.cutoff,
                      args.trim_at_coverage)
    if args.diginorm:
        trimmer.set_diginorm(args.diginorm_coverage)

    # ### FIRST PASS ###

    save_pass2_total = 0

    written_bp = 0
    written_reads = 0

    # only create the file writer once if outfp is specified; otherwise,
    # create it for each file.
    if args.output:
        trimfp = get_file_writer(args.output, args.gzip, args.bzip)

    pass2list = []
    for filename in args.input_filenames:
        # figure out temporary filename for 2nd pass
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        pass2fp = open(pass2filename, 'w')

        # construct output filenames
        if args.output is None:
            # note: this will be saved in trimfp.
            outfp = open(os.path.basename(filename) + '.abundtrim', 'wb')

            # get file handle w/gzip, bzip
            trimfp = get_file_writer(outfp, args.gzip, args.bzip)

        # record all this info
        pass2list.append((filename, pass2filename, trimfp))

        # input file stuff: get a broken_paired reader.
        paired_iter = broken_paired_reader(ReadParser(filename), min_length=K,
                                           force_single=args.ignore_pairs)

        # main loop through the file.
        n_start = trimmer.n_reads
        save_start = trimmer.n_saved

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass1(paired_iter, pass2fp):
            if (trimmer.n_reads - n_start) > watermark:
                log_info("... {filename} {n_saved} {n_reads} {n_bp} "
                         "{w_reads} {w_bp}", filename=filename,
                         n_saved=trimmer.n_saved, n_reads=trimmer.n_reads,
                         n_bp=trimmer.n_bp, w_reads=written_reads,
                         w_bp=written_bp)
                watermark += REPORT_EVERY_N_READS

            # write out the trimmed/etc sequences that AREN'T going to be
            # revisited in a 2nd pass.
            write_record(read, trimfp)
            written_bp += len(read)
            written_reads += 1
        pass2fp.close()

        log_info("{filename}: kept aside {kept} of {total} from first pass",
                 filename=filename, kept=trimmer.n_saved - save_start,
                 total=trimmer.n_reads - n_start)

    # first pass goes across all the data, so record relevant stats...
    n_reads = trimmer.n_reads
    n_bp = trimmer.n_bp
    n_skipped = trimmer.n_skipped
    bp_skipped = trimmer.bp_skipped
    save_pass2_total = trimmer.n_saved

    # ### SECOND PASS. ###

    # nothing should have been skipped yet!
    assert trimmer.n_skipped == 0
    assert trimmer.bp_skipped == 0

    if args.single_pass:
        pass2list = []

    # go back through all the files again.
    for _, pass2filename, trimfp in pass2list:
        log_info('second pass: looking at sequences kept aside in {pass2}',
                 pass2=pass2filename)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.  Hence, force_single=True below.

        read_parser = ReadParser(pass2filename)
        paired_iter = broken_paired_reader(read_parser,
                                           min_length=K,
                                           force_single=True)

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass2(paired_iter):
            if (trimmer.n_reads - n_start) > watermark:
                log_info('... x 2 {a} {b} {c} {d} {e} {f} {g}',
                         a=trimmer.n_reads - n_start,
                         b=pass2filename, c=trimmer.n_saved,
                         d=trimmer.n_reads, e=trimmer.n_bp,
                         f=written_reads, g=written_bp)
                watermark += REPORT_EVERY_N_READS

            write_record(read, trimfp)
            written_reads += 1
            written_bp += len(read)

        read_parser.close()

        log_info('removing {pass2}', pass2=pass2filename)
        os.unlink(pass2filename)

        # if we created our own trimfps, close 'em.
        if not args.output:
            trimfp.close()

    try:
        log_info('removing temp directory & contents ({temp})', temp=tempdir)
        shutil.rmtree(tempdir)
    except OSError as oe:
        log_info('WARNING: unable to remove {temp} (probably an NFS issue); '
                 'please remove manually', temp=tempdir)

    trimmed_reads = trimmer.trimmed_reads

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\
        n_reads * 100.0

    log_info('read {read} reads, {bp} bp', read=n_reads, bp=n_bp)
    log_info('wrote {wr} reads, {wbp} bp', wr=written_reads, wbp=written_bp)
    log_info('looked at {st} reads twice ({np:.2f} passes)',
             st=save_pass2_total, np=n_passes)
    log_info('removed {r} reads and trimmed {t} reads ({p:.2f}%)',
             r=n_reads - written_reads, t=trimmed_reads,
             p=percent_reads_trimmed)
    log_info('trimmed or removed {p:.2f}%% of bases ({bp} total)',
             p=(1 - (written_bp / float(n_bp))) * 100.0,
             bp=n_bp - written_bp)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - n_skipped) / n_reads
        log_info('{n} reads were high coverage ({p:.2f}%);',
                 n=n_reads - n_skipped, p=percent_reads_hicov)
        log_info('skipped {r} reads/{bp} bases because of low coverage',
                 r=n_skipped, bp=bp_skipped)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    if args.output is None:
        log_info('output in *.abundtrim')
    elif args.output.name == 1:
        log_info('output streamed to stdout')
    elif args.output.name:
        log_info('output in {}'.format(args.output.name))

    if args.savegraph:
        log_info("Saving k-mer countgraph to {graph}", graph=args.savegraph)
        ct.save(args.savegraph)

    if args.summary_info is not None:
        # note that when streaming to stdout the name of args.output will
        # be set to 1
        if args.output is not None and args.output.name != 1:
            base = args.output.name
        # no explicit name or stdout stream -> use a default name
        else:
            base = 'trim-low-abund-{}'.format(
                time.strftime("%Y-%m-%dT%H:%M:%S"))

        info = {'fpr': fp_rate,
                'reads': n_reads,
                'basepairs': n_bp,
                'reads_written': written_reads,
                'basepairs_written': written_bp,
                'reads_skipped': n_skipped,
                'basepairs_skipped': bp_skipped,
                'reads_removed': n_reads - written_reads,
                'reads_trimmed': trimmed_reads,
                'basepairs_removed_or_trimmed': n_bp - written_bp
                }
        store_provenance_info(info, fname=base, format=args.summary_info)
def main():
    args = sanitize_help(get_parser()).parse_args()

    configure_logging(args.quiet)

    infiles = [args.input_count_graph_filename,
               args.input_sequence_filename]
    for infile in infiles:
        check_input_files(infile, False)

    log_info('Loading counting graph from {graph}',
             graph=args.input_count_graph_filename)
    countgraph = Countgraph.load(args.input_count_graph_filename)

    if not countgraph.get_use_bigcount() and args.bigcount:
        log_warn("WARNING: The loaded graph has bigcount DISABLED while "
                 "bigcount reporting is ENABLED--counts higher than 255 will "
                 "not be reported.")

    countgraph.set_use_bigcount(args.bigcount)

    kmer_size = countgraph.ksize()
    hashsizes = countgraph.hashsizes()
    tracking = khmer.Nodegraph(  # pylint: disable=protected-access
        kmer_size, 1, 1, primes=hashsizes)

    log_info('K: {ksize}', ksize=kmer_size)
    log_info('outputting to {output}', output=args.output_histogram_filename)

    if args.output_histogram_filename in ('-', '/dev/stdout'):
        pass
    elif os.path.exists(args.output_histogram_filename):
        if not args.squash_output:
            log_error('ERROR: {output} exists; not squashing.',
                      output=args.output_histogram_filename)
            sys.exit(1)

        log_info('** squashing existing file {output}',
                 output=args.output_histogram_filename)

    log_info('preparing hist...')
    abundances = countgraph.abundance_distribution(
        args.input_sequence_filename, tracking)
    total = sum(abundances)

    if 0 == total:
        log_error("ERROR: abundance distribution is uniformly zero; "
                  "nothing to report.")
        log_error("\tPlease verify that the input files are valid.")
        sys.exit(1)

    if args.output_histogram_filename in ('-', '/dev/stdout'):
        countgraph_fp = sys.stdout
    else:
        countgraph_fp = open(args.output_histogram_filename, 'w')
    countgraph_fp_csv = csv.writer(countgraph_fp)
    # write headers:
    countgraph_fp_csv.writerow(
        ['abundance', 'count', 'cumulative', 'cumulative_fraction'])

    sofar = 0
    for _, i in enumerate(abundances):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        countgraph_fp_csv.writerow([_, i, sofar, round(frac, 3)])

        if sofar == total:
            break
def test_load_empty_files(ext):
    # Check empty files, compressed or not
    fname = utils.get_test_data('empty-file' + ext)
    with pytest.raises(OSError):
        Countgraph.load(fname)
def main():  # pylint: disable=too-many-branches,too-many-statements
    start_time = time.time()
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    configure_logging(args.quiet)
    report_on_config(args)

    report_fp = args.report
    force_single = args.force_single

    # check for similar filenames
    # if we're using a single output file only check for identical filenames
    # otherwise, check for identical BASE names as well.
    filenames = []
    basenames = []
    for pathfilename in args.input_filenames:
        filenames.append(pathfilename)
        if args.single_output_file:
            continue  # nothing more to worry about

        basename = os.path.basename(pathfilename)
        if basename in basenames:
            log_error('ERROR: Duplicate filename--Cannot handle this!')
            log_error('** Exiting!')
            sys.exit(1)

        basenames.append(basename)

    # check that files exist and there is sufficient output disk space.
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph is not None:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    # load or create counting table.
    if args.loadgraph:
        log_info('loading k-mer countgraph from {graph}',
                 graph=args.loadgraph)
        countgraph1 = Countgraph.load(args.loadgraph)

    # load second counting table.
    if args.loadgraph2:
        log_info('loading k-mer countgraph from {graph}',
                 graph=args.loadgraph2)
        countgraph2 = Countgraph.load(args.loadgraph2)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = []
    for element in filenames:
        files.append([element, args.paired])
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    #
    # main loop: iterate over all files given, do diginorm.
    #

    for filename, require_paired in files:
        if not args.single_output_file:
            output_name = os.path.basename(filename) + '.keep'
            outfp = open(output_name, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        screed_iter = clean_input_reads(screed.open(filename))
        reader = broken_paired_reader(screed_iter, min_length=args.ksize,
                                      force_single=force_single,
                                      require_paired=require_paired)

        # actually do diginorm
        for _, is_paired, read0, read1 in reader:
            for record in snarf(is_paired, read0, read1, countgraph1,
                                countgraph2):
                if record is not None:
                    write_record(record, outfp)

    print("--- %s seconds ---" % (time.time() - start_time))