def test_save_load(tabletype):
    """Save a table holding one k-mer and reload it with the matching loader.

    ``tabletype`` is the table class under test; the loader used to read the
    file back is selected from it.
    """
    table = tabletype(5, PRIMES_1m)
    savefile = utils.get_temp_filename('tablesave.out')

    # test add(dna)
    table.add("ATGGC")
    assert table.get("ATGGC") == 1

    table.save(savefile)

    # should we provide a single load function here? yes, probably. @CTB
    if tabletype == _Countgraph:
        loaded = khmer.load_countgraph(savefile)
    elif tabletype == _Counttable:
        loaded = khmer.load_counttable(savefile)
    elif tabletype == _SmallCountgraph:
        loaded = khmer.load_countgraph(savefile, small=True)
    elif tabletype == _SmallCounttable:
        loaded = khmer.load_counttable(savefile, small=True)
    elif tabletype == _Nodegraph:
        loaded = khmer.load_nodegraph(savefile)
    elif tabletype == _Nodetable:
        loaded = khmer.load_nodetable(savefile)
    else:
        raise Exception("unknown tabletype")

    # the count must survive the round trip
    assert loaded.get('ATGGC') == 1
def test_load_gz():
    """Check that a gzip-compressed countgraph file loads like the original.

    The graph is saved uncompressed, gzipped by hand, loaded back, and the
    abundance distributions of the in-memory and loaded graphs are compared.
    """
    inpath = utils.get_test_data('random-20-a.fa')

    savepath = utils.get_temp_filename('tempcountingsave1.ht')
    loadpath = utils.get_temp_filename('tempcountingsave1.ht.gz')

    sizes = list(PRIMES_1m)
    sizes.append(1000005)

    # save uncompressed hashtable.
    hi = khmer._Countgraph(12, sizes)
    hi.consume_fasta(inpath)
    hi.save(savepath)

    # compress; the context managers close both handles even if the copy
    # fails part-way through.
    with open(savepath, 'rb') as in_file:
        with gzip.open(loadpath, 'wb') as out_file:
            out_file.writelines(in_file)

    # load compressed hashtable.
    try:
        ht = khmer.load_countgraph(loadpath)
    except OSError as err:
        assert 0, "Should not produce an OSError: " + str(err)

    tracking = khmer._Nodegraph(12, sizes)
    x = hi.abundance_distribution(inpath, tracking)

    # fresh tracking graph for the second pass
    tracking = khmer._Nodegraph(12, sizes)
    y = ht.abundance_distribution(inpath, tracking)

    assert sum(x) == 3966, sum(x)
    assert x == y, (x, y)
def main():
    """Write one CSV row of k-mer count statistics per input sequence.

    Each row holds the record name, the median, average and standard
    deviation returned by ``get_median_count`` (the latter two rounded to
    nine places), and the sequence length. Sequences shorter than the
    countgraph's k-mer size are skipped.
    """
    info("count-median.py", ["diginorm"])
    args = sanitize_help(get_parser()).parse_args()

    htfile = args.countgraph
    input_filename = args.input
    output = args.output

    infiles = [htfile, input_filename]
    for infile in infiles:
        check_input_files(infile, args.force)
    check_space(infiles, args.force)

    print("loading k-mer countgraph from", htfile, file=sys.stderr)
    countgraph = load_countgraph(htfile)
    ksize = countgraph.ksize()

    print("writing to", output.name, file=sys.stderr)
    writer = csv.writer(output)

    # write headers:
    writer.writerow(["name", "median", "average", "stddev", "seqlen"])

    for record in screed.open(input_filename):
        # replace() leaves the sequence untouched when it contains no "N"
        seq = record.sequence.upper().replace("N", "A")

        if len(seq) < ksize:
            continue

        medn, ave, stdev = countgraph.get_median_count(seq)
        writer.writerow(
            [record.name, medn, round(ave, 9), round(stdev, 9), len(seq)])
def main():
    """Trim each input file against a saved countgraph.

    ``sys.argv[1]`` is the countgraph file; every further argument is a
    sequence file whose trimmed reads are written to ``<basename>.below``.
    """
    counting_ht = sys.argv[1]
    infiles = sys.argv[2:]

    print('file with ht: %s' % counting_ht)
    print('-- settings:')
    print('N THREADS', WORKER_THREADS)
    print('--')

    print('making hashtable')
    ht = khmer.load_countgraph(counting_ht)
    K = ht.ksize()

    # The worker callback does not depend on the file being processed, so it
    # is defined once rather than once per input file.
    def process_fn(record, ht=ht):
        name = record['name']
        seq = record['sequence']

        # reads containing N are dropped outright
        if 'N' not in seq:
            trim_seq, trim_at = ht.trim_below_abundance(seq, CUTOFF)
            if trim_at >= K:
                return name, trim_seq

        return None, None

    for infile in infiles:
        print('filtering', infile)
        outfile = os.path.basename(infile) + '.below'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE)
        tsp.start(verbose_fasta_iter(infile), outfp)
def main():
    """Look up the count of every line of a file in a saved countgraph.

    Arguments (from ``sys.argv``): countgraph file, file with one sequence
    per line, figure name.

    Writes sequences whose count exceeds 1000 to ``<basename>.counts``, a
    cumulative table to ``<figure>.countshist``, and saves a cumulative
    histogram plot to ``figure``.
    """
    hashfile = sys.argv[1]
    filename = sys.argv[2]
    figure = sys.argv[3]

    ht = khmer.load_countgraph(hashfile)

    counts = []
    d = {}  # count -> number of sequences with that count

    # Both handles are closed on exit, so the .counts output is flushed even
    # if a lookup fails part-way through.
    with open(os.path.basename(filename) + '.counts', 'w') as outabund:
        with open(filename) as seqfp:
            for sequence in seqfp:
                sequence = sequence.strip()

                count = ht.get(sequence)
                counts.append(count)
                d[count] = d.get(count, 0) + 1

                if count > 1000:
                    print(sequence, count, file=outabund)

    with open(figure + '.countshist', 'w') as outfp:
        sofar = 0
        sofar_cumu = 0
        for k in sorted(d):
            sofar += d[k]
            sofar_cumu += k * d[k]
            print(k, d[k], sofar, sofar_cumu, file=outfp)

    hist(counts, normed=True, cumulative=True, bins=100, range=(1, 1000))
    savefig(figure)
def test_load_into_counting_1():
    """Pipe a FASTA file into load-into-counting.py and load the result."""
    in1 = utils.get_test_data("test-abund-read-2.fa")
    out1 = utils.get_temp_filename("out.ct")

    cmd = """
       cat {in1} | {scripts}/load-into-counting.py -x 1e3 -N 2 -k 20 {out1} - \
       2> /dev/null
    """.format(scripts=scriptpath(), in1=in1, out1=out1)

    print(cmd)

    # the shell result is unpacked but not inspected; success is judged by
    # the output file existing and being loadable
    _status, _out, _err = run_shell_cmd(cmd)

    assert os.path.exists(out1)
    khmer.load_countgraph(out1)
def test_load_gz():
    """Check that a gzip-compressed countgraph file loads like the original.

    The graph is saved uncompressed, gzipped by hand, loaded back, and the
    abundance distributions of the in-memory and loaded graphs are compared.
    """
    inpath = utils.get_test_data('random-20-a.fa')

    savepath = utils.get_temp_filename('tempcountingsave1.ht')
    loadpath = utils.get_temp_filename('tempcountingsave1.ht.gz')

    sizes = list(PRIMES_1m)
    sizes.append(1000005)

    # save uncompressed hashtable.
    hi = khmer._Countgraph(12, sizes)
    hi.consume_seqfile(inpath)
    hi.save(savepath)

    # compress; the context managers close both handles even if the copy
    # fails part-way through.
    with open(savepath, 'rb') as in_file:
        with gzip.open(loadpath, 'wb') as out_file:
            out_file.writelines(in_file)

    # load compressed hashtable.
    try:
        ht = khmer.load_countgraph(loadpath)
    except OSError as err:
        assert 0, "Should not produce an OSError: " + str(err)

    tracking = khmer._Nodegraph(12, sizes)
    x = hi.abundance_distribution(inpath, tracking)

    # fresh tracking graph for the second pass
    tracking = khmer._Nodegraph(12, sizes)
    y = ht.abundance_distribution(inpath, tracking)

    assert sum(x) == 3966, sum(x)
    assert x == y, (x, y)
def test_abund_dist_gz_bigcount_compressed_first():
    """Decompress a gzipped table by hand and check it still loads.

    The table is written by load-into-counting.py to a ``.gz`` path,
    decompressed into a plain file, loaded, and its abundance distribution
    is required to contain a non-zero entry above abundance 255.
    """
    infile = utils.copy_test_data('test-abund-read-2.fa')
    script = 'load-into-counting.py'
    htfile = utils.get_temp_filename('test_ct.gz')
    args = ['-x', str(1e7), '-N', str(2), '-k', str(2), htfile, infile]
    utils.runscript(script, args)  # create a bigcount table
    assert os.path.exists(htfile)

    # read compressed bigcount table
    with gzip.open(htfile, 'rb') as f_in:
        data = f_in.read()

    # output the (now decompressed) bigcount table
    outfile = utils.get_temp_filename('test_ct')
    with open(outfile, 'wb') as f_out:
        f_out.write(data)

    # load the decompressed copy
    try:
        countgraph = khmer.load_countgraph(outfile)
    except OSError as err:
        assert 0, 'Should not produce OSError: ' + str(err)

    assert countgraph.n_occupied() != 0
    hashsizes = countgraph.hashsizes()
    kmer_size = countgraph.ksize()
    tracking = khmer._Nodegraph(kmer_size, hashsizes)

    # calculate abundance distribution for the loaded bigcount table
    abundances = countgraph.abundance_distribution(infile, tracking)

    # check if abundance is > 255
    # if ok gzipped bigcount was loaded correctly
    found_bigcount = False
    for abundance, n_kmers in enumerate(abundances):
        print(abundance, n_kmers)
        if abundance > 255 and n_kmers > 0:
            found_bigcount = True
            break
    assert found_bigcount
def test_badload():
    """Calling load_countgraph() with no arguments must raise TypeError."""
    try:
        khmer.load_countgraph()
    except TypeError as err:
        print(str(err))
    else:
        # only reached when the call unexpectedly succeeded
        assert 0, "this should fail"
def main():
    """Write each distinct k-mer of the input sequences with its count.

    A tracking nodegraph records which k-mers were already emitted so each
    one is written once. Rows go to ``args.output_file`` (stdout when unset)
    as CSV ``kmer,count``.
    """
    info('count-kmers.py', ['counting'])
    args = get_parser().parse_args()

    graph_filename = args.input_count_graph_filename
    print('hashtable from', graph_filename, file=sys.stderr)
    countgraph = khmer.load_countgraph(graph_filename)

    kmer_size = countgraph.ksize()
    hashsizes = countgraph.hashsizes()
    tracking = khmer._Nodegraph(  # pylint: disable=protected-access
        kmer_size, hashsizes)

    if args.output_file is None:
        args.output_file = sys.stdout
    writer = csv.writer(args.output_file)

    for filename in args.input_sequence_filenames:
        for record in screed.open(filename):
            seq = record.sequence.replace('N', 'A')
            for start in range(len(seq) - kmer_size + 1):
                kmer = seq[start:start + kmer_size]
                if tracking.get(kmer):
                    continue  # already written
                tracking.count(kmer)
                writer.writerow([kmer, str(countgraph.get(kmer))])

    unique_msg = 'Total number of unique k-mers: {0}'.format(
        countgraph.n_unique_kmers())
    print(unique_msg, file=sys.stderr)
def test_load_into_counting_1():
    """Pipe a FASTA file into load-into-counting.py and load the result."""
    in1 = utils.get_test_data('test-abund-read-2.fa')
    out1 = utils.get_temp_filename('out.ct')

    template = """
       cat {in1} | {scripts}/load-into-counting.py -x 1e3 -N 2 -k 20 {out1} - \
       2> /dev/null
    """
    cmd = template.format(scripts=scriptpath(), in1=in1, out1=out1)

    print(cmd)
    run_shell_cmd(cmd)

    # success is judged by the output file existing and being loadable
    assert os.path.exists(out1)
    khmer.load_countgraph(out1)
def main():
    """Write each distinct k-mer of the input sequences with its count.

    A tracking nodegraph records which k-mers were already emitted so each
    one is written once. Rows go to ``args.output_file`` (stdout when unset)
    as CSV ``kmer,count``.
    """
    info('count-kmers.py', ['counting'])
    args = get_parser().parse_args()

    print('hashtable from', args.input_count_graph_filename,
          file=sys.stderr)
    countgraph = khmer.load_countgraph(args.input_count_graph_filename)

    kmer_size = countgraph.ksize()
    tracking = khmer._Nodegraph(  # pylint: disable=protected-access
        kmer_size, countgraph.hashsizes())

    if args.output_file is None:
        args.output_file = sys.stdout
    writer = csv.writer(args.output_file)

    for filename in args.input_sequence_filenames:
        for record in screed.open(filename):
            seq = record.sequence.replace('N', 'A')
            n_kmers = len(seq) - kmer_size + 1
            for offset in range(n_kmers):
                kmer = seq[offset:offset + kmer_size]
                seen = tracking.get(kmer)
                if not seen:
                    tracking.count(kmer)
                    writer.writerow([kmer, str(countgraph.get(kmer))])

    print('Total number of unique k-mers: {0}'.format(
        countgraph.n_unique_kmers()), file=sys.stderr)
def main():
    """Write one CSV row of k-mer count statistics per input sequence.

    Each row holds the record name, the median, average and standard
    deviation returned by ``get_median_count`` (the latter two rounded to
    nine places), and the sequence length. Sequences shorter than the
    countgraph's k-mer size are skipped.
    """
    args = sanitize_help(get_parser()).parse_args()

    htfile = args.countgraph
    input_filename = args.input
    output = args.output

    infiles = [htfile, input_filename]
    for infile in infiles:
        check_input_files(infile, args.force)
    check_space(infiles, args.force)

    print('loading k-mer countgraph from', htfile, file=sys.stderr)
    countgraph = load_countgraph(htfile)
    ksize = countgraph.ksize()

    print('writing to', output.name, file=sys.stderr)
    csv_out = csv.writer(output)

    # write headers:
    csv_out.writerow(['name', 'median', 'average', 'stddev', 'seqlen'])

    for record in screed.open(input_filename):
        # replace() leaves the sequence untouched when it contains no 'N'
        seq = record.sequence.upper().replace('N', 'A')

        if ksize > len(seq):
            continue

        medn, ave, stdev = countgraph.get_median_count(seq)
        ave = round(ave, 9)
        stdev = round(stdev, 9)
        csv_out.writerow([record.name, medn, ave, stdev, len(seq)])
def main():
    """Trim reads on k-mer abundance using a saved countgraph.

    Output goes to the single file given with -o, or to
    ``<basename>.abundfilt`` per input file. Reading from stdin requires -o.
    """
    info('filter-abund.py', ['counting'])
    args = sanitize_help(get_parser()).parse_args()

    check_input_files(args.input_graph, args.force)
    infiles = args.input_filename

    reads_stdin = '-' in infiles or '/dev/stdin' in infiles
    if reads_stdin and not args.single_output_file:
        print("Accepting input from stdin; output filename must "
              "be provided with -o.", file=sys.stderr)
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    print('loading countgraph:', args.input_graph, file=sys.stderr)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    print("K:", ksize, file=sys.stderr)

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        if args.variable_coverage:
            # only trim when sequence has high enough C
            med = countgraph.get_median_count(seqN)[0]
            if med < args.normalize_to:
                return name, seq

        trim_at = countgraph.trim_on_abundance(seqN, args.cutoff)[1]

        if trim_at < ksize:
            return None, None

        # be sure to not to change the 'N's in the trimmed sequence -
        # so, return 'seq' and not 'seqN'.
        return name, seq[:trim_at]

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        print('filtering', infile, file=sys.stderr)
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = get_file_writer(open(outfile, 'wb'),
                                    args.gzip, args.bzip)

        tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads)
        tsp.start(verbose_loader(infile), outfp)

        print('output in', outfile, file=sys.stderr)
def test_counting_gz_file_type_check():
    """load_countgraph() must raise OSError for goodversion-k12.ht.gz."""
    inpath = utils.get_test_data('goodversion-k12.ht.gz')

    try:
        khmer.load_countgraph(inpath)
    except OSError as e:
        print(str(e))
    else:
        # only reached when the load unexpectedly succeeded
        assert 0, "this should fail"
def test_load_gz_notexist_should_fail():
    """Loading a temp path that was never written must raise OSError."""
    savepath = utils.get_temp_filename('tempcountingsave0.ht.gz')

    try:
        khmer.load_countgraph(savepath)
    except OSError as e:
        print(str(e))
    else:
        # only reached when the load unexpectedly succeeded
        assert 0, "load should fail"
def test_load_notexist_should_fail():
    """Loading a temp path that was never written must raise OSError."""
    savepath = utils.get_temp_filename('tempnodegraphsave0.htable')

    try:
        khmer.load_countgraph(savepath)
    except OSError:
        return  # expected outcome

    # only reached when the load unexpectedly succeeded
    assert 0, "load should fail"
def main():
    """Randomly down-sample reads by median k-mer count and drop noisy ones.

    For each input file, per-read statistics are written to
    ``<basename>.medpctfilt.stats`` and the kept reads, in FASTA form, to
    ``<basename>.medpctfilt``. A read is dropped when a random draw in
    ``[1, median]`` exceeds ``--coverage`` or when stddev/average exceeds
    100 percent.
    """
    parser = build_counting_multifile_args()
    parser.add_argument('--coverage', '-C', dest='coverage',
                        default=DEFAULT_COVERAGE, type=int)
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print('file with ht: %s' % counting_ht)

    print('loading hashtable')
    ht = khmer.load_countgraph(counting_ht)
    K = ht.ksize()

    # per-file stats handle; rebound in the loop below and read by process_fn
    xxxfp = None

    print("K:", K)

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']

        med, avg, dev = ht.get_median_count(seq)

        # A zero average would raise ZeroDivisionError; report 0% instead.
        pct = dev / avg * 100 if avg else 0.0

        xxxfp.write('%s %s %s %s %s\n' % (med, avg, dev, pct, name))

        # random.randint(1, med) raises ValueError for med < 1; a read with
        # no coverage can never exceed the coverage threshold.
        over_coverage = med >= 1 and random.randint(1, med) > args.coverage

        if over_coverage or pct > 100:
            return None, None

        return name, seq

    # the filtering loop
    for infile in infiles:
        print('filtering', infile)
        statsfile = os.path.basename(infile) + '.medpctfilt.stats'
        outfile = os.path.basename(infile) + '.medpctfilt'

        # both outputs are closed (and flushed) once the file is processed
        with open(statsfile, 'w') as xxxfp:
            with open(outfile, 'w') as outfp:
                for n, record in enumerate(screed.open(infile)):
                    if n % 100000 == 0:
                        print('...', n)

                    name, seq = process_fn(record)
                    if name and seq:
                        print('>%s\n%s' % (name, seq), file=outfp)

        print('output in', outfile)
def main():
    """Keep reads whose median k-mer count lies within the given bounds.

    Reads for which ``get_median_count`` raises ValueError are skipped. At
    least one of -m/-M is required, and -M must not be below -m.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '--min-coverage', type=int, default=None)
    parser.add_argument('-M', '--max-coverage', type=int, default=None)
    parser.add_argument('input_count_graph')
    parser.add_argument('input_readfile')
    parser.add_argument('output_readfile')
    args = parser.parse_args()

    print('min_coverage: %s' % args.min_coverage, file=sys.stderr)
    print('max_coverage: %s' % args.max_coverage, file=sys.stderr)

    if not (args.min_coverage or args.max_coverage):
        print("neither min nor max coverage specified!? exiting!",
              file=sys.stderr)
        sys.exit(1)

    if args.min_coverage and args.max_coverage and \
       args.max_coverage < args.min_coverage:
        print("min_coverage > max_coverage!? exiting!", file=sys.stderr)
        sys.exit(1)

    htable = khmer.load_countgraph(args.input_count_graph)
    output_file = args.output_readfile

    n_kept = 0
    # Explicit counter: the enumerate index is one less than the number of
    # reads seen, so it must not be used for the final summary.
    n_total = 0

    with open(output_file, 'w') as output_fp:
        for n, record in enumerate(screed.open(args.input_readfile)):
            if n % 100000 == 0:
                print('...', n, n_kept, file=sys.stderr)
            n_total += 1

            seq = record.sequence.upper()
            seq = seq.replace('N', 'A')

            try:
                med, _, _ = htable.get_median_count(seq)
            except ValueError:
                continue

            keep = True
            if args.min_coverage and med < args.min_coverage:
                keep = False

            if args.max_coverage and med > args.max_coverage:
                keep = False

            if keep:
                n_kept += 1

                output_fp.write(output_single(record))

    print('consumed %d reads; kept %d' % (n_total, n_kept), file=sys.stderr)
def main():
    """Correct reads by aligning them against a saved countgraph.

    Every read is aligned with ``khmer.ReadAligner``; when the alignment is
    not truncated, the read sequence is replaced by the graph alignment with
    gaps removed. Output goes to -o, or to ``<readfile basename>.corr``.
    """
    parser = khmer_args.build_counting_args(
        "Correct reads against an already-computed table",
        citations=['counting', 'SeqAn'])

    parser.add_argument("--trusted-cov", dest="trusted_cov", type=int,
                        default=DEFAULT_CUTOFF)
    parser.add_argument("--theta", dest="bits_theta", type=float, default=1.0)
    # the help text used to call this output a histogram
    parser.add_argument('-o', '--output', dest='output_file',
                        help="output file for corrected reads; defaults to "
                             "<readfile>.corr in cwd.",
                        type=khFileType('w'), default=None)

    parser.add_argument('counts_table')
    parser.add_argument('readfile')

    args = parser.parse_args()

    print('loading counts')
    ht = khmer.load_countgraph(args.counts_table)

    aligner = khmer.ReadAligner(ht,
                                args.trusted_cov,
                                args.bits_theta)

    print("trusted:", args.trusted_cov)

    corrfp = args.output_file
    # Only a file opened here is closed here; a handle supplied through -o
    # is left to its owner.
    opened_here = not corrfp
    if opened_here:
        outfile = os.path.basename(args.readfile) + '.corr'
        corrfp = open(outfile, 'w')

    n_corrected = 0
    try:
        for n, read in enumerate(screed.open(args.readfile)):
            if n % 10000 == 0:
                print('...', n, n_corrected, file=sys.stderr)
            seq = read.sequence.replace('N', 'A')

            # build the alignment...
            _score, graph_alignment, _read_alignment, truncated = \
                aligner.align(seq)

            if not truncated:
                graph_seq = graph_alignment.replace("-", "")
                if graph_seq != seq:
                    n_corrected += 1

                seq = graph_seq

            corrfp.write(output_single(read, seq))
    finally:
        if opened_here:
            corrfp.close()
def main():
    """Trim reads on k-mer abundance using a saved countgraph.

    Output goes to the single file given with -o, or to
    ``<basename>.abundfilt`` per input file. Reading from stdin requires -o.
    """
    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('filter-abund.py', ['counting'])

    configure_logging(args.quiet)

    infiles = args.input_filename
    reads_stdin = '-' in infiles or '/dev/stdin' in infiles
    if reads_stdin and not args.single_output_file:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    log_info('loading countgraph: {graph}', graph=args.input_graph)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    log_info("K: {ksize}", ksize=ksize)

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        log_info('filtering {infile}', infile=infile)
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = get_file_writer(open(outfile, 'wb'),
                                    args.gzip, args.bzip)

        reads = broken_paired_reader(ReadParser(infile),
                                     min_length=ksize,
                                     force_single=True)

        for _n, is_pair, read1, read2 in reads:
            # force_single=True was requested, so no pairs are expected
            assert not is_pair
            assert read2 is None

            trimmed_record = trim_record(countgraph, read1, args.cutoff,
                                         args.variable_coverage,
                                         args.normalize_to)[0]
            if trimmed_record:
                write_record(trimmed_record, outfp)

        log_info('output in {outfile}', outfile=outfile)
def main():
    """Call ``output_fasta_kmer_pos_freq`` for every input file.

    ``sys.argv[1]`` is the saved countgraph; each further argument is an
    input file whose report is written to ``<input>.freq``.
    """
    files = sys.argv[2:]

    print('loading ht')
    ht = khmer.load_countgraph(sys.argv[1])

    for infile in files:
        print('outputting', infile + '.freq')
        ht.output_fasta_kmer_pos_freq(infile, infile + ".freq")
def test_load_truncated():
    """Every proper prefix of a saved countgraph file must fail to load.

    The saved file is truncated to each length from 0 to ``len(data) - 1``
    and ``load_countgraph`` is expected to raise OSError each time.
    """
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('save.ht')
    truncpath = utils.get_temp_filename('trunc.ht')

    sizes = khmer.get_n_primes_near_x(3, 200)

    hi = khmer._Countgraph(12, sizes)
    hi.consume_seqfile(inpath)
    hi.save(savepath)

    with open(savepath, 'rb') as fp:
        data = fp.read()

    for i in range(len(data)):
        # rewrite the truncated file from scratch for this prefix length
        with open(truncpath, 'wb') as fp:
            fp.write(data[:i])

        try:
            khmer.load_countgraph(truncpath)
            assert 0, "this should not be reached!"
        except OSError as err:
            print(str(err))
def main():
    """Write the N most abundant k-mers and their counts to a CSV file.

    The k-mers come from ``find_N_most_abundant_kmers``; rows are sorted by
    descending count and written without the index column.
    """
    args = get_parser().parse_args()

    graph_filename = args.input_count_graph_filename
    seq_filenames = args.input_sequence_filenames

    for infile in [graph_filename] + seq_filenames:
        check_input_files(infile, False)

    counts = khmer.load_countgraph(graph_filename)
    results = find_N_most_abundant_kmers(seq_filenames, args.N, counts)

    kmer_column = [str(k) for k in results.keys()]
    count_column = [int(c) for c in results.values()]
    results_df = pd.DataFrame({'kmer': kmer_column,
                               'count': count_column})

    results_df.sort_values(by='count', inplace=True, ascending=False)
    results_df.to_csv(args.output, index=False)
def test_load_truncated():
    """Every proper prefix of a saved countgraph file must fail to load.

    The saved file is truncated to each length from 0 to ``len(data) - 1``
    and ``load_countgraph`` is expected to raise OSError each time.
    """
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('save.ht')
    truncpath = utils.get_temp_filename('trunc.ht')

    sizes = khmer.get_n_primes_near_x(3, 200)

    hi = khmer._Countgraph(12, sizes)
    hi.consume_fasta(inpath)
    hi.save(savepath)

    with open(savepath, 'rb') as fp:
        data = fp.read()

    for i in range(len(data)):
        # rewrite the truncated file from scratch for this prefix length
        with open(truncpath, 'wb') as fp:
            fp.write(data[:i])

        try:
            khmer.load_countgraph(truncpath)
            assert 0, "this should not be reached!"
        except OSError as err:
            print(str(err))
def main():
    """Report spectral error positions for every read in a sequence file.

    One line is written per read: the first word of the read name followed
    by a comma-separated list of positions, or by ``V`` when -V is set and
    the read's median k-mer count is below ``--coverage``.

    Output is written with ``.write()`` rather than the Python-2-only
    ``print >>stream`` form, which raises TypeError on Python 3; the bytes
    written are the same.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('table')
    parser.add_argument('sequences')
    parser.add_argument('-C', '--cutoff', default=3, type=int)
    parser.add_argument('--coverage', default=20, type=int)
    parser.add_argument('-V', '--variable', default=False,
                        action='store_true')
    parser.add_argument('-o', '--outfile', type=argparse.FileType('w'),
                        default=sys.stdout)
    args = parser.parse_args()

    kh = khmer.load_countgraph(args.table)
    n_skipped_variable = 0
    n_total = 0

    log = sys.stderr.write
    out = args.outfile.write

    log("K: %s\n" % (kh.ksize(),))
    log("CUTOFF: %s\n" % (args.cutoff,))
    if args.variable:
        log("variable coverage flag set;\n")
        log("NORMALIZE_LIMIT: %s\n" % (args.coverage,))
    else:
        log("assuming even coverage - no -V\n")

    for n, record in enumerate(screed.open(args.sequences)):
        if n % 100000 == 0:
            log('... %s\n' % (n,))
        seq = record.sequence.upper().replace('N', 'A')

        n_total += 1

        varskip = False
        if args.variable:
            med, _, _ = kh.get_median_count(seq)
            if med < args.coverage:
                varskip = True
                n_skipped_variable += 1

        name = record.name.split()[0]

        if varskip:
            out('%s V\n' % (name,))
        else:
            posns = kh.find_spectral_error_positions(seq, args.cutoff)
            # positions are computed on the N->A sequence; add_n_posns is
            # given the original sequence (presumably to merge in the N
            # positions - confirm against its definition)
            posns = add_n_posns(posns, record.sequence)
            out('%s %s\n' % (name, ",".join(map(str, posns))))

    if args.variable:
        log('Skipped %d reads of %d total due to -V\n' %
            (n_skipped_variable, n_total))
def main():
    """Trim reads on k-mer abundance using a saved countgraph.

    Output goes to the single file given with -o, or to
    ``<basename>.abundfilt`` per input file. Reading from stdin requires -o.
    """
    args = sanitize_help(get_parser()).parse_args()

    configure_logging(args.quiet)

    infiles = args.input_filename
    from_stdin = '-' in infiles or '/dev/stdin' in infiles
    if from_stdin and not args.single_output_file:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    log_info('loading countgraph: {graph}', graph=args.input_graph)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    log_info("K: {ksize}", ksize=ksize)

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        log_info('filtering {infile}', infile=infile)
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            raw_fp = open(outfile, 'wb')
            outfp = get_file_writer(raw_fp, args.gzip, args.bzip)

        parser = ReadParser(infile)
        paired_iter = broken_paired_reader(parser, min_length=ksize,
                                           force_single=True)

        for _index, is_pair, read1, read2 in paired_iter:
            # force_single=True was requested, so no pairs are expected
            assert not is_pair
            assert read2 is None

            trimmed_record, _ = trim_record(countgraph, read1, args.cutoff,
                                            args.variable_coverage,
                                            args.normalize_to)
            if not trimmed_record:
                continue
            write_record(trimmed_record, outfp)

        log_info('output in {outfile}', outfile=outfile)
def main():
    """Write ``fasta_count_kmers_by_position`` results for two thresholds.

    ``sys.argv[1]`` is the saved countgraph and ``sys.argv[2]`` the sequence
    file. The results for the values 1 and 255 are written with
    ``write_dist`` to ``<basename>.pos.abund=1`` and
    ``<basename>.pos.abund=255``.
    """
    hashfile = sys.argv[1]
    filename = sys.argv[2]
    outfile = os.path.basename(filename)

    print('loading kh file', hashfile)
    ht = khmer.load_countgraph(hashfile)

    for abund in (1, 255):
        dist = ht.fasta_count_kmers_by_position(filename, 100, abund)

        outname = outfile + '.pos.abund=%d' % abund
        # close the file so the data is flushed before it is announced
        with open(outname, 'w') as fp:
            write_dist(dist, fp)
        print('wrote', outname)
def do_test(ctfile):
    """Save a countgraph to ``ctfile`` and check n_occupied after reload."""
    print('working with', ctfile)

    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename(ctfile)

    original = khmer.Countgraph(12, 1e5, 4)
    original.consume_fasta(inpath)
    original.save(savepath)

    reloaded = khmer.load_countgraph(savepath)

    orig_count = original.n_occupied()
    loaded_count = reloaded.n_occupied()

    # the reloaded graph must report the same occupancy as the original
    assert orig_count == 3886, orig_count
    assert loaded_count == orig_count, loaded_count
def test_save_load_occupied_small(ctfile):
    """Save a SmallCountgraph to ``ctfile`` and check n_occupied on reload."""
    print('working with', ctfile)

    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename(ctfile)

    original = khmer.SmallCountgraph(12, 1e5, 4)
    original.consume_seqfile(inpath)
    original.save(savepath)

    reloaded = khmer.load_countgraph(savepath, small=True)

    orig_count = original.n_occupied()
    loaded_count = reloaded.n_occupied()

    # the reloaded graph must report the same occupancy as the original
    assert orig_count == 3886, orig_count
    assert loaded_count == orig_count, loaded_count
def main():
    """Write a cumulative histogram of per-read median k-mer counts.

    Each output line holds: median count, number of reads with that median,
    running total of reads, and the running total as a fraction of all
    counted reads. Reads for which ``get_median_count`` raises ValueError
    are skipped.
    """
    parser = argparse.ArgumentParser(
        description="Output k-mer abundance distribution.")

    parser.add_argument('hashname')
    parser.add_argument('seqfile')
    parser.add_argument('histout')

    args = parser.parse_args()
    hashfile = args.hashname
    seqfile = args.seqfile
    histout = args.histout

    # Opened before loading, as before, but now closed even if loading or
    # counting fails.
    with open(histout, 'w') as outfp:
        print('hashtable from', hashfile)
        ht = khmer.load_countgraph(hashfile)

        # one bucket for every median value in 0..65535
        hist = dict.fromkeys(range(65536), 0)

        for n, record in enumerate(screed.open(seqfile)):
            if n > 0 and n % 100000 == 0:
                print('...', n)

            seq = record.sequence.replace('N', 'A')

            try:
                med, _, _ = ht.get_median_count(seq)
            except ValueError:
                continue

            hist[med] = hist[med] + 1

        sumk = sum(hist.values())

        sofar = 0
        for med, n_reads in sorted(hist.items()):
            sofar += n_reads
            # With no counted reads sumk is 0; report 0.0 instead of
            # raising ZeroDivisionError on the first row.
            percent = float(sofar) / sumk if sumk else 0.0

            outfp.write('%d %d %d %.3f\n' % (med, n_reads, sofar, percent))
def main():
    """Correct reads by aligning them against a saved countgraph.

    Every read is aligned with ``khmer.ReadAligner``; when the alignment is
    not truncated, the read sequence is replaced by the graph alignment with
    gaps removed. Output goes to -o, or to ``<readfile basename>.corr``.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("--trusted-cov", dest="trusted_cov", type=int,
                        default=DEFAULT_CUTOFF)
    parser.add_argument("--theta", dest="bits_theta", type=float, default=1.0)
    # The help text used to advertise a histogram written to
    # "<first filename>.errhist", which does not match what is written below.
    parser.add_argument('-o', '--output', dest='output_file',
                        help="output file for corrected reads; defaults to "
                             "<readfile>.corr in cwd.",
                        type=argparse.FileType('w'), default=None)

    parser.add_argument('counts_table')
    parser.add_argument('readfile')

    args = parser.parse_args()

    print('loading counts')
    ht = khmer.load_countgraph(args.counts_table)

    aligner = khmer.ReadAligner(ht,
                                args.trusted_cov,
                                args.bits_theta)

    print("trusted:", args.trusted_cov)

    corrfp = args.output_file
    # Only a file opened here is closed here; a handle supplied through -o
    # is left to its owner.
    opened_here = not corrfp
    if opened_here:
        outfile = os.path.basename(args.readfile) + '.corr'
        corrfp = open(outfile, 'w')

    n_corrected = 0
    try:
        for n, read in enumerate(screed.open(args.readfile)):
            if n % 10000 == 0:
                print('...', n, n_corrected, file=sys.stderr)
            seq = read.sequence.replace('N', 'A')

            # build the alignment...
            _score, graph_alignment, _read_alignment, truncated = \
                aligner.align(seq)

            if not truncated:
                graph_seq = graph_alignment.replace("-", "")
                if graph_seq != seq:
                    n_corrected += 1

                seq = graph_seq

            corrfp.write(output_single(read, seq))
    finally:
        if opened_here:
            corrfp.close()
def test_save_load_large(ctfile):
    """Round-trip a countgraph whose single table is sized near 2**31."""
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename(ctfile)

    sizes = khmer.get_n_primes_near_x(1, 2**31 + 1000)

    original = khmer._Countgraph(12, sizes)
    original.consume_seqfile(inpath)
    original.save(savepath)

    reloaded = khmer.load_countgraph(savepath)

    orig_count = original.n_occupied()
    loaded_count = reloaded.n_occupied()

    # the reloaded graph must report the same occupancy as the original
    assert orig_count == 3966, orig_count
    assert loaded_count == orig_count, loaded_count
def do_test(ctfile):
    """Round-trip a countgraph whose single table is sized near 2**31."""
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename(ctfile)

    sizes = khmer.get_n_primes_near_x(1, 2 ** 31 + 1000)

    original = khmer._Countgraph(12, sizes)
    original.consume_fasta(inpath)
    original.save(savepath)

    reloaded = khmer.load_countgraph(savepath)

    orig_count = original.n_occupied()
    loaded_count = reloaded.n_occupied()

    # the reloaded graph must report the same occupancy as the original
    assert orig_count == 3966, orig_count
    assert loaded_count == orig_count, loaded_count
def test_normalize_by_median_no_bigcount():
    """The graph saved by normalize-by-median.py must cap GGTTGACG at 255."""
    infile = utils.copy_test_data('test-abund-read-2.fa')
    hashfile = utils.get_temp_filename('test-out.ct')
    in_dir = os.path.dirname(infile)

    _make_counting(infile, K=8)

    script = 'normalize-by-median.py'
    args = ['-C', '1000', '-k 8', '--savegraph', hashfile, infile]

    status, out, err = utils.runscript(script, args, in_dir)
    assert status == 0, (out, err)
    print((out, err))

    assert os.path.exists(hashfile), hashfile

    saved_graph = khmer.load_countgraph(hashfile)
    assert saved_graph.get('GGTTGACG') == 255
def test_normalize_by_median_no_bigcount():
    """The graph saved by normalize-by-median.py must cap GGTTGACG at 255."""
    infile = utils.get_temp_filename('test.fa')
    hashfile = utils.get_temp_filename('test-out.ct')
    in_dir = os.path.dirname(infile)

    # work on a private copy of the test data
    source = utils.get_test_data('test-abund-read-2.fa')
    shutil.copyfile(source, infile)

    _make_counting(infile, K=8)

    script = 'normalize-by-median.py'
    args = ['-C', '1000', '-k 8', '--savegraph', hashfile, infile]

    status, out, err = utils.runscript(script, args, in_dir)
    assert status == 0, (out, err)
    print((out, err))

    assert os.path.exists(hashfile), hashfile

    saved_graph = khmer.load_countgraph(hashfile)
    assert saved_graph.get('GGTTGACG') == 255
def main():
    """Randomly down-sample reads by their median k-mer count.

    A read is dropped when a random draw in ``[1, median]`` exceeds
    ``--coverage``. Kept reads for each input file go to
    ``<basename>.medfilt``.
    """
    parser = build_counting_args()
    parser.add_argument('--coverage', '-C', dest='coverage',
                        default=DEFAULT_COVERAGE, type=int)
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print('file with ht: %s' % counting_ht)

    print('loading hashtable')
    ht = khmer.load_countgraph(counting_ht)
    K = ht.ksize()

    print("K:", K)

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']

        med, _avg, _dev = ht.get_median_count(seq)

        # random.randint(1, med) raises ValueError for med < 1; a read with
        # no coverage can never exceed the threshold, so it is kept.
        if med >= 1 and random.randint(1, med) > args.coverage:
            return None, None

        return name, seq

    # the filtering loop
    for infile in infiles:
        print('filtering', infile)
        outfile = os.path.basename(infile) + '.medfilt'
        # NOTE(review): outfp is never closed here; confirm whether
        # tsp.start() returns only after all output has been written before
        # adding a close.
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print('output in', outfile)
def test_maxcount_with_bigcount_save():
    """A bigcount graph must still report a count of 1000 after save/load."""
    # hashtable should not saturate, if use_bigcount is set.
    graph = khmer.Countgraph(4, 4**4, 4)
    graph.set_use_bigcount(True)

    n_counted = 0
    while n_counted < 1000:
        graph.count('AAAA')
        graph.get('AAAA')
        n_counted += 1

    savepath = utils.get_temp_filename('tempcountingsave.ht')
    graph.save(savepath)

    try:
        graph = khmer.load_countgraph(savepath)
    except OSError as err:
        assert 0, "Should not produce an OSError: " + str(err)

    c = graph.get('AAAA')
    assert c == 1000, "should be able to count to 1000: %d" % c
    assert c != MAX_COUNT, c
def test_maxcount_with_bigcount_save():
    """A bigcount graph must still report a count of 1000 after save/load."""
    # hashtable should not saturate, if use_bigcount is set.
    original = khmer.Countgraph(4, 4 ** 4, 4)
    original.set_use_bigcount(True)

    for _ in range(1000):
        original.count('AAAA')
        original.get('AAAA')

    savepath = utils.get_temp_filename('tempcountingsave.ht')
    original.save(savepath)

    try:
        reloaded = khmer.load_countgraph(savepath)
    except OSError as err:
        assert 0, "Should not produce an OSError: " + str(err)

    c = reloaded.get('AAAA')
    assert c == 1000, "should be able to count to 1000: %d" % c
    assert c != MAX_COUNT, c
def test_normalize_by_median_no_bigcount():
    """The graph saved by normalize-by-median.py must cap GGTTGACG at 255."""
    infile = utils.get_temp_filename("test.fa")
    hashfile = utils.get_temp_filename("test-out.ct")
    in_dir = os.path.dirname(infile)

    # work on a private copy of the test data
    shutil.copyfile(utils.get_test_data("test-abund-read-2.fa"), infile)

    # called as in the original test; its return value is not used
    _make_counting(infile, K=8)

    script = "normalize-by-median.py"
    args = ["-C", "1000", "-k 8", "--savegraph", hashfile, infile]

    status, out, err = utils.runscript(script, args, in_dir)
    assert status == 0, (out, err)
    print((out, err))

    assert os.path.exists(hashfile), hashfile

    saved_graph = khmer.load_countgraph(hashfile)
    assert saved_graph.get("GGTTGACG") == 255
def test_nobigcount_save():
    """A graph saved without bigcount must saturate at MAX_COUNT on reload."""
    original = khmer.Countgraph(4, 4 ** 4, 4)
    # original.set_use_bigcount(False) <-- this is the default

    savepath = utils.get_temp_filename('tempcountingsave.ht')
    original.save(savepath)

    try:
        reloaded = khmer.load_countgraph(savepath)
    except OSError as err:
        assert 0, 'Should not produce an OSError: ' + str(err)

    # set_use_bigcount should still be False after load (i.e. should be saved)
    assert reloaded.get('AAAA') == 0

    for _ in range(1000):
        reloaded.count('AAAA')
        reloaded.get('AAAA')

    assert reloaded.get('AAAA') == MAX_COUNT
def test_load_gz_truncated_should_fail():
    """A saved countgraph cut to its first 1000 bytes must fail to load."""
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('tempcountingsave0.ht.gz')

    hi = khmer.Countgraph(12, 1000, 2)
    hi.consume_seqfile(inpath)
    hi.save(savepath)

    # Truncate in place; the context managers close the handle even if a
    # read or write fails.
    with open(savepath, 'rb') as fp:
        data = fp.read()

    with open(savepath, 'wb') as fp:
        fp.write(data[:1000])

    try:
        khmer.load_countgraph(savepath)
        assert 0, "load should fail"
    except OSError as e:
        print(str(e))
def test_nobigcount_save():
    """A graph saved without bigcount must saturate at MAX_COUNT on reload."""
    graph = khmer.Countgraph(4, 4**4, 4)
    # graph.set_use_bigcount(False) <-- this is the default

    savepath = utils.get_temp_filename('tempcountingsave.ht')
    graph.save(savepath)

    try:
        graph = khmer.load_countgraph(savepath)
    except OSError as err:
        assert 0, 'Should not produce an OSError: ' + str(err)

    # set_use_bigcount should still be False after load (i.e. should be saved)
    assert graph.get('AAAA') == 0

    n_counted = 0
    while n_counted < 1000:
        graph.count('AAAA')
        graph.get('AAAA')
        n_counted += 1

    assert graph.get('AAAA') == MAX_COUNT
def test_load_gz_truncated_should_fail():
    """A saved countgraph cut to its first 1000 bytes must fail to load."""
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('tempcountingsave0.ht.gz')

    hi = khmer.Countgraph(12, 1000, 2)
    hi.consume_fasta(inpath)
    hi.save(savepath)

    # Truncate in place; the context managers close the handle even if a
    # read or write fails.
    with open(savepath, 'rb') as fp:
        data = fp.read()

    with open(savepath, 'wb') as fp:
        fp.write(data[:1000])

    try:
        khmer.load_countgraph(savepath)
        assert 0, "load should fail"
    except OSError as e:
        print(str(e))
def test_bigcount_save():
    """A graph saved with bigcount on must count past MAX_COUNT on reload."""
    # hashtable should not saturate, if use_bigcount is set.
    original = khmer.Countgraph(4, 4**4, 4)
    original.set_use_bigcount(True)

    savepath = utils.get_temp_filename('tempcountingsave.ht')
    original.save(savepath)

    try:
        reloaded = khmer.load_countgraph(savepath)
    except OSError as err:
        assert 0, "Should not produce an OSError: " + str(err)

    # set_use_bigcount should still be True after load (i.e. should be saved)
    assert reloaded.get('AAAA') == 0

    for _ in range(1000):
        reloaded.count('AAAA')
        reloaded.get('AAAA')

    assert reloaded.get('AAAA') == 1000
def test_save_load_gz():
    """Save a countgraph to a ``.gz`` path and compare it with the reload.

    The abundance distributions of the in-memory graph and the reloaded one
    must be identical.
    """
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('tempcountingsave2.ht.gz')

    sizes = list(PRIMES_1m) + [1000005]

    original = khmer._Countgraph(12, sizes)
    original.consume_seqfile(inpath)
    original.save(savepath)

    try:
        reloaded = khmer.load_countgraph(savepath)
    except OSError as err:
        assert 0, 'Should not produce an OSError: ' + str(err)

    # each distribution gets its own fresh tracking graph
    dist_original = original.abundance_distribution(
        inpath, khmer._Nodegraph(12, sizes))
    dist_reloaded = reloaded.abundance_distribution(
        inpath, khmer._Nodegraph(12, sizes))

    assert sum(dist_original) == 3966, sum(dist_original)
    assert dist_original == dist_reloaded, (dist_original, dist_reloaded)