def test_readaligner_load():
    """Round-trip ReadAligner parameters through a JSON parameter file.

    Build an aligner from the shipped default-parameter file, spot-check
    two known matrix entries, run it over the canned queries, then build
    a second aligner directly from the extracted matrices and verify the
    two aligners carry identical parameters.
    """
    ct = khmer.Countgraph(32, 1048576, 1)
    params_file = utils.get_test_data('readaligner-default.json')

    first = khmer.ReadAligner(ct, 0, 0, filename=params_file)
    first_scores = first.get_scoring_matrix()
    first_trans = first.get_transition_probabilities()

    # spot-check two known values from the shipped parameter file
    assert first_scores[0] == -0.06642736173897607, first_scores[0]
    assert first_trans[0][0] == -0.021973842014145723, (first_trans[0][0])

    for seq in ht_seqs:
        ct.consume(seq)
    for query in queries:
        first.align(query['seq'])

    second = khmer.ReadAligner(
        ct, 0, 0,
        transition_probabilities=first_trans,
        scoring_matrix=first_scores)
    second_scores = second.get_scoring_matrix()
    second_trans = second.get_transition_probabilities()

    assert second_scores == first_scores, (first_scores, second_scores)
    assert second_trans == first_trans, (first_trans, second_trans)
def main():
    """Align each reference sequence against a counting table and print,
    per record, the count of every k-mer along the graph-side alignment.

    Python 2 script; usage: <prog> <table> <ref>.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('table')
    parser.add_argument('ref')
    args = parser.parse_args()

    ct = khmer.load_counting_hash(args.table)
    aligner = khmer.ReadAligner(ct, 5, 1.0)

    for record in screed.open(args.ref):
        s = record.sequence
        s = s.replace('N', 'A')  # the aligner cannot handle ambiguous bases

        score, graph_alignment, read_alignment, truncated = \
            aligner.align(s)
        assert not truncated

        g = graph_alignment.replace('-', '')
        # NOTE(review): r is computed but never used below
        r = read_alignment.replace('-', '')

        print record.name
        for kstart in range(0, len(g) - ct.ksize() + 1):
            kmer = g[kstart:kstart + ct.ksize()]
            print kstart, ct.get(kmer)
def test_readalign_new(query):
    """Align a single canned query against the standard test graph."""
    countgraph = khmer.Countgraph(32, 1048576, 1)
    aligner = khmer.ReadAligner(countgraph, 1, 0)
    for sequence in ht_seqs:
        countgraph.consume(sequence)
    check_query(aligner, query)
def test_readalign_new():
    """Nose-style generator test over the canned queries (disabled)."""
    return  # @CTB
    # Unreachable below; kept so this remains a generator function.
    countgraph = khmer.Countgraph(32, 1048576, 1)
    aligner = khmer.ReadAligner(countgraph, 1, 0)
    for sequence in ht_seqs:
        countgraph.consume(sequence)
    for query in queries:
        if "description" in query:
            check_query.description = query["description"]
        yield check_query, aligner, query
def test_alignnocov():
    """With a trusted-coverage cutoff of 0, a read present in the graph
    aligns back to itself exactly (older new_counting_hash API)."""
    countgraph = khmer.new_counting_hash(10, 1048576, 1)
    target = "ACCTAGGTTCGACATGTACC"
    aligner = khmer.ReadAligner(countgraph, 0, 0)
    for _ in range(20):
        countgraph.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
    countgraph.consume("ACCTAGGTTCGACATGTACC")
    score, graphAlign, readAlign, trunc = aligner.align(target)

    # alignment should reproduce the read verbatim
    eq_(readAlign, 'ACCTAGGTTCGACATGTACC')
    eq_(graphAlign, 'ACCTAGGTTCGACATGTACC')
def main():
    """Correct reads against an already-computed counting table.

    Reads are aligned to the count graph; for untruncated alignments the
    gap-stripped graph-side sequence replaces the read. Output goes to
    --output or <readfile basename>.corr.
    """
    parser = khmer_args.build_counting_args(
        "Correct reads against an already-computed table",
        citations=['counting', 'SeqAn'])
    parser.add_argument("--trusted-cov", dest="trusted_cov", type=int,
                        default=DEFAULT_CUTOFF)
    parser.add_argument("--theta", dest="bits_theta", type=float,
                        default=1.0)
    parser.add_argument('-o', '--output', dest='output_file',
                        help="output file for histogram; defaults to "
                        "<first filename>.corr in cwd.",
                        type=khFileType('w'), default=None)
    parser.add_argument('counts_table')
    parser.add_argument('readfile')

    args = parser.parse_args()

    print('loading counts')
    ht = Countgraph.load(args.counts_table)

    aligner = khmer.ReadAligner(ht, args.trusted_cov, args.bits_theta)

    print("trusted:", args.trusted_cov)

    corrfp = args.output_file
    if not corrfp:
        # default output: <readfile basename>.corr in the cwd
        outfile = os.path.basename(args.readfile) + '.corr'
        corrfp = open(outfile, 'w')

    n_corrected = 0
    for n, read in enumerate(screed.open(args.readfile)):
        if n % 10000 == 0:
            print('...', n, n_corrected, file=sys.stderr)
        seq = read.sequence.replace('N', 'A')  # aligner can't handle Ns

        # build the alignment...
        score, graph_alignment, read_alignment, truncated = \
            aligner.align(seq)

        if not truncated:
            # use the gap-stripped graph-side sequence as the correction
            graph_seq = graph_alignment.replace("-", "")
            if graph_seq != seq:
                n_corrected += 1

            seq = graph_seq

        corrfp.write(output_single(read, seq))
def test_align_middle():
    """A read wholly contained in the graph aligns to itself, untruncated."""
    countgraph = khmer.Countgraph(10, 1048576, 1)
    query = "TCGACAAGTCCTTGACAGAT"
    aligner = khmer.ReadAligner(countgraph, trusted_cov_cutoff=0,
                                bits_theta=0)
    for _ in range(20):
        countgraph.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
    countgraph.consume(query)
    _, graph_aln, read_aln, trunc = aligner.align(query)

    # both alignment strings must equal the input read
    eq_(read_aln, query)
    eq_(graph_aln, query)
    assert not trunc
def test_align_fwd_middle():
    """align_forward reproduces a read fully contained in the graph."""
    countgraph = khmer.Countgraph(10, 1048576, 1)
    query = "TCGACAAGTCCTTGACAGAT"
    aligner = khmer.ReadAligner(countgraph, 0, 0)
    for _ in range(20):
        countgraph.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
    countgraph.consume(query)
    score, graph_aln, read_aln, trunc, _ = aligner.align_forward(query)

    # both alignment strings must equal the input read
    eq_(read_aln, query)
    eq_(graph_aln, query)
    assert not trunc
def test_readalign_new():
    """Run every canned query through the aligner and compare the result
    against the expected graph/read alignments and truncation flag.

    Python 2 variant (print statements, new_counting_hash API).
    """
    ch = khmer.new_counting_hash(32, 1048576, 1)
    aligner = khmer.ReadAligner(ch, 1, 0)
    for seq in ht_seqs:
        ch.consume(seq)

    for query in queries:
        score, graphAlign, readAlign, trunc = aligner.align(query["seq"])
        # debugging aid: show the actual alignment on failure
        print graphAlign
        print readAlign
        eq_(graphAlign, query["graph_aln"])
        eq_(readAlign, query["read_aln"])
        eq_(trunc, query["truncated"])
def test_alignnocov():
    """With a trusted-coverage cutoff of 0, a read present in the graph
    aligns back to itself exactly and is not truncated."""
    countgraph = khmer.Countgraph(10, 1048576, 1)
    query = "ACCTAGGTTCGACATGTACC"
    aligner = khmer.ReadAligner(countgraph, 0, 0)
    for _ in range(20):
        countgraph.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
    countgraph.consume("ACCTAGGTTCGACATGTACC")
    _, graph_aln, read_aln, trunc = aligner.align(query)

    # alignment should reproduce the read verbatim
    eq_(read_aln, 'ACCTAGGTTCGACATGTACC')
    eq_(graph_aln, 'ACCTAGGTTCGACATGTACC')
    assert not trunc
def test_readalign():
    """A read with a few mismatches aligns to the graph consensus; the
    expected output uses lowercase/gap characters for edited positions."""
    countgraph = khmer.new_counting_hash(10, 1048576, 1)
    aligner = khmer.ReadAligner(countgraph, 1, 0)
    for _ in range(20):
        countgraph.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
    query = "ACCTAGGTTCGACATGTACC"
    # mismatching positions relative to the consensus
    countgraph.consume("GCTTTTAAAAAGGTTCGACAAAGGCCCGGG")
    score, graph_aln, read_aln, trunc = aligner.align(query)

    eq_(read_aln, 'ACCTAGGTTCGACATGTACc')
    eq_(graph_aln, 'AGCTAGGTTCGACAAGTCC-')
def test_align_nothing():
    """A read sharing no k-mers with the graph yields an empty, truncated
    alignment."""
    countgraph = khmer.Countgraph(10, 1048576, 1)
    query = "ACCAAGGCTCGAGATTTACC"
    aligner = khmer.ReadAligner(countgraph, 0, 0)
    for _ in range(20):
        countgraph.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
    score, graph_aln, read_aln, trunc = aligner.align(query)

    print(score, graph_aln, read_aln)
    assert trunc
    # both alignment strings must be empty
    assert len(graph_aln) == 0
    assert len(read_aln) == 0
def main():
    """Align reference sequences with align_long, print alignment stats,
    and write per-variant coverage information to --variants-out.

    Python 2 script; usage: <prog> <table> <ref> [--trusted N].
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('table')
    parser.add_argument('ref')
    parser.add_argument('--trusted', type=int, default=5)
    parser.add_argument('--variants-out', type=str, default='variants.txt',
                        dest='variants_out')
    args = parser.parse_args()

    ct = khmer.load_counting_hash(args.table)
    aligner = khmer.ReadAligner(ct, args.trusted, 1.0)

    for record in screed.open(args.ref):
        seq = record.sequence
        seq = seq.replace('N', 'A')  # aligner can't handle ambiguous bases

        # align the full (long) sequence piecewise
        score, alignment = align_long(ct, aligner, seq)

        g = alignment.g
        # NOTE(review): r is assigned but unused below
        r = alignment.r

        m, n = alignment.compare()
        print record.name, m, n, n - m, "%.3f%%" % (float(m) / n * 100)
        for start in range(0, len(alignment), 60):
            print start
            print alignment[start:start + 60]

        gidx = AlignmentIndex(alignment)

        # NOTE(review): reopened with 'w' for every record, so only the
        # last record's variants survive — presumably intended for
        # single-record references; confirm with callers.
        fp = open(args.variants_out, 'w')
        for gi, a, b in alignment.variants():
            # rebuild the k-mer starting at this variant from the graph
            # row, skipping gap/match filler characters ('=' and '-')
            kmer = ''
            pos = gi
            while len(kmer) < ct.ksize() and pos < len(alignment.g):
                ch = alignment.g[pos]
                pos += 1
                if ch in '=-':
                    continue
                kmer += ch

            if alignment.covs[gi]:
                print >> fp, gi, a, b, gidx.get_ri(
                    gi), kmer, alignment.covs[gi]

        # disabled sanity check
        if 0:
            print len(seq), alignment.refseqlen()
            gidx._sanityCheck(seq)
def test_readalign():
    """Basic mismatch-alignment check; currently disabled (@CTB)."""
    return  # @CTB
    # Unreachable below; retained from the disabled test body.
    countgraph = khmer.Countgraph(10, 1048576, 1)
    aligner = khmer.ReadAligner(countgraph, 1, 0)
    for _ in range(20):
        countgraph.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
    query = "ACCTAGGTTCGACATGTACC"
    # mismatching positions relative to the consensus
    countgraph.consume("GCTTTTAAAAAGGTTCGACAAAGGCCCGGG")
    score, graph_aln, read_aln, _ = aligner.align(query)

    eq_(read_aln, 'ACCTAGGTTCGACATGTACC')
    eq_(graph_aln, 'AGCTAGGTTCGACAAGTCCT')
def process_fn(record):
    """Correct one screed record against the shared graph; return
    (name, sequence) with the corrected sequence when alignment succeeds.

    NOTE(review): ht, C and max_error_region come from the enclosing
    scope; this uses the older ReadAligner API that returns a
    (graph_alignment, read_alignment) pair.
    """
    # read_aligner is probably not threadsafe?
    aligner = khmer.ReadAligner(ht, 1, C, max_error_region)
    name = record['name']
    seq = record['sequence']
    seq = seq.replace('N', 'A')  # aligner can't handle ambiguous bases

    grXreAlign, reXgrAlign = aligner.align(seq)

    if len(reXgrAlign) > 0:
        # non-empty alignment => use the gap-stripped graph-side sequence
        graph_seq = grXreAlign.replace('-', '')
        seq = graph_seq
    return name, seq
def test_align_fwd_middle_trunc_2():
    """align_forward seeds on the first k-mer, so a read whose prefix is
    absent from the graph produces an empty, truncated alignment."""
    countgraph = khmer.Countgraph(10, 1048576, 1)
    query = "GGGGGGGGGGGGTCGACAAGTCCTTGACAGAT"
    aligner = khmer.ReadAligner(countgraph, 0, 0)
    for _ in range(20):
        countgraph.consume("AAAAAAAAAAAATCGACAAGTCCTTGACAGAT")

    # the query's 12-base prefix is deliberately left out of the graph
    countgraph.consume(query[12:])
    _, graph_aln, read_aln, trunc, _ = aligner.align_forward(query)

    # this will fail, because align_forward chooses the first kmer as
    # the seed — and that k-mer is missing
    assert not read_aln
    assert not graph_aln
    assert trunc
def test_simple_readalign():
    """A read with scattered mismatches aligns to the graph consensus;
    edited positions appear as lowercase/gap characters."""
    countgraph = khmer.CountingHash(10, 1048576, 1)
    aligner = khmer.ReadAligner(countgraph, 2, 0)
    for _ in range(20):
        countgraph.consume("AGAGGGAAAGCTAGGTTCGACATGTCCTTGACAGAT")
    query = "ACCTAGGTTCGACAAGTACC"
    # mismatching positions relative to the consensus
    countgraph.consume("GCTTTTAAAAAGGTTCGACAAAGGCCCGGG")
    #                   CCCGGGCCTTTGTCGAACCTTTTTAAAAGC
    score, graph_aln, read_aln, trunc = aligner.align(query)

    # expected alignment:
    #   AGCTAGGTTCGACAAGT CCT
    #   ACCTAGGTTCGACAAGTaCC
    #   --CTAGGTTCGACATGT-CC
    eq_(graph_aln, 'AGCTAGGTTCGACATGTCC-')
    eq_(read_aln, 'ACCTAGGTTCGACAAGTACc')
def main():
    """Align each input read against a loaded counting hash and emit the
    graph-side alignment as FASTA on stdout, with verbose per-read
    logging on stderr. Python 2 script.
    """
    parser = build_counting_args()
    parser.add_argument("--trusted-cov", dest="trusted_cov", type=int,
                        default=2)
    parser.add_argument("--theta", type=float, default=1.0)
    parser.add_argument("input_table")
    parser.add_argument("input_filenames", nargs="+")
    add_loadhash_args(parser)

    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print >> sys.stderr, 'file with ht: %s' % counting_ht

    print >> sys.stderr, 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    # NOTE(review): K is assigned but unused below
    K = ht.ksize()

    aligner = khmer.ReadAligner(
        ht, args.trusted_cov, args.theta
    )  # counting hash, trusted kmer coverage cutoff, bits theta (threshold value for terminating unproductive alignemnts)

    ### the filtering loop
    for infile in infiles:
        print >> sys.stderr, 'aligning', infile
        for n, record in enumerate(screed.open(infile)):
            name = record['name']
            seq = record['sequence'].upper()
            print >> sys.stderr, name
            print >> sys.stderr, seq

            score, graph_alignment, read_alignment, truncated = aligner.align(
                seq)
            print >> sys.stderr, score
            print >> sys.stderr, graph_alignment
            print >> sys.stderr, read_alignment
            print >> sys.stderr, truncated
            # emit the graph-side alignment as the output record
            print ">{0}\n{1}".format(name, graph_alignment)
def main():
    """Error-correct reads using the older ReadAligner API and write
    corrected (or original, when alignment fails) sequences as FASTA.

    Python 2 script; argv: <hashfile> <input> <output> <max_error_region>.
    """
    hash_filename = sys.argv[1]
    input_filename = sys.argv[2]
    output_filename = sys.argv[3]
    max_error_region = int(sys.argv[4])

    C = 20  # 20
    corrected = 0
    uncorrected = 0

    outfp = open(output_filename, 'w')

    ht = khmer.load_counting_hash(hash_filename)
    # older 4-argument ReadAligner constructor
    aligner = khmer.ReadAligner(ht, 1, C, max_error_region)
    # NOTE(review): K is assigned but unused below
    K = ht.ksize()

    for n, record in enumerate(screed.open(input_filename)):
        if n % 1000 == 0:
            print n

        seq = record.sequence
        seq_name = record.name

        seq = seq.replace('N', 'A')  # aligner can't handle ambiguous bases

        # older API: returns (graph alignment, read alignment) only
        grXreAlign, reXgrAlign = aligner.align(seq)

        if len(reXgrAlign) > 0:
            # non-empty alignment => correction succeeded
            graph_seq = grXreAlign.replace('-', '')
            corrected += 1
            outfp.write('>%s\n%s\n' % (seq_name, graph_seq))
        else:
            uncorrected += 1
            outfp.write('>%s\n%s\n' % (seq_name, seq))

    print 'corrected', corrected
    print 'uncorrected', uncorrected

    outfp.close()
def test_align_fwd_middle_trunc():
    """align_forward stops where the graph ends, truncating the read."""
    countgraph = khmer.Countgraph(10, 1048576, 1)
    query = "TCGACAAGTCCTTGACAGATGGGGGG"
    aligner = khmer.ReadAligner(countgraph, 0, 0)
    for _ in range(20):
        countgraph.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")

    # the last five bases of the query are deliberately not in the graph
    countgraph.consume(query[:-5])
    score, graph_aln, read_aln, trunc, _ = aligner.align_forward(query)

    # the full read cannot be reproduced...
    neq_(read_aln, query)
    neq_(graph_aln, query)
    # ...only its in-graph prefix...
    eq_(read_aln, query[:-5])
    eq_(graph_aln, query[:-5])
    # ...and the result is flagged truncated
    assert trunc
def test_align_middle_trunc_2():
    """Bidirectional align recovers the in-graph suffix of a read whose
    prefix is missing from the graph, and reports truncation."""
    countgraph = khmer.Countgraph(10, 1048576, 1)
    query = "GGGGGGGGGGGGTCGACAAGTCCTTGACAGAT"
    aligner = khmer.ReadAligner(countgraph, 0, 0)
    for _ in range(20):
        countgraph.consume("AAAAAAAAAAAATCGACAAGTCCTTGACAGAT")

    # the query's 12-base prefix is deliberately left out of the graph
    countgraph.consume(query[12:])
    _, graph_aln, read_aln, trunc = aligner.align(query)

    # the alignment must start past the missing prefix
    print(read_aln)
    print(graph_aln)
    eq_(read_aln, query[12:])
    eq_(graph_aln, query[12:])
    # ...but be flagged truncated
    assert trunc
def test_align_fwd_covs_1():
    """align_forward reports per-base coverages; one divergent background
    sequence lowers only the first position's count."""
    ksize = 10
    countgraph = khmer.Countgraph(ksize, 1048576, 1)
    query = "GTCGACAAGTCCTTGACAGAT"
    aligner = khmer.ReadAligner(countgraph, 0, 0)
    for _ in range(19):
        countgraph.consume(query)
    countgraph.consume("CTCGACAAGTCCTTGACAGAT")
    #                   ^ differs from the query's first base
    score, g, r, is_t, covs = aligner.align_forward(query)

    # debugging aid: dump the raw k-mer counts along the read
    for offset in range(0, len(query) - ksize + 1):
        print(countgraph.get(query[offset:offset + ksize]), end=' ')
    print('')

    assert len(covs) == len(query)
    assert covs[0] == 19
    assert min(covs[1:-ksize]) == 20, covs
    assert max(covs) == 20, covs
def test_2():
    """Exercise align_long + AlignmentIndex on a simple haplotype data
    set. Python 2 test; requires local data files.
    """
    ct = khmer.new_counting_hash(20, 1.1e6, 4)
    ct.consume_fasta('simple-haplo-reads.fa.keep')

    aligner = khmer.ReadAligner(ct, 5, 1.0)

    seq = "".join("""GTCCTGGCGGTCCCCATTCA
CTGCCATTGCCCCAAGCATGTTGGGGCGAGACCCTAGCGCATCTATTGACGATAGTCTAAATCGGCGAATTACGTAGCT
GTAGGAAGTCACATGTGCTAAATATCAG
TGATTCGCATCTTTCACCGCCGTACCAAGTGGAACCGGGGCCACCGCGTGTGTTATAACCTAT
""".strip().split())
    # NOTE(review): 'seq' above is immediately overwritten here — the
    # inline literal appears to be dead; confirm before removing.
    seq = list(screed.open('simplefoo.fa'))[0].sequence

    score, alignment = align_long(ct, aligner, seq)

    print len(seq), alignment.refseqlen()

    for start in range(0, len(alignment), 60):
        print alignment[start:start+60]

    gidx = AlignmentIndex(alignment)
    gidx._sanityCheck(seq)
def main():
    """Align reference sequences against a counting table and pretty-print
    the graph/read alignment in 60-column rows with a '|' match ruler.

    Python 2 script; usage: <prog> <table> <ref>.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('table')
    parser.add_argument('ref')
    args = parser.parse_args()

    ct = khmer.load_counting_hash(args.table)
    aligner = khmer.ReadAligner(ct, 5, 1.0)

    for record in screed.open(args.ref):
        s = record.sequence
        s = s.replace('N', 'A')  # aligner can't handle ambiguous bases

        score, graph_alignment, read_alignment, truncated = \
            aligner.align(s)
        #assert not truncated

        g = graph_alignment  #.replace('-', '')
        r = read_alignment  #.replace('-', '')

        # build three display rows: graph, match ruler, read
        line1 = []
        line2 = []
        line3 = []
        for n, (a, b) in enumerate(zip(g, r)):
            line1.append(a)
            line3.append(b)
            if a != b:
                line2.append(' ')
            else:
                line2.append('|')

        print '::', record.name, score, truncated
        for start in range(0, len(line1), 60):
            print "".join(line1[start:start + 60])
            print "".join(line2[start:start + 60])
            print "".join(line3[start:start + 60])
            print '--'
def main():
    """Stream reads, align each against a growing counting hash, and keep
    the graph-corrected sequence in <input>.keepalign.

    Diginorm-style discarding is present but commented out, so currently
    every alignable read is kept. Python 2 script.
    """
    parser = build_counting_args()
    parser.add_argument("-t", "--trusted-cutoff", dest="trusted_cutoff",
                        type=int, default=3)
    parser.add_argument(
        "--bits-theta", help=
        "Tuning parameter controlling trade off of speed vs alignment sensitivity",
        default=1.0, type=float, dest="bits_theta")
    parser.add_argument('-C', '--cutoff', type=int, dest='cutoff',
                        default=DEFAULT_MINIMUM_COVERAGE)
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash', default='')
    parser.add_argument('--details-out', dest="details_out")
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        print >> sys.stderr, '\nPARAMETERS:'
        print >> sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize
        print >> sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_tables
        print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \
            args.min_tablesize
        print >> sys.stderr, ''
        print >>sys.stderr, 'Estimated memory usage is %.2g bytes ' \
            '(n_hashes x min_hashsize)' % (
                args.n_tables * args.min_tablesize)
        print >> sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_tablesize
    N_HT = args.n_tables
    # NOTE(review): DESIRED_COVERAGE is unused while the diginorm branch
    # below stays commented out
    DESIRED_COVERAGE = args.cutoff

    filenames = args.input_filenames

    if args.loadhash:
        print 'loading hashtable from', args.loadhash
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print 'making hashtable'
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    aligner = khmer.ReadAligner(ht, args.trusted_cutoff, args.bits_theta)

    if args.details_out != None:
        details_out = open(args.details_out, "w")
    else:
        details_out = None

    total = 0
    discarded = 0
    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keepalign'
        outfp = open(output_name, 'w')

        for n, record in enumerate(screed.open(input_filename)):
            if n > 0 and n % 10000 == 0:
                print '... kept', total - discarded, 'of', total, ', or', \
                    int(100. - discarded / float(total) * 100.), '%'
                print '... in file', input_filename

            total += 1

            if len(record.sequence) < K:
                continue

            seq = record.sequence.upper().replace('N', 'A')

            ##
            score, graph_alignment, read_alignment, truncated = aligner.align(
                record.sequence)

            keep = False
            if truncated:
                keep = True
            else:
                if False:
                    graph_seq = graph_alignment.replace("-", "")
                else:
                    # rebuild the corrected sequence column by column,
                    # taking the read base wherever the graph has a gap
                    graph_seq = ""
                    for i in range(len(graph_alignment)):
                        if graph_alignment[i] == "-":
                            graph_seq += read_alignment[i]
                        else:
                            graph_seq += graph_alignment[i]

                mincount = ht.get_min_count(graph_seq)
                keep = True
                seq = graph_seq

                #if mincount < DESIRED_COVERAGE:
                #    keep = True
                #    seq = graph_seq
                #else:
                #    assert not keep

            if details_out != None:
                details_out.write(
                    "+{7}\t{0:0.2f}\t{3}\t{4}\nread: {6}\ngraph_aln: {1}\nread_aln: {2}\nstored_seq:{5}\n"
                    .format(score, graph_alignment, read_alignment, truncated,
                            keep, seq, record.sequence, record.name))

            if keep:
                ht.consume(seq)
                outfp.write('>%s\n%s\n' % (record.name, seq))
            else:
                discarded += 1

        if total:
            print 'DONE with', input_filename, '; kept', total - discarded, 'of',\
                total, 'or', int(100. - discarded / float(total) * 100.), '%'
        print 'output in', output_name

        if args.savehash:
            print 'Saving hashfile through', input_filename
            print '...saving to', args.savehash
            ht.save(args.savehash)

        # Change 0.2 only if you really grok it. HINT: You don't.
        fp_rate = khmer.calc_expected_collisions(ht)
        print 'fp rate estimated to be %1.3f' % fp_rate

        if fp_rate > 0.20:
            print >> sys.stderr, "**"
            print >> sys.stderr, "** ERROR: the counting hash is too small for"
            print >> sys.stderr, "** this data set. Increase hashsize/num ht."
            print >> sys.stderr, "**"
            print >> sys.stderr, "** Do not use these results!!"
            sys.exit(-1)
def main():
    """Collect variant-carrying reads: align each read to a growing
    counting hash and keep reads that are truncated or whose corrected
    sequence is still below the -Z abundance cutoff (output: .keepvar).

    Python 3 script.
    """
    parser = build_counting_args()
    parser.add_argument("-t", "--trusted-cutoff", dest="trusted_cutoff",
                        type=int, default=3)
    parser.add_argument("--bits-theta", help="Tuning parameter controlling"
                        "trade off of speed vs alignment sensitivity",
                        default=1.0, type=float, dest="bits_theta")
    parser.add_argument('--normalize-to', '-Z', type=int, dest='normalize_to',
                        help='base cutoff on abundance',
                        default=DEFAULT_NORMALIZE_LIMIT)
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash',
                        default='')
    parser.add_argument('--details-out', dest="details_out")
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        print('\nPARAMETERS:', file=sys.stderr)
        print(' - kmer size = %d \t\t(-k)' % args.ksize, file=sys.stderr)
        print(' - n hashes = %d \t\t(-N)' % args.n_tables, file=sys.stderr)
        print(' - min hashsize = %-5.2g \t(-x)' % \
            args.max_tablesize, file=sys.stderr)
        print('', file=sys.stderr)
        print('Estimated memory usage is %.2g bytes ' \
            '(n_hashes x min_hashsize)' % \
            (args.n_tables * args.max_tablesize), file=sys.stderr)
        print('-' * 8, file=sys.stderr)

    K = args.ksize
    HT_SIZE = args.max_tablesize
    N_HT = args.n_tables

    filenames = args.input_filenames

    if args.loadhash:
        print('loading hashtable from', args.loadhash)
        ht = khmer.load_countgraph(args.loadhash)
    else:
        print('making hashtable')
        ht = khmer.Countgraph(K, HT_SIZE, N_HT)

    aligner = khmer.ReadAligner(ht, args.trusted_cutoff, args.bits_theta)

    if args.details_out is not None:
        details_out = open(args.details_out, "w")
    else:
        details_out = None

    total = 0
    discarded = 0
    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keepvar'
        outfp = open(output_name, 'w')

        for n, record in enumerate(screed.open(input_filename)):
            if n > 0 and n % 10000 == 0:
                print('... kept', total - discarded, 'of', total, ', or', \
                    int(100. - discarded / float(total) * 100.), '%')
                print('... in file', input_filename)

            total += 1

            if len(record.sequence) < K:
                continue

            seq = record.sequence.upper().replace('N', 'A')

            ##

            # build the alignment...
            score, graph_alignment, read_alignment, truncated = \
                aligner.align(record.sequence)

            # next, decide whether or to keep it.
            keep = False
            if truncated:
                keep = True  # keep all truncated alignments - why?
            else:
                # build a better sequence -- this is the corrected one.
                graph_seq = graph_alignment.replace("-", "")

                # OR?
                #graph_seq = ""
                #for i in range(len(graph_alignment)):
                #    if graph_alignment[i] == "-":
                #        graph_seq += read_alignment[i]
                #    else:
                #        graph_seq += graph_alignment[i]

                # get the minimum count for this new sequence
                mincount = ht.get_min_count(graph_seq)
                if mincount < args.normalize_to:
                    keep = True

            if details_out is not None:
                details_out.write(
                    "+{7}\t{0:0.2f}\t{3}\t{4}\nread: "
                    "{6}\ngraph_aln: {1}\nread_aln: {2}\nstored_seq:{5}\n"
                    "".format(score, graph_alignment, read_alignment,
                              truncated, keep, seq, record.sequence,
                              record.name))

            if keep:
                ht.consume(seq)
                outfp.write('>%s\n%s\n' % (record.name, record.sequence))
            else:
                discarded += 1

        if total:
            print('DONE with', input_filename, \
                '; kept', total - discarded, 'of', total, 'or', \
                int(100. - discarded / float(total) * 100.), '%')
        print('output in', output_name)

        if args.savehash:
            print('Saving hashfile through', input_filename)
            print('...saving to', args.savehash)
            ht.save(args.savehash)

    # Change 0.2 only if you really grok it. HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht, args.force, max_false_pos=.2)
    print('fp rate estimated to be %1.3f' % fp_rate)
def main():
    """Streaming, two-pass error correction of (possibly paired) reads.

    First pass: reads from unsaturated graph regions are consumed and set
    aside for pass two; reads from saturated regions are corrected via
    ReadAligner and written out. Second pass: the set-aside reads are
    corrected (or passed through under --variable-coverage). Python 2.
    """
    info('correct-reads.py', ['streaming'])
    args = sanitize_help(get_parser()).parse_args()

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        print >>sys.stderr, \
            "Error: Cannot input the same filename multiple times."
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph:
        check_space_for_graph(
            args.n_tables * args.min_tablesize, args.force)

    K = args.ksize

    CUTOFF = args.cutoff
    NORMALIZE_LIMIT = args.normalize_to

    if args.loadgraph:
        print >>sys.stderr, 'loading k-mer countgraph from', args.loadgraph
        ct = khmer.load_countgraph(args.loadgraph)
    else:
        print >>sys.stderr, 'making k-mer countgraph'
        ct = khmer.new_countgraph(K, args.min_tablesize, args.n_tables)

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print >>sys.stderr, 'created temporary directory %s; ' \
        'use -T to change location' % tempdir

    aligner = khmer.ReadAligner(ct, args.cutoff, args.bits_theta)

    # ### FIRST PASS ###

    save_pass2_total = 0

    n_bp = 0
    n_reads = 0
    written_bp = 0
    written_reads = 0
    corrected_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        if args.out is None:
            corrfp = open(os.path.basename(filename) + '.corr', 'w')
        else:
            corrfp = args.out

        pass2list.append((filename, pass2filename, corrfp))

        screed_iter = screed.open(filename, parse_description=False)
        pass2fp = open(pass2filename, 'w')

        save_pass2 = 0
        n = 0

        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=args.ignore_pairs)
        for n, is_pair, read1, read2 in paired_iter:
            if n % 10000 == 0:
                print >>sys.stderr, '...', n, filename, save_pass2, \
                    n_reads, n_bp, written_reads, written_bp

            # we want to track paired reads here, to make sure that pairs
            # are not split between first pass and second pass.

            if is_pair:
                n_reads += 2
                n_bp += len(read1.sequence) + len(read2.sequence)

                seq1 = read1.sequence.replace('N', 'A')
                seq2 = read2.sequence.replace('N', 'A')

                med1, _, _ = ct.get_median_count(seq1)
                med2, _, _ = ct.get_median_count(seq2)

                # keep the pair together: either both go to pass2 or
                # both get corrected now
                if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT:
                    ct.consume(seq1)
                    ct.consume(seq2)
                    write_record_pair(read1, read2, pass2fp)
                    save_pass2 += 2
                else:
                    is_aligned, new_seq1 = correct_sequence(aligner, seq1)
                    if is_aligned:
                        if new_seq1 != read1.sequence:
                            corrected_reads += 1
                        read1.sequence = new_seq1
                        if hasattr(read1, 'quality'):
                            fix_quality(read1)

                    is_aligned, new_seq2 = correct_sequence(aligner, seq2)
                    if is_aligned:
                        if new_seq2 != read2.sequence:
                            corrected_reads += 1
                        read2.sequence = new_seq2
                        if hasattr(read2, 'quality'):
                            fix_quality(read2)

                    write_record_pair(read1, read2, corrfp)
                    written_reads += 2
                    written_bp += len(read1)
                    written_bp += len(read2)
            else:
                n_reads += 1
                n_bp += len(read1.sequence)

                seq = read1.sequence.replace('N', 'A')

                med, _, _ = ct.get_median_count(seq)

                # has this portion of the graph saturated? if not,
                # consume & save => pass2.
                if med < NORMALIZE_LIMIT:
                    ct.consume(seq)
                    write_record(read1, pass2fp)
                    save_pass2 += 1
                else:  # trim!!
                    is_aligned, new_seq = correct_sequence(aligner, seq)
                    if is_aligned:
                        if new_seq != read1.sequence:
                            corrected_reads += 1
                        read1.sequence = new_seq
                        if hasattr(read1, 'quality'):
                            fix_quality(read1)

                        write_record(read1, corrfp)

                        written_reads += 1
                        written_bp += len(new_seq)

        pass2fp.close()

        print >>sys.stderr, '%s: kept aside %d of %d from first pass, in %s' \
            % (filename, save_pass2, n, filename)
        save_pass2_total += save_pass2

    # ### SECOND PASS. ###

    skipped_n = 0
    skipped_bp = 0
    for _, pass2filename, corrfp in pass2list:
        print >>sys.stderr, ('second pass: looking at sequences kept aside '
                             'in %s') % pass2filename

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned. This is in contrast
        # to the first loop.

        for n, read in enumerate(screed.open(pass2filename,
                                             parse_description=False)):
            if n % 10000 == 0:
                print >>sys.stderr, '... x 2', n, pass2filename, \
                    written_reads, written_bp

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ct.get_median_count(seq)

            # do we retain low-abundance components unchanged?
            if med < NORMALIZE_LIMIT and args.variable_coverage:
                write_record(read, corrfp)
                written_reads += 1
                written_bp += len(read.sequence)
                skipped_n += 1
                skipped_bp += len(read.sequence)

            # otherwise, examine/correct.
            else:  # med >= NORMALIZE LIMIT or not args.variable_coverage
                is_aligned, new_seq = correct_sequence(aligner, seq)
                if is_aligned:
                    if new_seq != read.sequence:
                        corrected_reads += 1
                    read.sequence = new_seq
                    if hasattr(read, 'quality'):
                        fix_quality(read)
                    write_record(read, corrfp)

                    written_reads += 1
                    written_bp += len(new_seq)

        print >>sys.stderr, 'removing %s' % pass2filename
        os.unlink(pass2filename)

    print >>sys.stderr, 'removing temp directory & contents (%s)' % tempdir
    shutil.rmtree(tempdir)

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_corrected = float(corrected_reads +
                                    (n_reads - written_reads)) /\
        n_reads * 100.0

    print >>sys.stderr, 'read %d reads, %d bp' % (n_reads, n_bp,)
    print >>sys.stderr, 'wrote %d reads, %d bp' % (written_reads, written_bp,)
    print >>sys.stderr, 'looked at %d reads twice (%.2f passes)' % \
        (save_pass2_total, n_passes)
    print >>sys.stderr, 'removed %d reads and corrected %d reads (%.2f%%)' % \
        (n_reads - written_reads, corrected_reads, percent_reads_corrected)
    print >>sys.stderr, 'removed %.2f%% of bases (%d total)' % \
        ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
        print >>sys.stderr, '%d reads were high coverage (%.2f%%);' % \
            (n_reads - skipped_n, percent_reads_hicov)
        print >>sys.stderr, ('skipped %d reads/%d bases because of low'
                             'coverage') % (skipped_n, skipped_bp)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    print >>sys.stderr, \
        'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    print >>sys.stderr, 'output in *.corr'

    if args.savegraph:
        print >>sys.stderr, "Saving k-mer countgraph to", args.savegraph
        ct.save(args.savegraph)
def test_graph_attribute():
    """ReadAligner exposes the countgraph it was built with via .graph."""
    countgraph = khmer.Countgraph(10, 1048576, 1)
    aligner = khmer.ReadAligner(countgraph, 0, 0)
    assert aligner.graph is countgraph
def main():
    """Two-pass streaming error correction (older API, Python 2).

    First pass: align each read; emit corrected reads while their graph
    region is unsaturated, otherwise consume & set aside. Second pass:
    correct the set-aside reads and append them to the .corr output.
    """
    parser = argparse.ArgumentParser(description='XXX')

    env_ksize = os.environ.get('KHMER_KSIZE', DEFAULT_K)
    env_n_hashes = os.environ.get('KHMER_N_HASHES', DEFAULT_N_HT)
    env_hashsize = os.environ.get('KHMER_MIN_HASHSIZE', DEFAULT_MIN_HASHSIZE)

    parser.add_argument('--ksize', '-k', type=int, dest='ksize',
                        default=env_ksize, help='k-mer size to use')
    parser.add_argument('--n_hashes', '-N', type=int, dest='n_hashes',
                        default=env_n_hashes,
                        help='number of hash tables to use')
    parser.add_argument('--hashsize', '-x', type=float, dest='min_hashsize',
                        default=env_hashsize,
                        help='lower bound on hashsize to use')
    parser.add_argument("--trusted-cov", dest="trusted_cov", type=int,
                        default=DEFAULT_CUTOFF)
    parser.add_argument("--theta", dest="bits_theta", type=float, default=1.0)
    parser.add_argument('--normalize-to', '-Z', type=int, dest='normalize_to',
                        help='base cutoff on median k-mer abundance of this',
                        default=DEFAULT_NORMALIZE_LIMIT)
    parser.add_argument('--tempdir', '-T', type=str, dest='tempdir',
                        default='./')
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    # NOTE(review): NORMALIZE_LIMIT is unused; args.normalize_to is read
    # directly below
    NORMALIZE_LIMIT = args.normalize_to

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    aligner = khmer.ReadAligner(ht, args.trusted_cov, args.bits_theta)

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print 'created temporary directory %s; use -T to change location' % tempdir

    ###

    save_pass2 = 0
    n_aligned = 0
    n_corrected = 0
    total_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        corrfilename = os.path.basename(filename) + '.corr'
        pass2list.append((filename, pass2filename, corrfilename))

        pass2fp = open(pass2filename, 'w')
        corrfp = open(corrfilename, 'w')

        for n, read in enumerate(screed.open(filename)):
            total_reads += 1

            if n % 10000 == 0:
                print '...', n, filename, n_aligned, n_corrected, save_pass2, \
                    total_reads

            # NOTE(review): seq is computed but align() below is fed the
            # raw read.sequence
            seq = read.sequence.replace('N', 'A')

            # build the alignment...
            score, graph_alignment, read_alignment, truncated = \
                aligner.align(read.sequence)

            # next, decide whether or to keep it.
            output_corrected = False
            if not truncated:
                n_aligned += 1

                # build a better sequence -- this is the corrected one.
                if True:
                    graph_seq = graph_alignment.replace("-", "")
                else:
                    graph_seq = ""
                    for i in range(len(graph_alignment)):
                        if graph_alignment[i] == "-":
                            graph_seq += read_alignment[i]
                        else:
                            graph_seq += graph_alignment[i]

                corrected = graph_seq
                if graph_seq != read.sequence:
                    n_corrected += 1

                # get the minimum count for this new sequence
                mincount = ht.get_min_count(graph_seq)
                if mincount < args.normalize_to:
                    output_corrected = True

            # has this portion of the graph saturated? if not,
            # consume & save => pass2.
            if output_corrected:
                corrfp.write(output_single(read, corrected))
            else:  # uncorrected...
                ht.consume(read.sequence)
                pass2fp.write(output_single(read, read.sequence))
                save_pass2 += 1

        pass2fp.close()
        corrfp.close()

        print '%s: kept aside %d of %d from first pass, in %s' % \
            (filename, save_pass2, n, filename)
        print 'aligned %d of %d reads so far' % (n_aligned, total_reads)
        print 'changed %d of %d reads so far' % (n_corrected, total_reads)

    for orig_filename, pass2filename, corrfilename in pass2list:
        print 'second pass: looking at sequences kept aside in %s' % \
            pass2filename
        for n, read in enumerate(screed.open(pass2filename)):
            if n % 10000 == 0:
                print '... x 2', n, pass2filename, n_aligned, n_corrected, \
                    total_reads

            # NOTE(review): reopened in append mode for every read;
            # presumably harmless but wasteful — confirm before changing
            corrfp = open(corrfilename, 'a')

            # build the alignment...
            score, graph_alignment, read_alignment, truncated = \
                aligner.align(read.sequence)

            if truncated:  # no good alignment; output original
                corrected = read.sequence
            else:
                n_aligned += 1
                # build a better sequence -- this is the corrected one.
                if True:
                    graph_seq = graph_alignment.replace("-", "")
                else:
                    graph_seq = ""
                    for i in range(len(graph_alignment)):
                        if graph_alignment[i] == "-":
                            graph_seq += read_alignment[i]
                        else:
                            graph_seq += graph_alignment[i]

                corrected = graph_seq
                if corrected != read.sequence:
                    n_corrected += 1

            corrfp.write(output_single(read, corrected))

        print 'removing %s' % pass2filename
        os.unlink(pass2filename)

    print 'removing temp directory & contents (%s)' % tempdir
    shutil.rmtree(tempdir)

    print 'Aligned %d of %d total' % (n_aligned, total_reads)
    print 'Changed %d of %d total' % (n_corrected, total_reads)
def main():
    """Map reads back to reference positions via graph tags.

    Tags the reference in a counting hash, aligns each read to the graph,
    uses tag positions to find candidate reference regions, then re-aligns
    each region against a per-read mini-graph to report exact coordinates.
    Python 2 script.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('reference')
    parser.add_argument('readfile')
    args = parser.parse_args()

    ct = khmer.new_counting_hash(21, 1e7, 4)

    tags_to_positions = {}
    references = {}
    for record in screed.open(args.reference):
        # store for later retrieval - in memory, for now.
        references[record.name] = record.sequence

        # load into graph & tag
        ct.consume_and_tag(record.sequence)

        # track positions in reference by tag
        tagposns = ct.get_tags_and_positions(record.sequence)
        for pos, tag in tagposns:
            x = tags_to_positions.get(tag, [])
            x.append((record.name, pos))
            tags_to_positions[tag] = x

    # now, walk through the reads and map to graph
    aligner = khmer.ReadAligner(ct, 0, 1.0)
    for read in screed.open(args.readfile):
        # align to graph, where possible
        readseq = read.sequence.replace('N', 'A')
        score, g, r, truncated = aligner.align(readseq)
        if truncated:
            print >> sys.stderr, "IGNORING read", read.name
            continue

        # find locations in reference where read alignment overlaps a tag
        refseq = g.replace('-', '')
        ptags = ct.get_tags_and_positions(refseq)
        assert len(ptags)

        refposns = []
        for pos, tag in ptags:
            refposns.extend(tags_to_positions[tag])

        # extract the larger region, remap read to get exact positions
        regions = turn_locations_into_regions(refposns)
        for (ref, start, end) in regions:
            # pull out reference region
            referenceseq = references[ref]
            start = max(start - REGIONSIZE / 2, 0)
            end = min(end + REGIONSIZE / 2, len(referenceseq))
            regionseq = referenceseq[start:end]

            # align region back to read, via a throwaway mini-graph
            nct = khmer.new_counting_hash(21, 1e5, 4)
            nct.consume(readseq)
            naligner = khmer.ReadAligner(nct, 1, 1.0)
            score, galign = graphAlignment.align_long(nct, naligner,
                                                      regionseq)

            # trim leading '=' (unaligned) columns; n = first aligned col
            for n, (a, b) in enumerate(galign):
                if a != '=':
                    break

            # trim trailing '=' columns; o = one past last aligned col
            o = len(galign)
            while 1:
                (a, b) = galign[o - 1]
                if a != '=':
                    break
                o -= 1

            # sanity: the kept window must contain no unaligned columns
            if '=' in galign[n:o].g:
                assert 0

            gidx = graphAlignment.AlignmentIndex(galign)

            print 'Read %s aligns to %s[%s:%s]' % (read.name, ref,
                                                   start + n, start + o)
            print galign[n:o]