def run_bwa_py_sampe_alloc_only(refseq_fname, read_fname, mate_fname):
    """Allocate and release the BWA data structures, reporting memory use.

    Reads every read/mate pair from the two 'fastq-illumina' files, builds
    the BWA sequence arrays, restores the index and the reference for
    ``refseq_fname`` and creates the option / insert-size structures, then
    frees everything again without running any alignment step.
    ``print_meminfo`` is called after each stage.

    Args:
        refseq_fname: name passed to ``bwa.restore_index`` and
            ``bwa.restore_reference``.
        read_fname: path of the FASTQ file with the first reads.
        mate_fname: path of the FASTQ file with the mates.
    """
    # The pairs are fully materialised while the files are open, so the
    # handles can be closed right away instead of being left to the GC.
    with open(read_fname) as read_file:
        with open(mate_fname) as mate_file:
            read_flow = Bio.SeqIO.parse(read_file, 'fastq-illumina')
            mate_flow = Bio.SeqIO.parse(mate_file, 'fastq-illumina')
            pairs = list(it.izip(read_flow, mate_flow))
    print_meminfo("AFTER READING PAIRS")
    bwsa = bwa.build_bws_array(pairs)
    print_meminfo("AFTER BUILDING BWSA")
    bwts = bwa.restore_index(refseq_fname)
    print_meminfo("AFTER RESTORING INDEX")
    bnsp, pacseq = bwa.restore_reference(refseq_fname)
    print_meminfo("AFTER RESTORING REFERENCE")
    # Not used further down: they are created only so that the following
    # checkpoint includes them, mirroring the set-up of a real run.
    gopt, popt = bwa.gap_init_opt(), bwa.pe_init_opt()
    ii, last_ii = bwa.isize_info_t(), bwa.isize_info_t()
    last_ii.avg = -1.0
    n_pairs = len(pairs)
    print_meminfo("AFTER INIT OPT & II")
    # deallocate seq & ref data
    for i in 0, 1:
        bwa.free_seq(n_pairs, bwsa[i])
        bwa.bwt_destroy(bwts[i])
    bwa.bns_destroy(bnsp)
    print_meminfo("AFTER DEALLOC")
    del pacseq
    n_unreachable = gc.collect()
    logging.debug("n_unreachable = %d", n_unreachable)
    print_meminfo("AFTER DEL PACSEQ")
    del pairs
    n_unreachable = gc.collect()
    logging.debug("n_unreachable = %d", n_unreachable)
    print_meminfo("AFTER DEL PAIRS")
def build_bws_array_with_trim_qual_zero(self):
    # With trim_qual=0 nothing may be clipped: for both mates of every
    # fixture pair clip_len has to equal full_len.
    fixture_pairs = self.load_pairs_fixture("pairs.txt")
    arrays = bwa.build_bws_array(
        fixture_pairs, qtype="fastq-illumina", src="qseq", trim_qual=0
    )
    n_pairs = len(fixture_pairs)
    for mate_idx in (0, 1):
        for pair_idx in xrange(n_pairs):
            bwseq = arrays[mate_idx][pair_idx]
            self.assertEqual(
                bwseq.full_len, bwseq.clip_len, "unexpected sequence trimming"
            )
def process_sequences(bwts, bns, pacseq, seq_reader, N, analyze_seqs=None):
    """Run the paired-end BWA steps batch by batch over ``seq_reader``.

    Batches are fetched with ``read_seq_pairs(seq_reader, N)`` until an
    empty one comes back.  Each batch is turned into BWA sequence arrays,
    run through ``cal_sa_reg_gap``, ``cal_pac_pos_pe``, ``paired_sw`` and
    ``refine_gapped``, optionally passed to ``analyze_seqs`` and freed.
    The insert-size statistics of every batch are written to stderr.

    Args:
        bwts: index handed to ``cal_sa_reg_gap`` / ``cal_pac_pos_pe``.
        bns: reference annotations handed to ``paired_sw`` /
            ``refine_gapped``.
        pacseq: packed reference sequence.
        seq_reader: source of read pairs for ``read_seq_pairs``.
        N: batch size requested from ``read_seq_pairs`` (presumably an
            upper bound -- TODO confirm against that helper).
        analyze_seqs: optional callable invoked as
            ``analyze_seqs(gopt[0], bns, n_pairs, bwsa)`` per batch.
            When left as ``None`` the batch is aligned and then discarded.
    """
    gopt = bwa.gap_init_opt()
    popt = bwa.pe_init_opt()
    ii = bwa.isize_info_t()
    last_ii = bwa.isize_info_t()
    last_ii.avg = -1.0
    while True:
        pairs = read_seq_pairs(seq_reader, N)
        n_pairs = len(pairs)
        if n_pairs == 0:
            break
        bwsa = bwa.build_bws_array(pairs)
        try:
            for mate in 0, 1:
                bwa.cal_sa_reg_gap(0, bwts, n_pairs, bwsa[mate], gopt)
            bwa.cal_pac_pos_pe(bwts, n_pairs, bwsa, ii, popt, gopt, last_ii)
            sys.stderr.write(
                'ii: %r\n' % ([ii.avg, ii.std, ii.low, ii.high],))
            bwa.paired_sw(bns, pacseq, n_pairs, bwsa, popt, ii)
            for mate in 0, 1:
                bwa.refine_gapped(bns, n_pairs, bwsa[mate], pacseq)
            # The parameter defaults to None, so it must not be called
            # unconditionally.
            if analyze_seqs is not None:
                analyze_seqs(gopt[0], bns, n_pairs, bwsa)
        finally:
            # The arrays were built from `pairs`, so release exactly that
            # many entries: a short final batch holds fewer than N.  Doing
            # it in `finally` avoids leaking them when a step raises.
            for mate in 0, 1:
                bwa.free_seq(n_pairs, bwsa[mate])
def process_sequences(bwts, bns, pacseq, seq_reader, N, analyze_seqs=None):
    """Align read pairs from ``seq_reader`` in batches.

    ``read_seq_pairs(seq_reader, N)`` is called repeatedly; an empty result
    ends the loop.  Every batch goes through ``build_bws_array``,
    ``cal_sa_reg_gap`` (both mates), ``cal_pac_pos_pe``, ``paired_sw`` and
    ``refine_gapped`` (both mates).  The resulting arrays are given to
    ``analyze_seqs`` when one is supplied and are always freed afterwards.
    Insert-size statistics (avg, std, low, high) go to stderr per batch.

    Args:
        bwts: index used by the suffix-array and pairing steps.
        bns: reference annotations used by the Smith-Waterman and
            refinement steps.
        pacseq: packed reference sequence.
        seq_reader: object ``read_seq_pairs`` pulls the pairs from.
        N: number of pairs requested per batch (presumably a maximum --
            TODO confirm against ``read_seq_pairs``).
        analyze_seqs: optional callback called with
            ``(gopt[0], bns, n_pairs, bwsa)``; skipped when ``None``.
    """
    gopt = bwa.gap_init_opt()
    popt = bwa.pe_init_opt()
    ii = bwa.isize_info_t()
    last_ii = bwa.isize_info_t()
    last_ii.avg = -1.0
    while True:
        pairs = read_seq_pairs(seq_reader, N)
        n_pairs = len(pairs)
        if n_pairs == 0:
            break
        bwsa = bwa.build_bws_array(pairs)
        try:
            for mate in 0, 1:
                bwa.cal_sa_reg_gap(0, bwts, n_pairs, bwsa[mate], gopt)
            bwa.cal_pac_pos_pe(bwts, n_pairs, bwsa, ii, popt, gopt, last_ii)
            sys.stderr.write(
                'ii: %r\n' % ([ii.avg, ii.std, ii.low, ii.high],))
            bwa.paired_sw(bns, pacseq, n_pairs, bwsa, popt, ii)
            for mate in 0, 1:
                bwa.refine_gapped(bns, n_pairs, bwsa[mate], pacseq)
            # Guard the call: the default value of the parameter is None.
            if analyze_seqs is not None:
                analyze_seqs(gopt[0], bns, n_pairs, bwsa)
        finally:
            # Free as many entries as were actually built (the last batch
            # can be shorter than N), even if one of the steps failed.
            for mate in 0, 1:
                bwa.free_seq(n_pairs, bwsa[mate])
def build_bws_array_with_trim_qual(self):
    # With trim_qual=15 at least one read among the first five fixture
    # pairs must come out clipped (clip_len shorter than full_len).
    fixture_pairs = self.load_pairs_fixture("pairs.txt")
    arrays = bwa.build_bws_array(
        fixture_pairs, qtype="fastq-illumina", src="qseq", trim_qual=15
    )
    was_clipped = []
    for pair_idx in xrange(5):
        for mate_idx in (0, 1):
            bwseq = arrays[mate_idx][pair_idx]
            was_clipped.append(bwseq.full_len > bwseq.clip_len)
    self.assertTrue(any(was_clipped))
def build_bws_array_with_trim_qual_zero(self):
    # trim_qual=0 disables quality trimming, so each sequence of either
    # mate is expected to keep its whole length.
    loaded = self.load_pairs_fixture("pairs.txt")
    built = bwa.build_bws_array(
        loaded, qtype="fastq-illumina", src="qseq", trim_qual=0
    )
    total = len(loaded)
    for which_mate in (0, 1):
        for position in xrange(total):
            entry = built[which_mate][position]
            self.assertEqual(
                entry.full_len, entry.clip_len, "unexpected sequence trimming"
            )
def build_bws_array(self):
    # FIXME: plenty of UGLY code
    # For every FASTQ quality encoding, with and without unknown bases,
    # generate `nseq` random read pairs, express them both as SeqRecord
    # pairs ("bioseq") and as flat [name, seq1, qual1, seq2, qual2] lists
    # ("qseq"), and check the arrays built by bwa.build_bws_array.
    nseq = 100
    for fmt in ("fastq-sanger", "fastq-illumina", "fastq-solexa"):
        if fmt == "fastq-solexa":
            qkey = "solexa_quality"
        else:
            qkey = "phred_quality"
        for unknown in (False, True):
            reads = u.random_reads_generator(
                nseq, fmt=fmt, unknown=unknown, pe=True)
            bioseq_pairs = []
            qseq_pairs = []
            for idx, read_pair in enumerate(reads):
                base_name = "foo-%d" % idx
                names = ["%s/%d" % (base_name, k) for k in (1, 2)]
                seq_strings = ["".join(read_pair[k][0]) for k in (0, 1)]
                offset = sg.Q_OFFSET[fmt]
                q_strings = [
                    "".join(chr(q + offset) for q in read_pair[k][1])
                    for k in (0, 1)
                ]
                records = []
                for k in (0, 1):
                    record = SeqRecord(
                        Seq(seq_strings[k], single_letter_alphabet),
                        id=names[k],
                        name=names[k],
                        description=names[k],
                    )
                    record.letter_annotations[qkey] = read_pair[k][1]
                    records.append(record)
                bioseq_pairs.append(records)
                # Flat record: name, then sequence and quality per mate.
                qseq_pairs.append([
                    base_name,
                    seq_strings[0], q_strings[0],
                    seq_strings[1], q_strings[1],
                ])
            read_len = len(bioseq_pairs[0][0])
            assert len(qseq_pairs[0][1]) == read_len
            sources = (("bioseq", bioseq_pairs), ("qseq", qseq_pairs))
            for src, seq_pairs in sources:
                bwsa = bwa.build_bws_array(seq_pairs, qtype=fmt, src=src)
                for j in (0, 1):
                    mate_array = bwsa[j]
                    self.assertTrue(type(mate_array) is bwa.bwa_seq_p_t)
                    for i in xrange(nseq):
                        bwseq = mate_array[i]
                        self.assertEqual(bwseq.len, read_len)
                        # NOTE(review): the bioseq branch always takes the
                        # name of mate 0; only lengths are compared, so it
                        # passes for either mate -- confirm this is meant.
                        exp_name = (
                            bioseq_pairs[i][0].name
                            if src == "bioseq"
                            else "%s/%d" % (qseq_pairs[i][0], (j + 1))
                        )
                        self.assertEqual(
                            len(bwseq.get_name()), len(exp_name))
def main(argv):
    """Align paired FASTQ reads in batches and print the paired names.

    Reads 'fastq-illumina' pairs from the two input files in batches of up
    to 5000, feeds each batch to a ``BWAIterator`` driven by an
    ``MRVisitor`` and writes ``<read name> <mate name>`` to stdout for
    every pair the iterator yields.  The counters of each batch are
    written to stderr as ``name = value``.

    Args:
        argv: command line; ``argv[1:4]`` must be the reference name, the
            reads file and the mates file.

    Raises:
        SystemExit: with a usage message when fewer than three file
            arguments are given.
    """
    try:
        refseq_fname, read_fname, mate_fname = argv[1], argv[2], argv[3]
    except IndexError:
        sys.exit("Usage: %s REFSEQ_FN READ_FN MATE_FN" % sys.argv[0])
    seq_list_len = 5000
    max_isize = pairing_batch_size = 1000
    gopt, popt = bwa.gap_init_opt(), bwa.pe_init_opt()
    # Logger set-up does not depend on the batch, so do it once.
    logger = logging.getLogger("test")
    logger.setLevel(logging.DEBUG)
    # The parsers are consumed lazily by the batch loop, so the whole loop
    # runs inside the `with` blocks; the files are closed on exit.
    with open(read_fname) as read_file:
        with open(mate_fname) as mate_file:
            pairs_flow = it.izip(
                Bio.SeqIO.parse(read_file, 'fastq-illumina'),
                Bio.SeqIO.parse(mate_file, 'fastq-illumina'))
            while True:
                pairs = list(it.islice(pairs_flow, 0, seq_list_len))
                if not pairs:
                    break
                # NOTE(review): index and reference are reloaded for every
                # batch but never used or released here (BWAIterator gets
                # only the file name) -- confirm whether these loads are
                # still needed.
                bwts = bwa.restore_index(refseq_fname)
                bnsp, pacseq = bwa.restore_reference(refseq_fname)
                n_pairs = len(pairs)
                bwsa = bwa.build_bws_array(pairs)
                counters = get_counters()
                ctx = ContextStub()
                visitor = MRVisitor(logger, ctx, counters)
                bwa_iterator = BWAIterator(
                    refseq_fname, gopt, popt, max_isize,
                    pairing_batch_size, visitor)
                for read, mate in bwa_iterator.analyze(bwsa, n_pairs):
                    sys.stdout.write(
                        "%s %s\n" % (read.get_name(), mate.get_name()))
                for j in 0, 1:
                    bwa.free_seq(n_pairs, bwsa[j])
                bwa.bns_destroy(bwa_iterator.bnsp)
                # Counters are created per batch, so report them per batch.
                for cn, c in counters.iteritems():
                    sys.stderr.write("%s = %d\n" % (cn, c.value))
def run_bwa_py_sampe(refseq_fname, read_fname, mate_fname):
    """Run the paired-end BWA steps once, reporting memory after each one.

    All read/mate pairs of the two 'fastq-illumina' files are loaded at
    once, aligned (``cal_sa_reg_gap``, ``cal_pac_pos_pe``, ``paired_sw``,
    ``refine_gapped``, ``analyze_hit``) and then every structure is
    released.  ``print_meminfo`` is called after each stage.

    Args:
        refseq_fname: name passed to ``bwa.restore_index`` and
            ``bwa.restore_reference``.
        read_fname: path of the FASTQ file with the first reads.
        mate_fname: path of the FASTQ file with the mates.
    """
    # Materialise the pairs while the files are open, then close them
    # instead of leaving the handles to the garbage collector.
    with open(read_fname) as read_file:
        with open(mate_fname) as mate_file:
            read_flow = Bio.SeqIO.parse(read_file, 'fastq-illumina')
            mate_flow = Bio.SeqIO.parse(mate_file, 'fastq-illumina')
            pairs = list(it.izip(read_flow, mate_flow))
    print_meminfo("AFTER READING PAIRS")
    bwsa = bwa.build_bws_array(pairs)
    print_meminfo("AFTER BUILDING BWSA")
    bwts = bwa.restore_index(refseq_fname)
    print_meminfo("AFTER RESTORING INDEX")
    bnsp, pacseq = bwa.restore_reference(refseq_fname)
    print_meminfo("AFTER RESTORING REFERENCE")
    gopt, popt = bwa.gap_init_opt(), bwa.pe_init_opt()
    ii, last_ii = bwa.isize_info_t(), bwa.isize_info_t()
    last_ii.avg = -1.0
    n_pairs = len(pairs)
    print_meminfo("AFTER INIT OPT & II")
    for mate in 0, 1:
        bwa.cal_sa_reg_gap(0, bwts, n_pairs, bwsa[mate], gopt)
    print_meminfo("AFTER CAL_SA_REG_GAP")
    bwa.cal_pac_pos_pe(bwts, n_pairs, bwsa, ii, popt, gopt, last_ii)
    print_meminfo("AFTER CAL_PAC_POS_PE")
    bwa.paired_sw(bnsp, pacseq, n_pairs, bwsa, popt, ii)
    print_meminfo("AFTER PAIRED_SW")
    for mate in 0, 1:
        bwa.refine_gapped(bnsp, n_pairs, bwsa[mate], pacseq)
    print_meminfo("AFTER REFINE_GAPPED")
    # Each pair is analysed in both orientations; the results are not
    # kept, only the memory effect of the calls is of interest here.
    for k in xrange(n_pairs):
        bwa.analyze_hit(gopt[0], bnsp, bwsa[0][k], bwsa[1][k])
        bwa.analyze_hit(gopt[0], bnsp, bwsa[1][k], bwsa[0][k])
    print_meminfo("AFTER ANALYZE_HIT")
    # deallocate seq & ref data
    for i in 0, 1:
        bwa.free_seq(n_pairs, bwsa[i])
        bwa.bwt_destroy(bwts[i])
    bwa.bns_destroy(bnsp)
    print_meminfo("AFTER DEALLOC")
    del pacseq
    n_unreachable = gc.collect()
    logging.debug("n_unreachable = %d", n_unreachable)
    print_meminfo("AFTER DEL PACSEQ")
    del pairs
    n_unreachable = gc.collect()
    logging.debug("n_unreachable = %d", n_unreachable)
    print_meminfo("AFTER DEL PAIRS")
def build_bws_array(self):
    # FIXME: plenty of UGLY code
    # Exercises bwa.build_bws_array with randomly generated paired reads,
    # for each quality encoding and for both input representations
    # (SeqRecord pairs and flat qseq-style lists).
    nseq = 100
    for fmt in ("fastq-sanger", "fastq-illumina", "fastq-solexa"):
        if fmt == "fastq-solexa":
            qkey = "solexa_quality"
        else:
            qkey = "phred_quality"
        for unknown in (False, True):
            generator = u.random_reads_generator(
                nseq, fmt=fmt, unknown=unknown, pe=True)
            bioseq_pairs = []
            qseq_pairs = []
            for pair_no, read_pair in enumerate(generator):
                base_name = "foo-%d" % pair_no
                names = ["%s/%d" % (base_name, m) for m in (1, 2)]
                seq_strings = ["".join(read_pair[m][0]) for m in (0, 1)]
                q_offset = sg.Q_OFFSET[fmt]
                q_strings = [
                    "".join(chr(q + q_offset) for q in read_pair[m][1])
                    for m in (0, 1)
                ]
                record_pair = []
                for m in (0, 1):
                    rec = SeqRecord(
                        Seq(seq_strings[m], single_letter_alphabet),
                        id=names[m],
                        name=names[m],
                        description=names[m],
                    )
                    rec.letter_annotations[qkey] = read_pair[m][1]
                    record_pair.append(rec)
                bioseq_pairs.append(record_pair)
                # name, seq/qual of mate 1, seq/qual of mate 2
                flat = [base_name]
                for m in (0, 1):
                    flat.extend([seq_strings[m], q_strings[m]])
                qseq_pairs.append(flat)
            expected_len = len(bioseq_pairs[0][0])
            assert len(qseq_pairs[0][1]) == expected_len
            for src in ("bioseq", "qseq"):
                if src == "bioseq":
                    seq_pairs = bioseq_pairs
                else:
                    seq_pairs = qseq_pairs
                bwsa = bwa.build_bws_array(seq_pairs, qtype=fmt, src=src)
                for j in (0, 1):
                    per_mate = bwsa[j]
                    self.assertTrue(type(per_mate) is bwa.bwa_seq_p_t)
                    for i in xrange(nseq):
                        bwseq = per_mate[i]
                        self.assertEqual(bwseq.len, expected_len)
                        # NOTE(review): for "bioseq" the expected name is
                        # always read from mate 0; harmless while only the
                        # lengths are compared -- confirm the intent.
                        if src == "bioseq":
                            exp_name = bioseq_pairs[i][0].name
                        else:
                            exp_name = "%s/%d" % (qseq_pairs[i][0], (j + 1))
                        self.assertEqual(
                            len(bwseq.get_name()), len(exp_name))
def main(argv):
    """Align the first batch of paired reads and print their multi-hits.

    Loads up to 10000 'fastq-illumina' pairs from the two input files,
    runs them through a ``BWAIterator`` and, for every read with
    ``n_multi > 0``, prints one line per entry of ``read.itermulti()``
    under the header ``READ POS GAPO GAPE MM STRAND SCORE CIGAR``.

    Args:
        argv: command line; ``argv[1:4]`` must be the reference name, the
            reads file and the mates file.

    Raises:
        SystemExit: with a usage message when fewer than three file
            arguments are given.
    """
    try:
        refseq_fname, read_fname, mate_fname = argv[1], argv[2], argv[3]
    except IndexError:
        sys.exit("Usage: %s REFSEQ_FN READ_FN MATE_FN" % sys.argv[0])
    seq_list_len = 10000
    max_isize = pairing_batch_size = 10000
    gopt, popt = bwa.gap_init_opt(), bwa.pe_init_opt()
    # Only the first batch is read; it is materialised while the files are
    # open so the handles can be closed immediately afterwards.
    with open(read_fname) as read_file:
        with open(mate_fname) as mate_file:
            pairs_flow = it.izip(
                Bio.SeqIO.parse(read_file, 'fastq-illumina'),
                Bio.SeqIO.parse(mate_file, 'fastq-illumina'))
            pairs = list(it.islice(pairs_flow, 0, seq_list_len))
    # NOTE(review): index and reference are loaded but neither used nor
    # released below (BWAIterator gets only the file name) -- confirm
    # whether these loads are still needed.
    bwts = bwa.restore_index(refseq_fname)
    bnsp, pacseq = bwa.restore_reference(refseq_fname)
    n_pairs = len(pairs)
    bwsa = bwa.build_bws_array(pairs)
    bwa_iterator = BWAIterator(
        refseq_fname, gopt, popt, max_isize, pairing_batch_size)
    # Kept under its own name so the input records are not shadowed.
    aligned_pairs = list(bwa_iterator.analyze(bwsa, n_pairs))
    sys.stdout.write("READ POS GAPO GAPE MM STRAND SCORE CIGAR\n")
    for read, mate in aligned_pairs:
        if read.n_multi > 0:
            # Blank line between the hit groups of different reads.
            sys.stdout.write("\n")
            for m in read.itermulti():
                fields = (
                    read.get_name(), m.pos, m.n_gapo, m.n_gape, m.n_mm,
                    m.strand, m.score, m.get_cigar(read.len),
                )
                sys.stdout.write(" ".join(str(f) for f in fields) + "\n")
    for j in 0, 1:
        bwa.free_seq(n_pairs, bwsa[j])
    bwa.bns_destroy(bwa_iterator.bnsp)
def build_bws_array_with_trim_qual(self):
    # A trim_qual of 15 is expected to shorten at least one of the reads
    # in the first five fixture pairs (full_len greater than clip_len).
    loaded = self.load_pairs_fixture("pairs.txt")
    built = bwa.build_bws_array(
        loaded, qtype="fastq-illumina", src="qseq", trim_qual=15
    )
    flags = []
    for position in xrange(5):
        for which_mate in (0, 1):
            entry = built[which_mate][position]
            flags.append(entry.full_len > entry.clip_len)
    self.assertTrue(any(flags))
def run_bwa_py_sampe(refseq_fname, read_fname, mate_fname, seq_list_len=None):
    """Run the paired-end BWA steps in batches and track peak memory.

    Read/mate pairs are taken from the two 'fastq-illumina' files in
    batches of ``seq_list_len``.  For every batch the index and reference
    are restored, the alignment steps are run and everything is released
    again; the ``(size, resident)`` pair returned by ``print_meminfo`` is
    recorded after each stage.

    Args:
        refseq_fname: name passed to ``bwa.restore_index`` and
            ``bwa.restore_reference``.
        read_fname: path of the FASTQ file with the first reads.
        mate_fname: path of the FASTQ file with the mates.
        seq_list_len: number of pairs per batch; ``None`` (the default)
            makes ``islice`` take all pairs in a single batch.

    Returns:
        ``(max size, max resident, failed_ii)`` where ``failed_ii`` counts
        the batches whose ``ii.avg`` was negative after
        ``cal_pac_pos_pe``.

    Raises:
        ValueError: from ``max`` when the input holds no pairs, because
            no checkpoint was recorded.
    """
    size_list = []
    resident_list = []
    failed_ii = 0

    def checkpoint(label):
        # Report memory use under `label` and remember both figures.
        size, resident = print_meminfo(label)
        size_list.append(size)
        resident_list.append(resident)

    # The parsers are consumed lazily by the batch loop, so the loop runs
    # inside the `with` blocks and the files are closed when it is done.
    with open(read_fname) as read_file:
        with open(mate_fname) as mate_file:
            read_flow = Bio.SeqIO.parse(read_file, 'fastq-illumina')
            mate_flow = Bio.SeqIO.parse(mate_file, 'fastq-illumina')
            pairs_flow = it.izip(read_flow, mate_flow)
            while True:
                pairs = list(it.islice(pairs_flow, 0, seq_list_len))
                if not pairs:
                    break
                checkpoint("AFTER READING PAIRS")
                bwsa = bwa.build_bws_array(pairs)
                checkpoint("AFTER BUILDING BWSA")
                bwts = bwa.restore_index(refseq_fname)
                checkpoint("AFTER RESTORING INDEX")
                bnsp, pacseq = bwa.restore_reference(refseq_fname)
                checkpoint("AFTER RESTORING REFERENCE")
                gopt, popt = bwa.gap_init_opt(), bwa.pe_init_opt()
                ii, last_ii = bwa.isize_info_t(), bwa.isize_info_t()
                last_ii.avg = -1.0
                n_pairs = len(pairs)
                checkpoint("AFTER INIT OPT & II")
                for mate in 0, 1:
                    bwa.cal_sa_reg_gap(0, bwts, n_pairs, bwsa[mate], gopt)
                checkpoint("AFTER CAL_SA_REG_GAP")
                bwa.cal_pac_pos_pe(
                    bwts, n_pairs, bwsa, ii, popt, gopt, last_ii)
                checkpoint("AFTER CAL_PAC_POS_PE")
                if ii.avg < 0.0:
                    failed_ii += 1
                bwa.paired_sw(bnsp, pacseq, n_pairs, bwsa, popt, ii)
                checkpoint("AFTER PAIRED_SW")
                for mate in 0, 1:
                    bwa.refine_gapped(bnsp, n_pairs, bwsa[mate], pacseq)
                checkpoint("AFTER REFINE_GAPPED")
                # Both orientations of each pair; results are discarded.
                for k in xrange(n_pairs):
                    bwa.analyze_hit(gopt[0], bnsp, bwsa[0][k], bwsa[1][k])
                    bwa.analyze_hit(gopt[0], bnsp, bwsa[1][k], bwsa[0][k])
                checkpoint("AFTER ANALYZE_HIT")
                # deallocate seq & ref data
                for i in 0, 1:
                    bwa.free_seq(n_pairs, bwsa[i])
                    bwa.bwt_destroy(bwts[i])
                bwa.bns_destroy(bnsp)
                checkpoint("AFTER DEALLOC")
                del pacseq
                n_unreachable = gc.collect()
                logging.debug("n_unreachable = %d", n_unreachable)
                checkpoint("AFTER DEL PACSEQ")
                del pairs
                n_unreachable = gc.collect()
                logging.debug("n_unreachable = %d", n_unreachable)
                checkpoint("AFTER DEL PAIRS")
    return max(size_list), max(resident_list), failed_ii
def run_bwa_py_sampe(refseq_fname, read_fname, mate_fname, seq_list_len=None):
    """Profile memory use of the paired-end BWA steps, batch by batch.

    Pairs are pulled from the two 'fastq-illumina' files ``seq_list_len``
    at a time.  Each batch restores the index and the reference, runs the
    alignment steps and frees every structure again.  After each stage
    ``print_meminfo`` is called and its ``(size, resident)`` result is
    stored.

    Args:
        refseq_fname: name passed to ``bwa.restore_index`` and
            ``bwa.restore_reference``.
        read_fname: path of the FASTQ file with the first reads.
        mate_fname: path of the FASTQ file with the mates.
        seq_list_len: batch size; with the default ``None`` ``islice``
            returns all pairs as one batch.

    Returns:
        A tuple ``(max size, max resident, failed_ii)``; ``failed_ii`` is
        the number of batches that ended ``cal_pac_pos_pe`` with a
        negative ``ii.avg``.

    Raises:
        ValueError: from ``max`` when there are no pairs at all, since no
            measurement was stored.
    """
    size_list = []
    resident_list = []
    failed_ii = 0

    def record(label):
        # One memory checkpoint: print it and keep both figures.
        size, resident = print_meminfo(label)
        size_list.append(size)
        resident_list.append(resident)

    # Reading is lazy, so the batch loop must run while the files are
    # open; the `with` blocks close them once the loop has finished.
    with open(read_fname) as read_file:
        with open(mate_fname) as mate_file:
            read_flow = Bio.SeqIO.parse(read_file, 'fastq-illumina')
            mate_flow = Bio.SeqIO.parse(mate_file, 'fastq-illumina')
            pairs_flow = it.izip(read_flow, mate_flow)
            while True:
                pairs = list(it.islice(pairs_flow, 0, seq_list_len))
                if not pairs:
                    break
                record("AFTER READING PAIRS")
                bwsa = bwa.build_bws_array(pairs)
                record("AFTER BUILDING BWSA")
                bwts = bwa.restore_index(refseq_fname)
                record("AFTER RESTORING INDEX")
                bnsp, pacseq = bwa.restore_reference(refseq_fname)
                record("AFTER RESTORING REFERENCE")
                gopt, popt = bwa.gap_init_opt(), bwa.pe_init_opt()
                ii, last_ii = bwa.isize_info_t(), bwa.isize_info_t()
                last_ii.avg = -1.0
                n_pairs = len(pairs)
                record("AFTER INIT OPT & II")
                for mate in 0, 1:
                    bwa.cal_sa_reg_gap(0, bwts, n_pairs, bwsa[mate], gopt)
                record("AFTER CAL_SA_REG_GAP")
                bwa.cal_pac_pos_pe(
                    bwts, n_pairs, bwsa, ii, popt, gopt, last_ii)
                record("AFTER CAL_PAC_POS_PE")
                if ii.avg < 0.0:
                    failed_ii += 1
                bwa.paired_sw(bnsp, pacseq, n_pairs, bwsa, popt, ii)
                record("AFTER PAIRED_SW")
                for mate in 0, 1:
                    bwa.refine_gapped(bnsp, n_pairs, bwsa[mate], pacseq)
                record("AFTER REFINE_GAPPED")
                # Analyse every pair in both orientations; the return
                # values are not needed.
                for k in xrange(n_pairs):
                    bwa.analyze_hit(gopt[0], bnsp, bwsa[0][k], bwsa[1][k])
                    bwa.analyze_hit(gopt[0], bnsp, bwsa[1][k], bwsa[0][k])
                record("AFTER ANALYZE_HIT")
                # deallocate seq & ref data
                for i in 0, 1:
                    bwa.free_seq(n_pairs, bwsa[i])
                    bwa.bwt_destroy(bwts[i])
                bwa.bns_destroy(bnsp)
                record("AFTER DEALLOC")
                del pacseq
                n_unreachable = gc.collect()
                logging.debug("n_unreachable = %d", n_unreachable)
                record("AFTER DEL PACSEQ")
                del pairs
                n_unreachable = gc.collect()
                logging.debug("n_unreachable = %d", n_unreachable)
                record("AFTER DEL PAIRS")
    return max(size_list), max(resident_list), failed_ii