コード例 #1
0
ファイル: test_bwa_memory.py プロジェクト: ilveroluca/seal
def run_bwa_py_sampe_alloc_only(refseq_fname, read_fname, mate_fname):
    """Allocate and release the BWA data structures, reporting memory use.

    All read/mate pairs are loaded from the two FASTQ (Illumina) files, the
    BWA sequence arrays are built, the index and the reference are restored
    and the option / insert-size structures are created.  Everything is then
    released again without running any alignment step.  ``print_meminfo`` is
    called after every stage.

    Args:
        refseq_fname: reference name handed to ``bwa.restore_index`` and
            ``bwa.restore_reference``.
        read_fname: path of the FASTQ file with the reads.
        mate_fname: path of the FASTQ file with the mates.
    """
    # The pairs are fully materialised inside the ``with`` blocks, so both
    # input files can be closed right away instead of leaking the handles.
    with open(read_fname) as read_file:
        with open(mate_fname) as mate_file:
            read_flow = Bio.SeqIO.parse(read_file, 'fastq-illumina')
            mate_flow = Bio.SeqIO.parse(mate_file, 'fastq-illumina')
            pairs = list(it.izip(read_flow, mate_flow))
    print_meminfo("AFTER READING PAIRS")
    bwsa = bwa.build_bws_array(pairs)
    print_meminfo("AFTER BUILDING BWSA")
    bwts = bwa.restore_index(refseq_fname)
    print_meminfo("AFTER RESTORING INDEX")
    bnsp, pacseq = bwa.restore_reference(refseq_fname)
    print_meminfo("AFTER RESTORING REFERENCE")
    # These objects are never used for alignment here; they are kept bound
    # so that they are alive when the following measurement is taken.
    gopt, popt = bwa.gap_init_opt(), bwa.pe_init_opt()
    ii, last_ii = bwa.isize_info_t(), bwa.isize_info_t()
    last_ii.avg = -1.0
    l = len(pairs)
    print_meminfo("AFTER INIT OPT & II")
    # deallocate seq & ref data
    for i in 0, 1:
        bwa.free_seq(l, bwsa[i])
        bwa.bwt_destroy(bwts[i])
    bwa.bns_destroy(bnsp)
    print_meminfo("AFTER DEALLOC")
    del pacseq
    n_unreachable = gc.collect()
    logging.debug("n_unreachable = %d", n_unreachable)
    print_meminfo("AFTER DEL PACSEQ")
    del pairs
    n_unreachable = gc.collect()
    logging.debug("n_unreachable = %d", n_unreachable)
    print_meminfo("AFTER DEL PAIRS")
コード例 #2
0
ファイル: test_bwa.py プロジェクト: QwertyManiac/seal-cdh4
 def build_bws_array_with_trim_qual_zero(self):
   # With trim_qual=0 every sequence of both reads must keep its full
   # length, i.e. clip_len has to equal full_len.
   pairs = self.load_pairs_fixture("pairs.txt")
   bws_array = bwa.build_bws_array(
     pairs, qtype="fastq-illumina", src="qseq", trim_qual=0)
   failure_msg = "unexpected sequence trimming"
   for read in (0, 1):
     n_pairs = len(pairs)
     for i in xrange(n_pairs):
       seq = bws_array[read][i]
       self.assertEqual(seq.full_len, seq.clip_len, failure_msg)
コード例 #3
0
def process_sequences(bwts, bns, pacseq, seq_reader, N, analyze_seqs=None):
  """Run the paired-end BWA steps on batches of pairs from ``seq_reader``.

  Batches are fetched with ``read_seq_pairs(seq_reader, N)`` until an empty
  one is returned.  For every batch the sequence arrays are built, the
  alignment steps are run, the insert-size statistics are written to
  standard error, ``analyze_seqs`` is invoked (when given) and the sequence
  arrays are released.

  Args:
    bwts: index passed to ``bwa.cal_sa_reg_gap`` and ``bwa.cal_pac_pos_pe``.
    bns: reference annotations passed to ``bwa.paired_sw`` and
      ``bwa.refine_gapped``.
    pacseq: packed reference passed to ``bwa.paired_sw`` and
      ``bwa.refine_gapped``.
    seq_reader: source of pairs, forwarded to ``read_seq_pairs``.
    N: batch size forwarded to ``read_seq_pairs``.
    analyze_seqs: optional callable invoked as
      ``analyze_seqs(gopt[0], bns, n_pairs, bwsa)`` for each batch; skipped
      when ``None``.
  """
  gopt = bwa.gap_init_opt()
  popt = bwa.pe_init_opt()

  ii = bwa.isize_info_t()
  last_ii = bwa.isize_info_t()
  last_ii.avg = -1.0

  while True:
    pairs = read_seq_pairs(seq_reader, N)
    seq_pairs_read = len(pairs)
    if seq_pairs_read == 0:
      break
    bwsa = bwa.build_bws_array(pairs)
    try:
      bwa.cal_sa_reg_gap(0, bwts, seq_pairs_read, bwsa[0], gopt)
      bwa.cal_sa_reg_gap(0, bwts, seq_pairs_read, bwsa[1], gopt)
      bwa.cal_pac_pos_pe(bwts, seq_pairs_read, bwsa,
                         ii, popt, gopt, last_ii)
      sys.stderr.write('ii: %r\n' % [ii.avg, ii.std, ii.low, ii.high])
      bwa.paired_sw(bns, pacseq, seq_pairs_read, bwsa, popt, ii)
      bwa.refine_gapped(bns, seq_pairs_read, bwsa[0], pacseq)
      bwa.refine_gapped(bns, seq_pairs_read, bwsa[1], pacseq)
      # The callback is optional (defaults to None), so do not call it
      # blindly.
      if analyze_seqs is not None:
        analyze_seqs(gopt[0], bns, seq_pairs_read, bwsa)
    finally:
      # Free exactly as many sequences as were built for this batch: the
      # last batch can hold fewer than N pairs, so N would be too many.
      bwa.free_seq(seq_pairs_read, bwsa[0])
      bwa.free_seq(seq_pairs_read, bwsa[1])
コード例 #4
0
ファイル: try_bwa_pairs.py プロジェクト: ilveroluca/seal
def process_sequences(bwts, bns, pacseq, seq_reader, N, analyze_seqs=None):
    """Run the paired-end BWA steps on batches of pairs from ``seq_reader``.

    Batches are fetched with ``read_seq_pairs(seq_reader, N)`` until an empty
    one is returned.  For every batch the sequence arrays are built, the
    alignment steps are run, the insert-size statistics are written to
    standard error, ``analyze_seqs`` is invoked (when given) and the sequence
    arrays are released.

    Args:
        bwts: index passed to ``bwa.cal_sa_reg_gap`` and
            ``bwa.cal_pac_pos_pe``.
        bns: reference annotations passed to ``bwa.paired_sw`` and
            ``bwa.refine_gapped``.
        pacseq: packed reference passed to ``bwa.paired_sw`` and
            ``bwa.refine_gapped``.
        seq_reader: source of pairs, forwarded to ``read_seq_pairs``.
        N: batch size forwarded to ``read_seq_pairs``.
        analyze_seqs: optional callable invoked as
            ``analyze_seqs(gopt[0], bns, n_pairs, bwsa)`` for each batch;
            skipped when ``None``.
    """
    gopt = bwa.gap_init_opt()
    popt = bwa.pe_init_opt()

    ii = bwa.isize_info_t()
    last_ii = bwa.isize_info_t()
    last_ii.avg = -1.0

    while True:
        pairs = read_seq_pairs(seq_reader, N)
        seq_pairs_read = len(pairs)
        if seq_pairs_read == 0:
            break
        bwsa = bwa.build_bws_array(pairs)
        try:
            bwa.cal_sa_reg_gap(0, bwts, seq_pairs_read, bwsa[0], gopt)
            bwa.cal_sa_reg_gap(0, bwts, seq_pairs_read, bwsa[1], gopt)
            bwa.cal_pac_pos_pe(bwts, seq_pairs_read, bwsa, ii, popt,
                               gopt, last_ii)
            sys.stderr.write('ii: %r\n' % [ii.avg, ii.std, ii.low, ii.high])
            bwa.paired_sw(bns, pacseq, seq_pairs_read, bwsa, popt, ii)
            bwa.refine_gapped(bns, seq_pairs_read, bwsa[0], pacseq)
            bwa.refine_gapped(bns, seq_pairs_read, bwsa[1], pacseq)
            # The callback is optional (defaults to None), so do not call
            # it blindly.
            if analyze_seqs is not None:
                analyze_seqs(gopt[0], bns, seq_pairs_read, bwsa)
        finally:
            # Free exactly as many sequences as were built for this batch:
            # the last batch can hold fewer than N pairs, so N would be too
            # many.
            bwa.free_seq(seq_pairs_read, bwsa[0])
            bwa.free_seq(seq_pairs_read, bwsa[1])
コード例 #5
0
ファイル: test_bwa.py プロジェクト: ilveroluca/seal
 def build_bws_array_with_trim_qual(self):
     # With trim_qual=15 at least one of the inspected sequences must have
     # been clipped (full_len greater than clip_len).
     pairs = self.load_pairs_fixture("pairs.txt")
     build_options = dict(qtype="fastq-illumina", src="qseq", trim_qual=15)
     bws_array = bwa.build_bws_array(pairs, **build_options)
     # Only the first five pairs are inspected: read, then mate, per pair.
     sequences = []
     for i in xrange(5):
         for read in (0, 1):
             sequences.append(bws_array[read][i])
     clipped = [seq.full_len > seq.clip_len for seq in sequences]
     self.assertTrue(any(clipped))
コード例 #6
0
ファイル: test_bwa.py プロジェクト: ilveroluca/seal
 def build_bws_array_with_trim_qual_zero(self):
     # A zero trimming threshold must leave every sequence untouched:
     # full_len and clip_len have to match for both reads of every pair.
     pairs = self.load_pairs_fixture("pairs.txt")
     build_options = dict(qtype="fastq-illumina", src="qseq", trim_qual=0)
     bws_array = bwa.build_bws_array(pairs, **build_options)
     for read in (0, 1):
         for i in xrange(len(pairs)):
             current = bws_array[read][i]
             self.assertEqual(
                 current.full_len,
                 current.clip_len,
                 "unexpected sequence trimming",
             )
コード例 #7
0
ファイル: test_bwa.py プロジェクト: ilveroluca/seal
 def build_bws_array(self):
     # Builds random paired-end reads for every quality encoding, with and
     # without the `unknown` flag, and feeds them to bwa.build_bws_array
     # both as SeqRecord pairs ("bioseq") and as flat lists ("qseq").  For
     # each result it checks the array type, the sequence length and the
     # length of the sequence name.
     nseq = 100
     for fmt in "fastq-sanger", "fastq-illumina", "fastq-solexa":
         qkey = "solexa_quality" if fmt == "fastq-solexa" else "phred_quality"
         for unknown in False, True:
             g = u.random_reads_generator(nseq,
                                          fmt=fmt,
                                          unknown=unknown,
                                          pe=True)
             bioseq_pairs, qseq_pairs = [], []
             for i, read_pair in enumerate(g):
                 base_name = "foo-%d" % i
                 names = ["%s/%d" % (base_name, j) for j in (1, 2)]
                 seq_strings = ["".join(read_pair[j][0]) for j in (0, 1)]
                 q_strings = [
                     "".join(
                         chr(x + sg.Q_OFFSET[fmt]) for x in read_pair[j][1])
                     for j in (0, 1)
                 ]
                 bioseq_p = [
                     SeqRecord(Seq(seq_strings[j], single_letter_alphabet),
                               id=names[j],
                               name=names[j],
                               description=names[j]) for j in (0, 1)
                 ]
                 for j in 0, 1:
                     bioseq_p[j].letter_annotations[qkey] = read_pair[j][1]
                 bioseq_pairs.append(bioseq_p)
                 # qseq layout: name, read seq, read qual, mate seq,
                 # mate qual.
                 qseq_pairs.append([base_name,
                                    seq_strings[0], q_strings[0],
                                    seq_strings[1], q_strings[1]])
             n = len(bioseq_pairs[0][0])
             # Use the test-case assertion so the check also runs when
             # Python is started with -O.
             self.assertEqual(len(qseq_pairs[0][1]), n)
             for src in "bioseq", "qseq":
                 seq_pairs = bioseq_pairs if src == "bioseq" else qseq_pairs
                 bwsa = bwa.build_bws_array(seq_pairs, qtype=fmt, src=src)
                 for j in 0, 1:
                     self.assertTrue(type(bwsa[j]) is bwa.bwa_seq_p_t)
                     for i in xrange(nseq):
                         bwseq = bwsa[j][i]
                         self.assertEqual(bwseq.len, n)
                         if src == "bioseq":
                             # Compare against the record of the read being
                             # checked (j), not always the first read.
                             exp_name = bioseq_pairs[i][j].name
                         else:
                             exp_name = "%s/%d" % (qseq_pairs[i][0],
                                                   (j + 1))
                         # Only the name lengths are compared.
                         self.assertEqual(len(bwseq.get_name()),
                                          len(exp_name))
コード例 #8
0
def main(argv):
  """Align read/mate pairs in batches and print the names of each pair.

  ``argv[1]``, ``argv[2]`` and ``argv[3]`` must hold the reference name and
  the paths of the read and mate FASTQ (Illumina) files; the process exits
  with a usage message when one of them is missing.  Pairs are processed
  5000 at a time through a ``BWAIterator``; the names of every (read, mate)
  it yields are printed to standard output.  At the end the counters
  obtained for the last batch are written to standard error.
  """
  try:
    refseq_fname = argv[1]
    read_fname = argv[2]
    mate_fname = argv[3]
  except IndexError:
    sys.exit("Usage: %s REFSEQ_FN READ_FN MATE_FN" % sys.argv[0])

  seq_list_len = 5000
  max_isize = pairing_batch_size = 1000
  gopt, popt = bwa.gap_init_opt(), bwa.pe_init_opt()

  logger = logging.getLogger("test")
  logger.setLevel(logging.DEBUG)
  # Stays empty when the input holds no pairs, so the final report does not
  # fail on an unbound name.
  counters = {}

  # The pairs are read lazily, so the files must stay open for the whole
  # loop; the ``with`` blocks close them afterwards.
  with open(read_fname) as read_file:
    with open(mate_fname) as mate_file:
      read_flow = Bio.SeqIO.parse(read_file, 'fastq-illumina')
      mate_flow = Bio.SeqIO.parse(mate_file, 'fastq-illumina')
      pairs_flow = it.izip(read_flow, mate_flow)
      while True:
        pairs = list(it.islice(pairs_flow, 0, seq_list_len))
        if not pairs:
          break
        # NOTE(review): bwts, bnsp and pacseq are not used below and are
        # never released here (only the iterator's own bnsp is destroyed).
        # Confirm whether these loads are needed at all.
        bwts = bwa.restore_index(refseq_fname)
        bnsp, pacseq = bwa.restore_reference(refseq_fname)

        l = len(pairs)
        bwsa = bwa.build_bws_array(pairs)

        counters = get_counters()
        ctx = ContextStub()
        visitor = MRVisitor(logger, ctx, counters)

        bwa_iterator = BWAIterator(refseq_fname, gopt, popt, max_isize,
                                   pairing_batch_size, visitor)
        for read, mate in bwa_iterator.analyze(bwsa, l):
          print("%s %s" % (read.get_name(), mate.get_name()))

        for j in 0, 1:
          bwa.free_seq(l, bwsa[j])
        bwa.bns_destroy(bwa_iterator.bnsp)

  for cn, c in counters.iteritems():
    sys.stderr.write("%s = %d\n" % (cn, c.value))
コード例 #9
0
def main(argv):
    """Align read/mate pairs in batches and print the names of each pair.

    ``argv[1]``, ``argv[2]`` and ``argv[3]`` must hold the reference name and
    the paths of the read and mate FASTQ (Illumina) files; the process exits
    with a usage message when one of them is missing.  Pairs are processed
    5000 at a time through a ``BWAIterator``; the names of every
    (read, mate) it yields are printed to standard output.  At the end the
    counters obtained for the last batch are written to standard error.
    """
    try:
        refseq_fname = argv[1]
        read_fname = argv[2]
        mate_fname = argv[3]
    except IndexError:
        sys.exit("Usage: %s REFSEQ_FN READ_FN MATE_FN" % sys.argv[0])

    seq_list_len = 5000
    max_isize = pairing_batch_size = 1000
    gopt, popt = bwa.gap_init_opt(), bwa.pe_init_opt()

    logger = logging.getLogger("test")
    logger.setLevel(logging.DEBUG)
    # Stays empty when the input holds no pairs, so the final report does
    # not fail on an unbound name.
    counters = {}

    # The pairs are read lazily, so the files must stay open for the whole
    # loop; the ``with`` blocks close them afterwards.
    with open(read_fname) as read_file:
        with open(mate_fname) as mate_file:
            read_flow = Bio.SeqIO.parse(read_file, 'fastq-illumina')
            mate_flow = Bio.SeqIO.parse(mate_file, 'fastq-illumina')
            pairs_flow = it.izip(read_flow, mate_flow)
            while True:
                pairs = list(it.islice(pairs_flow, 0, seq_list_len))
                if not pairs:
                    break
                # NOTE(review): bwts, bnsp and pacseq are not used below
                # and are never released here (only the iterator's own
                # bnsp is destroyed).  Confirm whether these loads are
                # needed at all.
                bwts = bwa.restore_index(refseq_fname)
                bnsp, pacseq = bwa.restore_reference(refseq_fname)

                l = len(pairs)
                bwsa = bwa.build_bws_array(pairs)

                counters = get_counters()
                ctx = ContextStub()
                visitor = MRVisitor(logger, ctx, counters)

                bwa_iterator = BWAIterator(refseq_fname, gopt, popt,
                                           max_isize, pairing_batch_size,
                                           visitor)
                for read, mate in bwa_iterator.analyze(bwsa, l):
                    print("%s %s" % (read.get_name(), mate.get_name()))

                for j in 0, 1:
                    bwa.free_seq(l, bwsa[j])
                bwa.bns_destroy(bwa_iterator.bnsp)

    for cn, c in counters.iteritems():
        sys.stderr.write("%s = %d\n" % (cn, c.value))
コード例 #10
0
ファイル: test_bwa_memory.py プロジェクト: ilveroluca/seal
def run_bwa_py_sampe(refseq_fname, read_fname, mate_fname):
    """Run the paired-end BWA steps on all pairs, reporting memory use.

    All read/mate pairs are loaded from the two FASTQ (Illumina) files, the
    sequence arrays are built, the index and the reference are restored and
    the alignment steps are run on the whole set.  Everything is then
    released.  ``print_meminfo`` is called after every stage.

    Args:
        refseq_fname: reference name handed to ``bwa.restore_index`` and
            ``bwa.restore_reference``.
        read_fname: path of the FASTQ file with the reads.
        mate_fname: path of the FASTQ file with the mates.
    """
    # The pairs are fully materialised inside the ``with`` blocks, so both
    # input files can be closed right away instead of leaking the handles.
    with open(read_fname) as read_file:
        with open(mate_fname) as mate_file:
            read_flow = Bio.SeqIO.parse(read_file, 'fastq-illumina')
            mate_flow = Bio.SeqIO.parse(mate_file, 'fastq-illumina')
            pairs = list(it.izip(read_flow, mate_flow))
    print_meminfo("AFTER READING PAIRS")
    bwsa = bwa.build_bws_array(pairs)
    print_meminfo("AFTER BUILDING BWSA")
    bwts = bwa.restore_index(refseq_fname)
    print_meminfo("AFTER RESTORING INDEX")
    bnsp, pacseq = bwa.restore_reference(refseq_fname)
    print_meminfo("AFTER RESTORING REFERENCE")
    gopt, popt = bwa.gap_init_opt(), bwa.pe_init_opt()
    ii, last_ii = bwa.isize_info_t(), bwa.isize_info_t()
    last_ii.avg = -1.0
    l = len(pairs)
    print_meminfo("AFTER INIT OPT & II")
    bwa.cal_sa_reg_gap(0, bwts, l, bwsa[0], gopt)
    bwa.cal_sa_reg_gap(0, bwts, l, bwsa[1], gopt)
    print_meminfo("AFTER CAL_SA_REG_GAP")
    bwa.cal_pac_pos_pe(bwts, l, bwsa, ii, popt, gopt, last_ii)
    print_meminfo("AFTER CAL_PAC_POS_PE")
    bwa.paired_sw(bnsp, pacseq, l, bwsa, popt, ii)
    print_meminfo("AFTER PAIRED_SW")
    bwa.refine_gapped(bnsp, l, bwsa[0], pacseq)
    bwa.refine_gapped(bnsp, l, bwsa[1], pacseq)
    print_meminfo("AFTER REFINE_GAPPED")
    # Each pair is analysed in both directions; the results are discarded.
    for k in xrange(l):
        bwa.analyze_hit(gopt[0], bnsp, bwsa[0][k], bwsa[1][k])
        bwa.analyze_hit(gopt[0], bnsp, bwsa[1][k], bwsa[0][k])
    print_meminfo("AFTER ANALYZE_HIT")
    # deallocate seq & ref data
    for i in 0, 1:
        bwa.free_seq(l, bwsa[i])
        bwa.bwt_destroy(bwts[i])
    bwa.bns_destroy(bnsp)
    print_meminfo("AFTER DEALLOC")
    del pacseq
    n_unreachable = gc.collect()
    logging.debug("n_unreachable = %d", n_unreachable)
    print_meminfo("AFTER DEL PACSEQ")
    del pairs
    n_unreachable = gc.collect()
    logging.debug("n_unreachable = %d", n_unreachable)
    print_meminfo("AFTER DEL PAIRS")
コード例 #11
0
ファイル: test_bwa.py プロジェクト: QwertyManiac/seal-cdh4
 def build_bws_array(self):
   # Builds random paired-end reads for every quality encoding, with and
   # without the `unknown` flag, and feeds them to bwa.build_bws_array both
   # as SeqRecord pairs ("bioseq") and as flat lists ("qseq").  For each
   # result it checks the array type, the sequence length and the length of
   # the sequence name.
   nseq = 100
   for fmt in "fastq-sanger", "fastq-illumina", "fastq-solexa":
     qkey = "solexa_quality" if fmt == "fastq-solexa" else "phred_quality"
     for unknown in False, True:
       g = u.random_reads_generator(nseq, fmt=fmt, unknown=unknown, pe=True)
       bioseq_pairs, qseq_pairs = [], []
       for i, read_pair in enumerate(g):
         base_name = "foo-%d" % i
         names = ["%s/%d" % (base_name, j) for j in (1, 2)]
         seq_strings = ["".join(read_pair[j][0]) for j in (0, 1)]
         q_strings = ["".join(chr(x+sg.Q_OFFSET[fmt]) for x in read_pair[j][1])
                      for j in (0, 1)]
         bioseq_p = [SeqRecord(Seq(seq_strings[j], single_letter_alphabet),
                               id=names[j],
                               name=names[j],
                               description=names[j]) for j in (0, 1)]
         for j in 0, 1:
           bioseq_p[j].letter_annotations[qkey] = read_pair[j][1]
         bioseq_pairs.append(bioseq_p)
         # qseq layout: name, read seq, read qual, mate seq, mate qual.
         qseq_pairs.append([base_name,
                            seq_strings[0], q_strings[0],
                            seq_strings[1], q_strings[1]])
       n = len(bioseq_pairs[0][0])
       # Use the test-case assertion so the check also runs when Python is
       # started with -O.
       self.assertEqual(len(qseq_pairs[0][1]), n)
       for src in "bioseq", "qseq":
         seq_pairs = bioseq_pairs if src == "bioseq" else qseq_pairs
         bwsa = bwa.build_bws_array(seq_pairs, qtype=fmt, src=src)
         for j in 0, 1:
           self.assertTrue(type(bwsa[j]) is bwa.bwa_seq_p_t)
           for i in xrange(nseq):
             bwseq = bwsa[j][i]
             self.assertEqual(bwseq.len, n)
             if src == "bioseq":
               # Compare against the record of the read being checked (j),
               # not always the first read.
               exp_name = bioseq_pairs[i][j].name
             else:
               exp_name = "%s/%d" % (qseq_pairs[i][0], (j+1))
             # Only the name lengths are compared.
             self.assertEqual(len(bwseq.get_name()), len(exp_name))
コード例 #12
0
def main(argv):
  """Align the first 10000 pairs and print their multiple hits.

  ``argv[1]``, ``argv[2]`` and ``argv[3]`` must hold the reference name and
  the paths of the read and mate FASTQ (Illumina) files; the process exits
  with a usage message when one of them is missing.  The pairs are run
  through a ``BWAIterator``; for every read with ``n_multi > 0`` a blank
  line is printed followed by one line per entry of ``read.itermulti()``.
  """
  try:
    refseq_fname = argv[1]
    read_fname = argv[2]
    mate_fname = argv[3]
  except IndexError:
    sys.exit("Usage: %s REFSEQ_FN READ_FN MATE_FN" % sys.argv[0])

  seq_list_len = 10000
  max_isize = pairing_batch_size = 10000
  gopt, popt = bwa.gap_init_opt(), bwa.pe_init_opt()

  # The pairs are materialised inside the ``with`` blocks, so both input
  # files can be closed right away instead of leaking the handles.
  with open(read_fname) as read_file:
    with open(mate_fname) as mate_file:
      read_flow = Bio.SeqIO.parse(read_file, 'fastq-illumina')
      mate_flow = Bio.SeqIO.parse(mate_file, 'fastq-illumina')
      pairs_flow = it.izip(read_flow, mate_flow)
      pairs = list(it.islice(pairs_flow, 0, seq_list_len))
  # NOTE(review): bwts, bnsp and pacseq are not used below and are never
  # released here (only the iterator's own bnsp is destroyed).  Confirm
  # whether these loads are needed at all.
  bwts = bwa.restore_index(refseq_fname)
  bnsp, pacseq = bwa.restore_reference(refseq_fname)

  l = len(pairs)
  bwsa = bwa.build_bws_array(pairs)

  bwa_iterator = BWAIterator(refseq_fname, gopt, popt, max_isize,
                             pairing_batch_size)
  hit_pairs = list(bwa_iterator.analyze(bwsa, l))
  print("READ POS GAPO GAPE MM STRAND SCORE CIGAR")
  for read, mate in hit_pairs:
    if read.n_multi > 0:
      print("")
      for m in read.itermulti():
        fields = (read.get_name(), m.pos, m.n_gapo, m.n_gape, m.n_mm,
                  m.strand, m.score, m.get_cigar(read.len))
        print(" ".join(str(field) for field in fields))

  for j in 0, 1:
    bwa.free_seq(l, bwsa[j])
  bwa.bns_destroy(bwa_iterator.bnsp)
コード例 #13
0
def main(argv):
    """Align the first 10000 pairs and print their multiple hits.

    ``argv[1]``, ``argv[2]`` and ``argv[3]`` must hold the reference name and
    the paths of the read and mate FASTQ (Illumina) files; the process exits
    with a usage message when one of them is missing.  The pairs are run
    through a ``BWAIterator``; for every read with ``n_multi > 0`` a blank
    line is printed followed by one line per entry of ``read.itermulti()``.
    """
    try:
        refseq_fname = argv[1]
        read_fname = argv[2]
        mate_fname = argv[3]
    except IndexError:
        sys.exit("Usage: %s REFSEQ_FN READ_FN MATE_FN" % sys.argv[0])

    seq_list_len = 10000
    max_isize = pairing_batch_size = 10000
    gopt, popt = bwa.gap_init_opt(), bwa.pe_init_opt()

    # The pairs are materialised inside the ``with`` blocks, so both input
    # files can be closed right away instead of leaking the handles.
    with open(read_fname) as read_file:
        with open(mate_fname) as mate_file:
            read_flow = Bio.SeqIO.parse(read_file, 'fastq-illumina')
            mate_flow = Bio.SeqIO.parse(mate_file, 'fastq-illumina')
            pairs_flow = it.izip(read_flow, mate_flow)
            pairs = list(it.islice(pairs_flow, 0, seq_list_len))
    # NOTE(review): bwts, bnsp and pacseq are not used below and are never
    # released here (only the iterator's own bnsp is destroyed).  Confirm
    # whether these loads are needed at all.
    bwts = bwa.restore_index(refseq_fname)
    bnsp, pacseq = bwa.restore_reference(refseq_fname)

    l = len(pairs)
    bwsa = bwa.build_bws_array(pairs)

    bwa_iterator = BWAIterator(refseq_fname, gopt, popt, max_isize,
                               pairing_batch_size)
    hit_pairs = list(bwa_iterator.analyze(bwsa, l))
    print("READ POS GAPO GAPE MM STRAND SCORE CIGAR")
    for read, mate in hit_pairs:
        if read.n_multi > 0:
            print("")
            for m in read.itermulti():
                fields = (read.get_name(), m.pos, m.n_gapo, m.n_gape,
                          m.n_mm, m.strand, m.score, m.get_cigar(read.len))
                print(" ".join(str(field) for field in fields))

    for j in 0, 1:
        bwa.free_seq(l, bwsa[j])
    bwa.bns_destroy(bwa_iterator.bnsp)
コード例 #14
0
ファイル: test_bwa.py プロジェクト: QwertyManiac/seal-cdh4
 def build_bws_array_with_trim_qual(self):
   # With trim_qual=15 at least one of the inspected sequences must have
   # been clipped (full_len greater than clip_len).
   pairs = self.load_pairs_fixture("pairs.txt")
   bws_array = bwa.build_bws_array(
     pairs, qtype="fastq-illumina", src="qseq", trim_qual=15)
   # Only the first five pairs are inspected: read, then mate, per pair.
   clipped = []
   for i in xrange(5):
     for read in (0, 1):
       seq = bws_array[read][i]
       clipped.append(seq.full_len > seq.clip_len)
   self.assertTrue(any(clipped))
コード例 #15
0
ファイル: test_seq_list_len.py プロジェクト: fversaci/seal
def run_bwa_py_sampe(refseq_fname, read_fname, mate_fname, seq_list_len=None):
    """Run the paired-end BWA steps batch by batch, tracking memory usage.

    Read/mate pairs are taken from the two FASTQ (Illumina) files in batches
    of ``seq_list_len`` pairs (one batch with everything when it is
    ``None``).  For every batch the sequence arrays are built, the index and
    the reference are restored, the alignment steps are run and everything
    is released again.  ``print_meminfo`` is called after each stage and the
    two values it returns are recorded.

    Args:
        refseq_fname: reference name handed to ``bwa.restore_index`` and
            ``bwa.restore_reference``.
        read_fname: path of the FASTQ file with the reads.
        mate_fname: path of the FASTQ file with the mates.
        seq_list_len: maximum number of pairs per batch, or ``None`` for no
            limit.

    Returns:
        A tuple ``(max_size, max_resident, failed_ii)``: the largest first
        and the largest second value returned by ``print_meminfo``, and the
        number of batches for which ``ii.avg`` was negative after
        ``bwa.cal_pac_pos_pe``.

    Raises:
        ValueError: if no pair could be read, because there is then no
            measurement to take the maximum of.
    """
    size_list = []
    resident_list = []
    failed_ii = 0

    def record(label):
        # Report memory usage under `label` and remember both figures.
        size, resident = print_meminfo(label)
        size_list.append(size)
        resident_list.append(resident)

    # The pairs are read lazily, so the files must stay open for the whole
    # loop; the ``with`` blocks close them afterwards.
    with open(read_fname) as read_file:
        with open(mate_fname) as mate_file:
            read_flow = Bio.SeqIO.parse(read_file, "fastq-illumina")
            mate_flow = Bio.SeqIO.parse(mate_file, "fastq-illumina")
            pairs_flow = it.izip(read_flow, mate_flow)

            while True:
                pairs = list(it.islice(pairs_flow, 0, seq_list_len))
                if not pairs:
                    break
                record("AFTER READING PAIRS")

                bwsa = bwa.build_bws_array(pairs)
                record("AFTER BUILDING BWSA")

                bwts = bwa.restore_index(refseq_fname)
                record("AFTER RESTORING INDEX")

                bnsp, pacseq = bwa.restore_reference(refseq_fname)
                record("AFTER RESTORING REFERENCE")

                gopt, popt = bwa.gap_init_opt(), bwa.pe_init_opt()
                ii, last_ii = bwa.isize_info_t(), bwa.isize_info_t()
                last_ii.avg = -1.0
                l = len(pairs)
                record("AFTER INIT OPT & II")

                bwa.cal_sa_reg_gap(0, bwts, l, bwsa[0], gopt)
                bwa.cal_sa_reg_gap(0, bwts, l, bwsa[1], gopt)
                record("AFTER CAL_SA_REG_GAP")

                bwa.cal_pac_pos_pe(bwts, l, bwsa, ii, popt, gopt, last_ii)
                record("AFTER CAL_PAC_POS_PE")

                # A negative average is counted as a failed batch.
                if ii.avg < 0.0:
                    failed_ii += 1

                bwa.paired_sw(bnsp, pacseq, l, bwsa, popt, ii)
                record("AFTER PAIRED_SW")

                bwa.refine_gapped(bnsp, l, bwsa[0], pacseq)
                bwa.refine_gapped(bnsp, l, bwsa[1], pacseq)
                record("AFTER REFINE_GAPPED")

                # Each pair is analysed in both directions; the results are
                # discarded.
                for k in xrange(l):
                    bwa.analyze_hit(gopt[0], bnsp, bwsa[0][k], bwsa[1][k])
                    bwa.analyze_hit(gopt[0], bnsp, bwsa[1][k], bwsa[0][k])
                record("AFTER ANALYZE_HIT")

                # deallocate seq & ref data
                for i in 0, 1:
                    bwa.free_seq(l, bwsa[i])
                    bwa.bwt_destroy(bwts[i])
                bwa.bns_destroy(bnsp)
                record("AFTER DEALLOC")

                del pacseq
                n_unreachable = gc.collect()
                logging.debug("n_unreachable = %d", n_unreachable)
                record("AFTER DEL PACSEQ")

                del pairs
                n_unreachable = gc.collect()
                logging.debug("n_unreachable = %d", n_unreachable)
                record("AFTER DEL PAIRS")

    return max(size_list), max(resident_list), failed_ii
コード例 #16
0
def run_bwa_py_sampe(refseq_fname, read_fname, mate_fname, seq_list_len=None):
    """Run the paired-end BWA steps batch by batch, tracking memory usage.

    Read/mate pairs are taken from the two FASTQ (Illumina) files in batches
    of ``seq_list_len`` pairs (one batch with everything when it is
    ``None``).  For every batch the sequence arrays are built, the index and
    the reference are restored, the alignment steps are run and everything
    is released again.  ``print_meminfo`` is called after each stage and the
    two values it returns are recorded.

    Args:
        refseq_fname: reference name handed to ``bwa.restore_index`` and
            ``bwa.restore_reference``.
        read_fname: path of the FASTQ file with the reads.
        mate_fname: path of the FASTQ file with the mates.
        seq_list_len: maximum number of pairs per batch, or ``None`` for no
            limit.

    Returns:
        A tuple ``(max_size, max_resident, failed_ii)``: the largest first
        and the largest second value returned by ``print_meminfo``, and the
        number of batches for which ``ii.avg`` was negative after
        ``bwa.cal_pac_pos_pe``.

    Raises:
        ValueError: if no pair could be read, because there is then no
            measurement to take the maximum of.
    """
    size_list = []
    resident_list = []
    failed_ii = 0

    def record(label):
        # Report memory usage under `label` and remember both figures.
        size, resident = print_meminfo(label)
        size_list.append(size)
        resident_list.append(resident)

    # The pairs are read lazily, so the files must stay open for the whole
    # loop; the ``with`` blocks close them afterwards.
    with open(read_fname) as read_file:
        with open(mate_fname) as mate_file:
            read_flow = Bio.SeqIO.parse(read_file, 'fastq-illumina')
            mate_flow = Bio.SeqIO.parse(mate_file, 'fastq-illumina')
            pairs_flow = it.izip(read_flow, mate_flow)

            while True:
                pairs = list(it.islice(pairs_flow, 0, seq_list_len))
                if not pairs:
                    break
                record("AFTER READING PAIRS")

                bwsa = bwa.build_bws_array(pairs)
                record("AFTER BUILDING BWSA")

                bwts = bwa.restore_index(refseq_fname)
                record("AFTER RESTORING INDEX")

                bnsp, pacseq = bwa.restore_reference(refseq_fname)
                record("AFTER RESTORING REFERENCE")

                gopt, popt = bwa.gap_init_opt(), bwa.pe_init_opt()
                ii, last_ii = bwa.isize_info_t(), bwa.isize_info_t()
                last_ii.avg = -1.0
                l = len(pairs)
                record("AFTER INIT OPT & II")

                bwa.cal_sa_reg_gap(0, bwts, l, bwsa[0], gopt)
                bwa.cal_sa_reg_gap(0, bwts, l, bwsa[1], gopt)
                record("AFTER CAL_SA_REG_GAP")

                bwa.cal_pac_pos_pe(bwts, l, bwsa, ii, popt, gopt, last_ii)
                record("AFTER CAL_PAC_POS_PE")

                # A negative average is counted as a failed batch.
                if ii.avg < 0.0:
                    failed_ii += 1

                bwa.paired_sw(bnsp, pacseq, l, bwsa, popt, ii)
                record("AFTER PAIRED_SW")

                bwa.refine_gapped(bnsp, l, bwsa[0], pacseq)
                bwa.refine_gapped(bnsp, l, bwsa[1], pacseq)
                record("AFTER REFINE_GAPPED")

                # Each pair is analysed in both directions; the results are
                # discarded.
                for k in xrange(l):
                    bwa.analyze_hit(gopt[0], bnsp, bwsa[0][k], bwsa[1][k])
                    bwa.analyze_hit(gopt[0], bnsp, bwsa[1][k], bwsa[0][k])
                record("AFTER ANALYZE_HIT")

                # deallocate seq & ref data
                for i in 0, 1:
                    bwa.free_seq(l, bwsa[i])
                    bwa.bwt_destroy(bwts[i])
                bwa.bns_destroy(bnsp)
                record("AFTER DEALLOC")

                del pacseq
                n_unreachable = gc.collect()
                logging.debug("n_unreachable = %d", n_unreachable)
                record("AFTER DEL PACSEQ")

                del pairs
                n_unreachable = gc.collect()
                logging.debug("n_unreachable = %d", n_unreachable)
                record("AFTER DEL PAIRS")

    return max(size_list), max(resident_list), failed_ii