Example #1
0
 def map_isoforms_to_reference_transcripts(self):
     """Map isoforms to reference transcripts.

     Runs ``blasr`` with ``self.isoseq_output_fa`` and
     ``self.reference_transcripts_fn`` as its two positional inputs
     (``--bestn 1 -m 5``), writing the alignments to
     ``<self.output_analysis_fn>.blasr.out.m5``.

     Returns:
         A list of all records read from the m5 file by BLASRM5Reader.
     """
     m5out = self.output_analysis_fn + ".blasr.out.m5"
     cmd = 'blasr %s %s --bestn 1 -m 5 --out %s' % \
           (self.isoseq_output_fa, self.reference_transcripts_fn, m5out)
     execute(cmd)
     # Read every record inside a context manager so the m5 file handle is
     # released as soon as the list is built instead of lingering until
     # garbage collection.
     with BLASRM5Reader(m5out) as reader:
         return list(reader)
Example #2
0
def blasr_against_ref2(output_filename,
                       is_FL,
                       sID_starts_with_c,
                       qver_get_func,
                       qvmean_get_func,
                       qv_prob_threshold=.03,
                       ece_penalty=1,
                       ece_min_len=20,
                       same_strand_only=True,
                       max_missed_start=200,
                       max_missed_end=50,
                       full_missed_start=50,
                       full_missed_end=30):
    """
    Read a BLASR m5 file and yield exactly one HitItem per alignment record.

    A rejected alignment is reported as a bare HitItem(qID, cID); an accepted
    one additionally carries qStart, qEnd, missed_q, missed_t, fakecigar and
    ece_arr.

    Excluding criteria, checked in this order:
    (1) self hit (cID equals qID)
    (2) opposite strand hit  (should already be in the same orientation;
        can override with <same_strand_only> set to False)
    (3) query not mapped fully: more than <full_missed_start> bp unaligned
        at the query start, or more than <full_missed_end> bp unaligned at
        the query end
    (4) if <is_FL>: alignment_missed_start_end_less_than_threshold() fails
        for the max_missed_*/full_missed_* thresholds
    (5) alignment_has_large_nonmatch() is true for the alignment's ece_arr
        given <ece_penalty> and <ece_min_len>

    output_filename --- path of the BLASR m5 output to read
    sID_starts_with_c --- if True, subject IDs must look like
                      c<cluster_index>[/...] or c<cluster_index>_ref and
                      cID is the integer cluster index; otherwise cID is
                      the subject ID unchanged
    qver_get_func --- forwarded to eval_blasr_alignment; should be
                      basQV.basQVcacher.get() or .get_smoothed(), or can
                      just pass in a function that always returns 1.
                      to ignore QV
    qvmean_get_func, qv_prob_threshold --- forwarded to
                      eval_blasr_alignment
    """
    with BLASRM5Reader(output_filename) as reader:
        for r in reader:
            # Number of query/subject bases left outside the alignment.
            missed_q = r.qStart + r.qLength - r.qEnd
            missed_t = r.sStart + r.sLength - r.sEnd

            if sID_starts_with_c:
                # because all consensus should start with
                # c<cluster_index>
                assert r.sID.startswith('c')
                if r.sID.find('/') > 0:
                    # Keep only the part before the first '/'; the record
                    # is updated in place and later passed on as-is.
                    r.sID = r.sID.split('/')[0]
                if r.sID.endswith('_ref'):
                    # probably c<cid>_ref
                    cID = int(r.sID[1:-4])
                else:
                    cID = int(r.sID[1:])
            else:
                cID = r.sID

            # self hit, useless!
            # opposite strand not allowed!
            if (cID == r.qID or (r.strand == '-' and same_strand_only)):
                yield HitItem(qID=r.qID, cID=cID)
                continue

            # regardless if whether is full-length (is_FL)
            # the query MUST be mapped fully (based on full_missed_start/end)
            if r.qStart > full_missed_start or (r.qLength -
                                                r.qEnd) > full_missed_end:
                yield HitItem(qID=r.qID, cID=cID)
                # Rejected: move on to the next record so this alignment
                # is not evaluated again and reported a second time.
                continue

            # full-length case: allow up to max_missed_start bp of 5' not aligned
            # and max_missed_end bp of 3' not aligned
            # non-full-length case: not really tested...don't use
            if is_FL and not alignment_missed_start_end_less_than_threshold(
                    r, max_missed_start, max_missed_end,
                    full_missed_start, full_missed_end):
                yield HitItem(qID=r.qID, cID=cID)
            else:
                cigar_str, ece_arr = eval_blasr_alignment(
                    record=r,
                    qver_get_func=qver_get_func,
                    qvmean_get_func=qvmean_get_func,
                    sID_starts_with_c=sID_starts_with_c,
                    qv_prob_threshold=qv_prob_threshold)

                if alignment_has_large_nonmatch(ece_arr, ece_penalty,
                                                ece_min_len):
                    yield HitItem(qID=r.qID, cID=cID)
                else:
                    # Accepted hit: missed_q / missed_t are reported as
                    # fractions of the query / subject length.
                    yield HitItem(qID=r.qID,
                                  cID=cID,
                                  qStart=r.qStart,
                                  qEnd=r.qEnd,
                                  missed_q=missed_q * 1. / r.qLength,
                                  missed_t=missed_t * 1. / r.sLength,
                                  fakecigar=cigar_str,
                                  ece_arr=ece_arr)