Beispiel #1
0
def cons_self_tandem_record(r_array, low_repeat_ratio=low_repeat_ratio):
    """Return the first record whose aligned length covers enough of the consensus.

    The consensus length is taken as half of ``pb.get_read_op_length`` of the
    first record (presumably because the read carries two consensus copies --
    TODO confirm against the caller).

    Args:
        r_array: Sequence of alignment records belonging to one read.
        low_repeat_ratio: Minimum ``aligned_length / consensus_length`` ratio
            a record must reach to be returned.

    Returns:
        The first record in ``r_array`` order reaching the ratio, or ``None``
        when no record qualifies or ``r_array`` is empty.
    """
    # Guard against empty input instead of failing on r_array[0]; this mirrors
    # the "nothing found" result used for every other miss.
    if not r_array:
        return None

    cons_len = (0.0 + pb.get_read_op_length(r_array[0])) / 2
    for r in r_array:
        if pb.get_aligned_read_length(r) / cons_len >= low_repeat_ratio:
            return r
    return None
Beispiel #2
0
def is_cons_partial(r_array):
    """Check that neighbouring records agree on query start and aligned length.

    For every record a query start offset is derived:
      * reverse-strand record: ``0`` when the last CIGAR operation is not a
        soft clip (op code 4), otherwise that clip's length plus one;
      * forward-strand record: ``query_alignment_start + 1``.
    The aligned length comes from ``pb.get_aligned_read_length``.

    Returns:
        ``False`` as soon as two consecutive records differ by more than 10 in
        either the query start or the aligned length; ``True`` otherwise
        (including for zero or one record).
    """
    # NOTE(review): the unclipped reverse-strand case yields 0 whereas the
    # other branches add 1 -- confirm this offset difference is intended.
    soft_clip_op = 4
    tolerance = 10

    observed = []  # (query start, aligned length) per record, in input order
    for rec in r_array:
        if rec.is_reverse:
            tail = rec.cigartuples[-1]
            start = tail[1] + 1 if tail[0] == soft_clip_op else 0
        else:
            start = rec.query_alignment_start + 1
        observed.append((start, pb.get_aligned_read_length(rec)))

    for (prev_start, prev_len), (cur_start, cur_len) in zip(observed,
                                                           observed[1:]):
        if (abs(cur_start - prev_start) > tolerance
                or abs(cur_len - prev_len) > tolerance):
            return False
    return True
Beispiel #3
0
def high_qual_record(r_array,
                     high_max_ratio=high_max_ratio,
                     high_min_ratio=high_min_ratio,
                     high_iden_ratio=high_iden_ratio):
    """Pick the record to report as the high-quality alignment of a read.

    A record qualifies when its aligned-length / consensus-length ratio lies
    in ``[high_min_ratio, high_max_ratio]`` and ``get_iden_ratio`` is at least
    ``high_iden_ratio``.  The consensus length is half of
    ``pb.get_read_op_length`` of the first record.

    Args:
        r_array: Records of one read; the first entry must be the primary
            alignment (not secondary, supplementary or unmapped), otherwise
            ``ut.fatal_format_time`` is invoked.
        high_max_ratio: Upper bound of the accepted length ratio.
        high_min_ratio: Lower bound of the accepted length ratio.
        high_iden_ratio: Minimum identity ratio.

    Returns:
        ``None`` for empty input, when nothing qualifies, or when any
        qualifying record sits on a reference whose name has six or more
        characters or starts with ``chrM`` / ``chrUn``.  Otherwise the primary
        record if it qualified, else the best-scoring (``AS`` tag) qualifying
        record that does not overlap the primary alignment.
    """
    if not r_array:
        return None

    primary = r_array[0]
    if primary.is_secondary or primary.is_supplementary or primary.is_unmapped:
        ut.fatal_format_time('high_qual_record',
                             'Error: input SAM file is sorted or modified.')

    # 1-based inclusive reference interval covered by the primary alignment.
    primary_start = primary.reference_start + 1
    primary_end = primary_start + pb.get_ref_op_length(primary) - 1
    cons_len = (0.0 + pb.get_read_op_length(primary)) / 2

    top_record = None
    top_score = -1
    primary_qualifies = False
    for idx, rec in enumerate(r_array):
        # All three metrics are computed for every record, qualifying or not.
        coverage = pb.get_aligned_read_length(rec) / cons_len
        identity = get_iden_ratio(rec)
        score = int(rec.get_tag('AS'))

        if not (high_min_ratio <= coverage <= high_max_ratio
                and identity >= high_iden_ratio):
            continue

        # A qualifying hit on an excluded reference rejects the whole read.
        if len(rec.reference_name) >= 6 or rec.reference_name.startswith(
                ('chrM', 'chrUn')):
            return None

        if not score > top_score:
            continue

        if idx == 0:
            primary_qualifies = True
            top_record, top_score = rec, score
        # A non-primary record only counts when it does NOT overlap the
        # primary alignment.
        elif (rec.reference_name != primary.reference_name
              or rec.reference_start + 1 > primary_end
              or rec.reference_start + pb.get_ref_op_length(rec) <
              primary_start):
            top_record, top_score = rec, score

    # top_record stays None when nothing was accepted.
    return primary if primary_qualifies else top_record
Beispiel #4
0
def cons_repeat_record(r_array,
                       high_iden_ratio=high_iden_ratio,
                       high_repeat_ratio=high_repeat_ratio):
    """Find a record that is mirrored by another record at nearly the same place.

    Two records form a matching pair when their 1-based reference starts and
    their aligned lengths each differ by at most 10, the first record's
    aligned-length / consensus-length ratio is at most ``high_repeat_ratio``,
    and both records reach ``high_iden_ratio`` according to
    ``get_iden_ratio``.  The consensus length is half of
    ``pb.get_read_op_length`` of the first record.

    Args:
        r_array: Sequence of alignment records belonging to one read.
        high_iden_ratio: Minimum identity ratio required of both records.
        high_repeat_ratio: Maximum length ratio allowed for the first record
            of the pair.

    Returns:
        The earlier record of the first matching pair (pairs are scanned in
        ``(i, j)`` order with ``i < j``), or ``None`` when there is no such
        pair or ``r_array`` is empty.
    """
    # Guard against empty input instead of failing on r_array[0].
    if not r_array:
        return None

    cons_len = (0.0 + pb.get_read_op_length(r_array[0])) / 2

    map_pos, map_len = [], []
    for r in r_array:
        map_pos.append(r.reference_start + 1)
        map_len.append(pb.get_aligned_read_length(r))

    # NOTE(review): only start positions are compared, not reference names,
    # so records on different references can pair up -- confirm intended.
    count = len(r_array)
    for i in range(count - 1):
        for j in range(i + 1, count):
            if abs(map_pos[i] - map_pos[j]) > 10:
                continue
            if abs(map_len[i] - map_len[j]) > 10:
                continue
            if map_len[i] / cons_len > high_repeat_ratio:
                continue
            # Identity is checked last, and for j only when i already passed.
            if (get_iden_ratio(r_array[i]) >= high_iden_ratio
                    and get_iden_ratio(r_array[j]) >= high_iden_ratio):
                return r_array[i]
    return None
Beispiel #5
0
def get_iden_ratio(r):
    """Compute the identity ratio of an alignment record.

    The CIGAR is scanned first: insertion and deletion lengths are summed
    separately, both sums restart at every reference-skip operation, and the
    function returns ``-1.0`` as soon as the insertion sum exceeds
    ``max_ins_len`` or the deletion sum exceeds ``max_del_len``.

    Otherwise the result is ``(aligned_len - NM + deleted) / aligned_len``
    where ``aligned_len`` is ``pb.get_aligned_read_length(r)``, ``NM`` is the
    record's ``NM`` tag and ``deleted`` is ``pb.get_cigar_len(r, pb.BAM_CDEL)``.
    ``ut.fatal_format_time`` is invoked when the ``NM`` tag is missing.
    """
    inserted = deleted = 0
    for op, length in r.cigartuples:
        if op == pb.BAM_CINS:
            inserted += length
            if inserted > max_ins_len:
                return -1.0
        elif op == pb.BAM_CDEL:
            deleted += length
            if deleted > max_del_len:
                return -1.0
        elif op == pb.BAM_CREF_SKIP:
            # A reference skip starts a fresh block for both running sums.
            inserted = deleted = 0

    aligned_len = pb.get_aligned_read_length(r)
    if not r.has_tag('NM'):
        ut.fatal_format_time('bam_classify', 'No NM tag found.\n')
    edit_distance = int(r.get_tag('NM'))
    # Total deleted bases over the whole CIGAR, independent of the sums above.
    total_deleted = int(pb.get_cigar_len(r, pb.BAM_CDEL))
    matched = aligned_len - edit_distance + total_deleted
    return matched / (aligned_len + 0.0)
Beispiel #6
0
def cla_record(r_array,
               high_bam,
               low_bam,
               high_max_ratio=high_max_ratio,
               high_min_ratio=high_min_ratio,
               high_iden_ratio=high_iden_ratio,
               high_repeat_ratio=high_repeat_ratio,
               low_repeat_ratio=low_repeat_ratio):
    """Write one representative record of a read to the high or low output.

    Single record: it goes to ``high_bam`` when its aligned-length /
    consensus-length ratio lies in ``[high_min_ratio, high_max_ratio]`` and
    ``get_iden_ratio`` is at least ``high_iden_ratio``; otherwise it goes to
    ``low_bam``.

    Several records: the first non-empty result of the following wins:
      1. ``high_qual_record``        -> ``high_bam``
      2. ``cons_repeat_record``      -> ``low_bam``
      3. ``cons_self_tandem_record`` -> ``low_bam``
      4. the first record that is neither secondary nor supplementary
         -> ``low_bam``

    Args:
        r_array: Records belonging to one read; nothing is written when empty.
        high_bam: Output object with a ``write(record)`` method.
        low_bam: Output object with a ``write(record)`` method.
        high_max_ratio, high_min_ratio, high_iden_ratio, high_repeat_ratio,
        low_repeat_ratio: Thresholds forwarded to the helper functions.

    Returns:
        None.
    """
    if not r_array:
        return

    if len(r_array) == 1:  # single record
        only = r_array[0]
        cons_len = (0.0 + pb.get_read_op_length(only)) / 2
        mc = pb.get_aligned_read_length(only) / cons_len
        if (high_min_ratio <= mc <= high_max_ratio
                and get_iden_ratio(only) >= high_iden_ratio):
            high_bam.write(only)
        else:
            # The former `mc >= low_repeat_ratio` / else split wrote to
            # low_bam in both arms, so it is collapsed into one branch.
            low_bam.write(only)
        return

    # multiple records
    high_r = high_qual_record(r_array, high_max_ratio, high_min_ratio,
                              high_iden_ratio)
    if high_r:
        high_bam.write(high_r)
        return

    rep_r = cons_repeat_record(r_array, high_iden_ratio, high_repeat_ratio)
    if rep_r:
        # TODO: decide whether this record should go to high_bam instead.
        low_bam.write(rep_r)
        return

    self_r = cons_self_tandem_record(r_array, low_repeat_ratio)
    if self_r:
        low_bam.write(self_r)
        return

    # Fall back to the first primary (non-secondary, non-supplementary) record.
    for r in r_array:
        if not (r.is_secondary or r.is_supplementary):
            low_bam.write(r)
            break