def cons_self_tandem_record(r_array, low_repeat_ratio=low_repeat_ratio):
    """Return the first record whose aligned length covers enough of the consensus.

    The code treats half of the read length reported by
    ``pb.get_read_op_length`` for the first record as the consensus length.

    Args:
        r_array: Alignment records of one read; the first record supplies
            the read length.
        low_repeat_ratio: Minimum ``aligned length / consensus length`` ratio
            a record must reach to be returned.

    Returns:
        The first record in ``r_array`` that reaches the ratio, or ``None``
        when ``r_array`` is empty or no record qualifies.
    """
    # Guard against empty input, as high_qual_record does, instead of
    # failing with IndexError on r_array[0].
    if not r_array:
        return None

    # Dividing by a float keeps the ratio below a true division even when
    # the helper returns an int.
    cons_len = pb.get_read_op_length(r_array[0]) / 2.0
    for r in r_array:
        if pb.get_aligned_read_length(r) / cons_len >= low_repeat_ratio:
            return r
    return None
def is_cons_partial(r_array):
    """Check that consecutive records agree on query start and aligned length.

    For every record a 1-based query start is computed in read orientation:
    forward-strand records use ``query_alignment_start + 1``; reverse-strand
    records use the length of the trailing soft clip plus one.

    Args:
        r_array: Alignment records exposing ``cigartuples``, ``is_reverse``
            and ``query_alignment_start``.

    Returns:
        ``False`` as soon as two neighbouring records differ by more than 10
        in query start or in aligned length, otherwise ``True`` (including
        for empty or single-record input).
    """
    soft_clip_op = 4  # CIGAR operation code of a soft clip
    max_diff = 10

    query_starts, aligned_lens = [], []
    for r in r_array:
        if r.is_reverse:
            last_op = r.cigartuples[-1]
            # An unclipped reverse record starts at position 1, not 0, so
            # that it is on the same 1-based scale as every other case.
            trailing_clip = last_op[1] if last_op[0] == soft_clip_op else 0
            q_start = trailing_clip + 1
        else:
            q_start = r.query_alignment_start + 1
        query_starts.append(q_start)
        aligned_lens.append(pb.get_aligned_read_length(r))

    for i in range(1, len(query_starts)):
        if (abs(query_starts[i] - query_starts[i - 1]) > max_diff
                or abs(aligned_lens[i] - aligned_lens[i - 1]) > max_diff):
            return False
    return True
def high_qual_record(r_array,
                     high_max_ratio=high_max_ratio,
                     high_min_ratio=high_min_ratio,
                     high_iden_ratio=high_iden_ratio):
    """Pick the high-quality record of a read, if there is one.

    A record is a candidate when its ``aligned length / consensus length``
    ratio lies in ``[high_min_ratio, high_max_ratio]`` and its identity ratio
    (``get_iden_ratio``) is at least ``high_iden_ratio``. The consensus length
    is half of the primary record's read length.

    Args:
        r_array: Alignment records of one read; the first one must be the
            primary alignment.
        high_max_ratio: Upper bound of the coverage ratio.
        high_min_ratio: Lower bound of the coverage ratio.
        high_iden_ratio: Minimum identity ratio.

    Returns:
        ``None`` when ``r_array`` is empty, when no record qualifies, or when
        any candidate maps to a reference whose name has six or more
        characters or starts with ``chrM``/``chrUn``. Otherwise the primary
        record if it is a candidate, else the candidate with the highest
        ``AS`` tag that does not overlap the primary record.
    """
    if not r_array:
        return None

    # Records are presumably pysam AlignedSegment objects -- confirm
    # against the caller.
    primary = r_array[0]
    if primary.is_secondary or primary.is_supplementary or primary.is_unmapped:
        ut.fatal_format_time('high_qual_record', 'Error: input SAM file is sorted or modified.')

    # 1-based inclusive reference interval of the primary record.
    primary_start = primary.reference_start + 1
    primary_end = primary_start + pb.get_ref_op_length(primary) - 1
    cons_len = (0.0 + pb.get_read_op_length(primary)) / 2

    top_rec = None
    top_score = -1
    primary_qualifies = False
    for idx, rec in enumerate(r_array):
        coverage = pb.get_aligned_read_length(rec) / cons_len
        identity = get_iden_ratio(rec)
        score = int(rec.get_tag('AS'))

        if not (high_min_ratio <= coverage <= high_max_ratio
                and identity >= high_iden_ratio):
            continue

        # A single candidate on an unwanted reference rejects the whole read.
        ref_name = rec.reference_name
        if len(ref_name) >= 6 or ref_name.startswith(('chrM', 'chrUn')):
            return None

        if score <= top_score:
            continue

        if idx == 0:
            primary_qualifies = True
        elif (ref_name == primary.reference_name
              and rec.reference_start + 1 <= primary_end
              and rec.reference_start + pb.get_ref_op_length(rec) >= primary_start):
            # A non-primary candidate must not overlap the primary record.
            continue
        top_rec, top_score = rec, score

    if top_rec is None:
        return None
    return primary if primary_qualifies else top_rec
def cons_repeat_record(r_array,
                       high_iden_ratio=high_iden_ratio,
                       high_repeat_ratio=high_repeat_ratio):
    """Find a record that is duplicated at (nearly) the same reference position.

    Two records form a pair when their 1-based reference starts differ by at
    most 10, their aligned lengths differ by at most 10, the first one's
    ``aligned length / consensus length`` ratio is at most
    ``high_repeat_ratio``, and both reach ``high_iden_ratio`` in
    ``get_iden_ratio``. The consensus length is half of the first record's
    read length.

    Args:
        r_array: Alignment records of one read.
        high_iden_ratio: Minimum identity ratio for both records of a pair.
        high_repeat_ratio: Maximum coverage ratio of the earlier record.

    Returns:
        The earlier record of the first matching pair, or ``None`` when
        ``r_array`` is empty or no pair matches.
    """
    # Guard against empty input, as high_qual_record does, instead of
    # failing with IndexError on r_array[0].
    if not r_array:
        return None

    max_diff = 10
    cons_len = (0.0 + pb.get_read_op_length(r_array[0])) / 2
    spans = [(r.reference_start + 1, pb.get_aligned_read_length(r))
             for r in r_array]

    n = len(spans)
    for i in range(n - 1):
        pos_i, len_i = spans[i]
        # The coverage check depends only on record i, so test it once
        # rather than once per pair.
        if len_i / cons_len > high_repeat_ratio:
            continue
        for j in range(i + 1, n):
            pos_j, len_j = spans[j]
            if abs(pos_i - pos_j) > max_diff or abs(len_i - len_j) > max_diff:
                continue
            # Identity is evaluated lazily and only for position-matched
            # pairs, because get_iden_ratio may report a fatal error.
            if (get_iden_ratio(r_array[i]) >= high_iden_ratio
                    and get_iden_ratio(r_array[j]) >= high_iden_ratio):
                return r_array[i]
    return None
def get_iden_ratio(r):
    """Compute the identity ratio of an alignment record.

    Insertion and deletion lengths are accumulated over the CIGAR and reset
    at every reference skip. If either running total exceeds
    ``max_ins_len`` / ``max_del_len`` the record is rejected with ``-1.0``.
    Otherwise the ratio is
    ``(aligned_len - NM + total_deletion_len) / aligned_len``.

    Args:
        r: Alignment record exposing ``cigartuples``, ``has_tag`` and
            ``get_tag``.

    Returns:
        ``-1.0`` for a record with too many inserted or deleted bases between
        reference skips, otherwise the identity ratio as a float.
    """
    ins_run, del_run = 0, 0
    for op, length in r.cigartuples:
        if op == pb.BAM_CINS:
            ins_run += length
            if ins_run > max_ins_len:
                return -1.0
        elif op == pb.BAM_CDEL:
            del_run += length
            if del_run > max_del_len:
                return -1.0
        elif op == pb.BAM_CREF_SKIP:
            # A reference skip starts a new block; restart both counters.
            ins_run, del_run = 0, 0

    aligned_len = pb.get_aligned_read_length(r)
    if not r.has_tag('NM'):
        ut.fatal_format_time('bam_classify', 'No NM tag found.\n')
    edit_dist = int(r.get_tag('NM'))
    total_del = int(pb.get_cigar_len(r, pb.BAM_CDEL))

    # Deleted bases are added back after subtracting NM -- presumably because
    # NM counts them while the aligned read length does not; confirm against
    # pb.get_aligned_read_length.
    matched = aligned_len - edit_dist + total_del
    return matched / (aligned_len + 0.0)
def cla_record(r_array, high_bam, low_bam,
               high_max_ratio=high_max_ratio,
               high_min_ratio=high_min_ratio,
               high_iden_ratio=high_iden_ratio,
               high_repeat_ratio=high_repeat_ratio,
               low_repeat_ratio=low_repeat_ratio):
    """Write one record of a read to either the high- or the low-quality output.

    Exactly one record is written per non-empty ``r_array``, except that
    nothing is written when a multi-record read has no qualifying record and
    no record that is neither secondary nor supplementary.

    Args:
        r_array: Alignment records of one read.
        high_bam: Output receiving high-quality records (needs ``write``).
        low_bam: Output receiving all other records (needs ``write``).
        high_max_ratio: Upper bound of the coverage ratio for high quality.
        high_min_ratio: Lower bound of the coverage ratio for high quality.
        high_iden_ratio: Minimum identity ratio for high quality.
        high_repeat_ratio: Passed to ``cons_repeat_record``.
        low_repeat_ratio: Passed to ``cons_self_tandem_record``.

    Returns:
        None.
    """
    if not r_array:
        return

    if len(r_array) == 1:  # single record
        rec = r_array[0]
        cons_len = (0.0 + pb.get_read_op_length(rec)) / 2
        mc = pb.get_aligned_read_length(rec) / cons_len
        # get_iden_ratio is only evaluated when the coverage check passes.
        if (high_min_ratio <= mc <= high_max_ratio
                and get_iden_ratio(rec) >= high_iden_ratio):
            high_bam.write(rec)
        else:
            # Every non-high single record goes to the low output,
            # whatever its coverage ratio.
            low_bam.write(rec)
        return

    # multiple records: try the selectors in order of preference
    high_r = high_qual_record(r_array, high_max_ratio, high_min_ratio,
                              high_iden_ratio)
    if high_r:
        high_bam.write(high_r)
        return

    rep_r = cons_repeat_record(r_array, high_iden_ratio, high_repeat_ratio)
    if rep_r:
        # TODO: decide whether repeat records belong in high_bam instead.
        low_bam.write(rep_r)
        return

    self_r = cons_self_tandem_record(r_array, low_repeat_ratio)
    if self_r:
        low_bam.write(self_r)
        return

    # Fallback: the first record that is neither secondary nor supplementary.
    for r in r_array:
        if not (r.is_secondary or r.is_supplementary):
            low_bam.write(r)
            break