def split_aligment(x, split_length=CIGAR_OPS_LIMIT):
    """Split an alignment into consecutive pieces with bounded CIGAR length.

    The CIGAR of ``x`` is expanded with ``butil.decompress_cigar_pairs`` and
    cut into slices of at most ``split_length`` entries. Each slice becomes a
    deep copy of ``x`` whose reference start, CIGAR, MD tag and query
    sequence are rewritten to cover only that slice.

    Args:
        x: Alignment record; must provide ``query_length``, ``cigar``,
            ``query_sequence``, ``reference_start``, ``set_tag`` and
            ``get_reference_sequence()``.
        split_length: Maximum number of expanded CIGAR entries per piece.

    Returns:
        ``[x]`` unchanged when ``x.query_length < split_length``; otherwise a
        list of modified deep copies of ``x``, in alignment order.
    """
    query_len = x.query_length
    reference = x.get_reference_sequence()
    ops = butil.decompress_cigar_pairs(x.cigar)

    # NOTE(review): the guard compares the *read* length, not the expanded
    # CIGAR length, against split_length -- confirm this is intended.
    if query_len < split_length:
        return [x]

    pieces = []
    query_offset = 0  # how much of the query earlier pieces consumed
    ref_offset = 0    # how much of the reference earlier pieces consumed
    for chunk_start in range(0, len(ops), split_length):
        chunk_ops = ops[chunk_start:chunk_start + split_length]
        query_stop = query_offset + butil.get_read_len_from_cigar(chunk_ops)

        piece = deepcopy(x)
        piece.reference_start += ref_offset
        piece.cigar = butil.compress_cigar(chunk_ops, 'ints')
        # The MD tag is regenerated from the reference remaining at this
        # piece's start together with the piece's freshly assigned CIGAR.
        piece.set_tag(
            'MD', butil.generate_md_tag(reference[ref_offset:], piece.cigar))
        piece.query_sequence = x.query_sequence[query_offset:query_stop]

        query_offset = query_stop
        ref_offset += butil.get_ref_len_from_cigar(chunk_ops)
        pieces.append(piece)
    return pieces
def extend_cigar(read_seq, ref_seq, cigar_pairs, mode='ints'):
    """Rewrite a CIGAR so every ``M`` becomes ``=`` (match) or ``X`` (mismatch).

    The CIGAR is expanded with ``butil.decompress_cigar_pairs`` and both
    sequences are laid out against it with ``butil.reference_align_string``
    and ``butil.query_align_string``. Each ``M`` entry is then resolved by a
    case-insensitive comparison of the two aligned characters at that
    position; every other entry is kept, upper-cased.

    Args:
        read_seq: Query (read) sequence.
        ref_seq: Reference sequence.
        cigar_pairs: CIGAR operations as pairs, in the encoding named by
            ``mode``.
        mode: Encoding passed to ``butil.decompress_cigar_pairs``.

    Returns:
        The result of ``butil.cigar_pairs_to_str(..., 'chars')`` applied to
        the re-compressed, resolved CIGAR.

    Raises:
        AssertionError: If either aligned string differs in length from the
            expanded CIGAR.
    """
    ops = butil.decompress_cigar_pairs(cigar_pairs, mode)
    # NOTE(review): ``mode`` is not forwarded to the two helpers below;
    # presumably they rely on their own default -- confirm against butil.
    aligned_ref = butil.reference_align_string(ref_seq, cigar_pairs)
    aligned_read = butil.query_align_string(read_seq, cigar_pairs)

    n_ops = len(ops)
    assert len(aligned_ref) == n_ops and len(aligned_read) == n_ops

    resolved = []
    for pos, op in enumerate(ops):
        symbol = op.upper()
        if symbol == 'M':
            same_base = aligned_ref[pos].upper() == aligned_read[pos].upper()
            symbol = '=' if same_base else 'X'
        resolved.append(symbol)

    extended_pairs = butil.compress_cigar(''.join(resolved))
    return butil.cigar_pairs_to_str(extended_pairs, 'chars')
def get_target_sequences(sam_path):
    """Collect the reference (target) sequence for each mapped read in a SAM file.

    Every record yielded by ``samfile.fetch()`` is visited. Unmapped records
    are skipped, as are records whose CIGAR-derived lengths disagree with the
    record's ``query_length`` / ``reference_length``, and records for which
    the validation or ``get_reference_sequence()`` raises ``ValueError`` or
    ``AssertionError``. For reverse-strand records the target is
    reverse-complemented and the CIGAR pairs are reversed. When a read name
    is seen again, the new alignment is combined with the stored one via
    ``_merge_circular_aligment``; if that returns a falsy value the stored
    entry is left untouched.

    Args:
        sam_path: Path handed to ``pysam.AlignmentFile(sam_path, "r")``.

    Returns:
        dict mapping read name to
        ``[target, ref_name, start_pos, length, cigar_str]``.
    """
    targets = {}
    stats = defaultdict(int)

    with pysam.AlignmentFile(sam_path, "r") as samfile:
        for record in tqdm(samfile.fetch(), desc='Building ref'):
            name = record.query_name
            stats['total'] += 1

            if record.is_unmapped:
                stats['unmapped'] += 1
                continue

            try:
                # Original author's note: "hack to bypass segfault". The
                # CIGAR-derived lengths are checked against the record before
                # the reference sequence is requested.
                expanded = butil.decompress_cigar_pairs(record.cigartuples)
                cigar_read_len = butil.get_read_len_from_cigar(expanded)
                cigar_ref_len = butil.get_ref_len_from_cigar(expanded)
                lengths_agree = (cigar_read_len == record.query_length
                                 and cigar_ref_len == record.reference_length)
                if not lengths_agree:
                    logging.error(
                        "%s cigar operations do not match alignment info in md",
                        name)
                    stats['invalid_md_cigar'] += 1
                    continue
                target = record.get_reference_sequence()
            except (ValueError, AssertionError):
                stats['missign_ref'] += 1
                logging.error(
                    "%s Mapped but reference len equals 0, md tag: %s", name,
                    record.has_tag('MD'))
                continue

            ref_name = record.reference_name
            length = record.reference_length
            start_pos = record.reference_start
            cigar_pairs = record.cigartuples

            if record.is_reverse:
                target = butil.reverse_complement(target)
                cigar_pairs = list(reversed(cigar_pairs))

            cigar_str = butil.decompress_cigar_pairs(cigar_pairs, mode='ints')

            # First alignment for this read: store it as-is.
            if name not in targets:
                targets[name] = [
                    target, ref_name, start_pos, length, cigar_str
                ]
                continue

            # Read seen before: try to merge with the stored alignment.
            previous = targets[name]
            merged = _merge_circular_aligment(previous[0], previous[2],
                                              previous[4], target, start_pos,
                                              cigar_str, record.is_reverse,
                                              record.query_name)
            if not merged:
                continue

            target, start_pos, cigar_str = merged
            targets[name] = [
                target, ref_name, start_pos, len(target), cigar_str
            ]

    logging.warning("Results: %s", str(stats.items()))
    return targets