def _merge_circular_aligment(target_1, start_pos_1, cigar_str_1, target_2, start_pos_2, cigar_str_2, is_reversed, qname): if is_reversed: # reverse back both cigar_str_1 = ''.join(reversed(cigar_str_1)) target_1 = butil.reverse_complement(target_1) cigar_str_2 = ''.join(reversed(cigar_str_2)) target_2 = butil.reverse_complement(target_2) if start_pos_1 == 0: start = start_pos_2 cigar = butil.rtrim_cigar(cigar_str_2) + butil.ltrim_cigar(cigar_str_1) target = target_2 + target_1 elif start_pos_2 == 0: start = start_pos_1 cigar = butil.rtrim_cigar(cigar_str_1) + butil.ltrim_cigar(cigar_str_2) target = target_1 + target_2 else: # not circular, duplicate logging.error("Duplicate read with name %s", qname) return None if is_reversed: cigar = ''.join(reversed(cigar)) target = butil.reverse_complement(target) return [target, start, cigar]
def extend_cigars_in_sam(sam_in, ref_path, fastx_path, sam_out=None):
    """Rewrite the cigar of every mapped record in a SAM file.

    For each mapped alignment whose read is present in the fastx file, the
    cigar string is replaced by the result of ``extend_cigar`` applied to the
    read sequence, the aligned slice of the reference, and the record's cigar
    tuples. Unmapped records are copied unchanged; records whose read name is
    missing from the fastx file are skipped with a warning.

    Args:
        sam_in: path of the SAM file to read.
        ref_path: path of the reference FASTA, loaded with
            ``butil.read_fasta``.
            NOTE(review): the result is sliced directly with reference
            coordinates, so a single reference sequence is presumably
            expected - confirm against ``butil.read_fasta``.
        fastx_path: path of the FASTA/FASTQ file holding the read sequences.
        sam_out: path of the SAM file to write. When ``None`` the output is
            written to a temporary file that then replaces ``sam_in``.
    """
    inplace = sam_out is None
    tmp_dir = None
    tmp_sam_out = sam_out
    if inplace:
        # inplace change using tmp file
        tmp_dir = tempfile.mkdtemp()
        tmp_sam_out = os.path.join(tmp_dir, 'tmp.sam')

    try:
        ref = butil.read_fasta(ref_path)

        # Only the sequence of each read is needed later, so keep just that.
        reads = {}
        with pysam.FastxFile(fastx_path, 'r') as fh:
            for r in fh:
                reads[r.name] = r.sequence

        with pysam.AlignmentFile(sam_in, "r") as in_sam, \
                pysam.AlignmentFile(tmp_sam_out, "w",
                                    template=in_sam) as out_sam:
            for x in tqdm(in_sam.fetch(), unit='reads'):
                if x.query_name not in reads:
                    logging.warning("read %s in sam not found in .fastx",
                                    x.query_name)
                    continue
                if x.is_unmapped:
                    logging.warning(
                        "read %s is unmapped, copy to out sam as is",
                        x.query_name)
                    out_sam.write(x)
                    continue

                read_seq = reads[x.query_name]
                ref_seq = ref[x.reference_start:x.reference_end]
                cigar_pairs = x.cigartuples

                if x.is_reverse:
                    # The fastx sequence is reverse-complemented for
                    # reverse-strand records before calling extend_cigar.
                    read_seq = butil.reverse_complement(read_seq)

                x.cigarstring = extend_cigar(read_seq, ref_seq, cigar_pairs)
                out_sam.write(x)

        if inplace:
            # Both files are closed here; replace the input with the result.
            shutil.move(tmp_sam_out, sam_in)
    finally:
        if tmp_dir is not None:
            # clear tmp files, also when processing failed half-way
            shutil.rmtree(tmp_dir)
def get_target_sequences(sam_path):
    """Collect the reference (target) sequence of every mapped read.

    Iterates over the alignments in ``sam_path`` and, for each mapped record
    whose cigar-derived read and reference lengths agree with the lengths
    reported by the record, stores the reference sequence it aligns to.
    Reverse-strand records are stored with the target reverse-complemented
    and the cigar tuples reversed. When a read name is seen more than once,
    the new record is merged with the stored one through
    ``_merge_circular_aligment``; records that cannot be merged are dropped.

    Args:
        sam_path: path of the SAM file to read.

    Returns:
        dict mapping read name to
        ``[target, ref_name, start_pos, length, cigar_str]``.
    """
    targets = {}
    stats = defaultdict(int)

    with pysam.AlignmentFile(sam_path, "r") as samfile:
        for aln in tqdm(samfile.fetch(), desc='Building ref'):
            name = aln.query_name
            stats['total'] += 1

            if aln.is_unmapped:
                stats['unmapped'] += 1
                continue

            try:
                # hack to bypass segfault
                expanded = butil.decompress_cigar_pairs(aln.cigartuples)
                read_len = butil.get_read_len_from_cigar(expanded)
                ref_len = butil.get_ref_len_from_cigar(expanded)
                lengths_agree = (read_len == aln.query_length
                                 and ref_len == aln.reference_length)
                if not lengths_agree:
                    logging.error(
                        "%s cigar operations do not match alignment info in md",
                        name)
                    stats['invalid_md_cigar'] += 1
                    continue
                target = aln.get_reference_sequence()
            except (ValueError, AssertionError):
                stats['missign_ref'] += 1
                logging.error(
                    "%s Mapped but reference len equals 0, md tag: %s", name,
                    aln.has_tag('MD'))
                continue

            ref_name = aln.reference_name
            length = aln.reference_length
            start_pos = aln.reference_start
            cigar_pairs = aln.cigartuples

            if aln.is_reverse:
                # Store reverse-strand hits in the orientation of the read.
                target = butil.reverse_complement(target)
                cigar_pairs = list(reversed(cigar_pairs))

            cigar_str = butil.decompress_cigar_pairs(cigar_pairs, mode='ints')

            previous = targets.get(name)
            if previous is not None:
                # Same read seen before: try to stitch the two pieces.
                prev_target, _, prev_start_pos, _, prev_cigar_str = previous
                merged = _merge_circular_aligment(
                    prev_target, prev_start_pos, prev_cigar_str,
                    target, start_pos, cigar_str,
                    aln.is_reverse, aln.query_name)
                if not merged:
                    continue
                target, start_pos, cigar_str = merged
                length = len(target)

            targets[name] = [target, ref_name, start_pos, length, cigar_str]

    logging.warning("Results: %s", str(stats.items()))
    return targets