def add_tags(alignedSegment: pysam.AlignedSegment) -> pysam.AlignedSegment:
    """
    Add alignment length, percent identity and query coverage tags to a segment.

    Tags written (the segment is modified in place and also returned):

    * ``al`` (int): alignment length = aligned columns + insertions + deletions,
      where aligned columns are CIGAR ``M``, ``=`` and ``X`` operations
    * ``id`` (float): percent identity = (al - NM) / al, expressed as a
      fraction (nominally 0.0 to 1.0) rather than a percentage
    * ``qc`` (float): query coverage = (aligned columns + insertions) divided
      by the read length inferred from the CIGAR string

    If the segment already carries an ``id`` tag it is returned untouched.
    If the segment is unmapped, all three tags are set to zero.
    If the alignment length (or the inferred read length) is zero or
    unavailable, the corresponding ratio is set to 0.0 instead of raising.

    :param alignedSegment: The pysam AlignedSegment object
    :return: alignedSegment: The updated pysam AlignedSegment object
    :raises KeyError: if a mapped segment has no ``NM`` tag
    """
    # Assuming that if the id tag is present that the other tags are also there.
    if alignedSegment.has_tag('id'):
        return alignedSegment

    if alignedSegment.is_unmapped:
        alignedSegment.set_tag('id', 0.0, 'f')
        alignedSegment.set_tag('al', 0, 'i')
        alignedSegment.set_tag('qc', 0.0, 'f')
        return alignedSegment

    # Per-operation base counts, ordered "MIDNSHP=XB" followed by NM.
    # Fetch once; the original recomputed the stats for every use.
    op_counts = alignedSegment.get_cigar_stats()[0]

    # Aligners emitting extended CIGARs report matches/mismatches as '=' (7)
    # and 'X' (8) instead of 'M' (0); all three are aligned columns.
    aligned_columns = int(op_counts[0]) + int(op_counts[7]) + int(op_counts[8])
    insertions = int(op_counts[1])
    deletions = int(op_counts[2])

    alnlength = aligned_columns + insertions + deletions
    query_covered_bases = aligned_columns + insertions
    query_length = alignedSegment.infer_read_length()
    mismatches = alignedSegment.get_tag('NM')

    # Guard the divisions: a degenerate CIGAR would otherwise raise.
    percid = (alnlength - mismatches) / float(alnlength) if alnlength else 0.0
    # infer_read_length() may return None when no CIGAR is available.
    qcov = query_covered_bases / float(query_length) if query_length else 0.0

    alignedSegment.set_tag('id', percid, 'f')
    alignedSegment.set_tag('qc', qcov, 'f')
    alignedSegment.set_tag('al', alnlength, 'i')
    return alignedSegment
def from_aligned_segment(cls, align: pysam.AlignedSegment) -> "AlignmentRecord":
    """Build a record from a pysam aligned segment.

    The query name must have the form ``<read_name>:<read_idx>:<align_idx>``;
    the two indices are parsed as integers.

    Unmapped segments get ``align_type="unmapped"``, chromosome ``"NULL"``,
    zero coordinates and a zero alignment score, and their base quality is
    averaged over the whole read. Mapped segments take coordinates from the
    reference fields, the score from the ``AS`` tag, and their base quality
    is averaged over the aligned part of the read only.

    When the segment carries no base qualities, the mean base qscore falls
    back to 0 for both mapped and unmapped segments.

    The optional ``HP``, ``PS`` and ``PC`` tags, when present, are passed
    through as integer ``haplotype``, ``phase_set`` and ``phase_qual``.

    :param align: The pysam AlignedSegment to convert
    :return: A new instance of ``cls`` populated from ``align``
    :raises ValueError: if the query name does not split into exactly three
        ``:``-separated fields or the indices are not integers
    :raises KeyError: if a mapped segment has no ``AS`` tag
    """
    read_name, read_idx, align_idx = align.query_name.split(":")
    read_idx, align_idx = int(read_idx), int(align_idx)

    if align.is_unmapped:
        align_cat = "unmapped"
        chrom, start, end, align_score = "NULL", 0, 0, 0
        read_length = align.query_length
        quals = align.query_qualities
    else:
        chrom, start, end = (
            align.reference_name,
            align.reference_start,
            align.reference_end,
        )
        read_length = align.infer_read_length()
        align_score = align.get_tag("AS")
        # Only the aligned portion of the read contributes to the qscore.
        quals = align.query_alignment_qualities
        if align.is_secondary:
            align_cat = "secondary"
        elif align.is_supplementary:
            align_cat = "supplementary"
        else:
            align_cat = "primary"

    # Qualities may be absent; previously only the unmapped branch guarded
    # against this.
    # TODO: handle this more gracefully
    if quals is None:
        align_base_qscore = 0
    else:
        align_base_qscore = mean_qscore(np.array(quals))

    # Phasing tags are optional; only forward the ones that are present.
    optional = {}
    for key, tag in [("haplotype", "HP"), ("phase_set", "PS"), ("phase_qual", "PC")]:
        if align.has_tag(tag):
            optional[key] = int(align.get_tag(tag))

    return cls(
        read_idx=read_idx,
        align_idx=align_idx,
        align_type=align_cat,
        chrom=chrom,
        start=start,
        end=end,
        strand=not align.is_reverse,  # True for forward-strand alignments
        read_name=read_name,
        read_length=read_length,
        read_start=align.query_alignment_start,
        read_end=align.query_alignment_end,
        mapping_quality=align.mapq,
        align_score=align_score,
        align_base_qscore=np.rint(align_base_qscore),
        **optional,
    )