def merge_annotated_clusters(
        biggest: pysam.AlignedSegment,
        other: pysam.AlignedSegment) -> pysam.AlignedSegment:
    """Fold the smaller annotated cluster into the larger one.

    Both arguments are annotated aligned segments that each stand for a
    cluster. The larger segment is updated in place: its cluster ID gets a
    trailing "+" (added only once) and its read count becomes the sum of
    both clusters' read counts. The smaller segment is only read from.

    Args:
        biggest: The larger of the 2 clusters, with a higher read number.
        other: The smaller of the 2 clusters, with a lower read number.

    Returns:
        `biggest`, carrying the merged cluster ID and read count.
    """
    cluster_id = biggest.get_tag(CLUSTER_ID_TAG)
    # A trailing "+" marks the ID as merged; never append it twice.
    suffix = "" if cluster_id.endswith("+") else "+"
    biggest.set_tag(CLUSTER_ID_TAG, cluster_id + suffix, "Z")

    biggest.set_tag(
        NUM_READS_TAG,
        biggest.get_tag(NUM_READS_TAG) + other.get_tag(NUM_READS_TAG),
        "i",
    )
    return biggest
def check_read_quality(sam_record: pysam.AlignedSegment, run_info):
    """Evaluate one SAM read against the run's quality thresholds.

    Returns a 7-element list:
    [dataset, read ID, passed QC (0/1), flag accepted (0/1), read length,
    coverage, identity]. Coverage and identity are "NA" when the read is
    rejected before they are computed.

    Raises:
        ValueError: if a read that passed the flag and length checks has
            no MD tag.
    """
    name = sam_record.query_name
    sam_flag = sam_record.flag
    cigar_string = sam_record.cigarstring
    aligned_seq = sam_record.query
    length = sam_record.query_length
    read_group = sam_record.get_tag('RG')

    def qc_row(passed, flag_ok, cov="NA", ident="NA"):
        # Single place that fixes the column order of the result.
        return [read_group, name, passed, flag_ok, length, cov, ident]

    # Only flags 0 and 16 are accepted (the original comment describes
    # these as uniquely mapped transcripts).
    if sam_flag not in (0, 16):
        return qc_row(0, 0)

    # Reads shorter than the length threshold are rejected.
    if length < run_info.min_length:
        return qc_row(0, 1)

    # The MD tag is required to compute alignment identity.
    try:
        md = sam_record.get_tag('MD')
    except KeyError:
        raise ValueError("SAM transcript %s lacks an MD tag" % name)

    cov = compute_alignment_coverage(cigar_string)
    ident = compute_alignment_identity(md, aligned_seq)

    # The read passes only if neither metric is below its cutoff.
    below_cutoff = (cov < run_info.min_coverage
                    or ident < run_info.min_identity)
    return qc_row(0 if below_cutoff else 1, 1, cov, ident)
def parse_chromium_bamread_metadata(alignment: pysam.AlignedSegment):
    """
    Return (read name, cell barcode, UMI) for a 10x Chromium BAM record.

    The cell barcode is read from the 'CB' tag and the UMI from the 'UB'
    tag, which per the 10x Genomics BAM documentation hold the
    error-corrected values:
    'https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/output/bam'

    A KeyError from pysam propagates if either tag is absent.
    """
    cellbarcode, umi = (alignment.get_tag(tag) for tag in ('CB', 'UB'))
    return alignment.query_name, cellbarcode, umi
def from_aligned_segment(cls, align: pysam.AlignedSegment) -> "AlignmentRecord":
    """Build a record from a pysam aligned segment.

    The query name is expected to look like
    ``<read_name>:<read_idx>:<align_idx>`` where the last two fields are
    integers. The name is split from the right, so a read name that itself
    contains ":" is kept intact.

    For unmapped segments the coordinates and alignment score are filled
    with placeholders ("NULL", 0, 0, 0) and the base quality is averaged
    over the whole read; for mapped segments it is averaged over the
    aligned part only. If the segment carries no base qualities the mean
    quality is reported as 0 in both cases.

    The optional HP / PS / PC tags, when present, are passed through as
    the integer keyword arguments ``haplotype``, ``phase_set`` and
    ``phase_qual``.

    Raises:
        ValueError: if the query name does not have the expected form.
        KeyError: if a mapped segment has no "AS" tag.
    """
    # rsplit keeps any ":" that belongs to the original read name.
    read_name, read_idx, align_idx = align.query_name.rsplit(":", 2)
    read_idx, align_idx = int(read_idx), int(align_idx)

    if align.is_unmapped:
        align_cat = "unmapped"
        chrom, start, end, align_score = "NULL", 0, 0, 0
        read_length = align.query_length
        quals = align.query_qualities
    else:
        chrom, start, end = (align.reference_name, align.reference_start,
                             align.reference_end)
        read_length = align.infer_read_length()
        align_score = align.get_tag("AS")
        quals = align.query_alignment_qualities
        if align.is_secondary:
            align_cat = "secondary"
        elif align.is_supplementary:
            align_cat = "supplementary"
        else:
            align_cat = "primary"

    # Qualities may be absent; treat that the same way in both the mapped
    # and unmapped case instead of handing None to numpy.
    if quals is None:
        align_base_qscore = 0
    else:
        align_base_qscore = mean_qscore(np.array(quals))

    optional = {}
    for key, tag in [("haplotype", "HP"), ("phase_set", "PS"),
                     ("phase_qual", "PC")]:
        if align.has_tag(tag):
            optional[key] = int(align.get_tag(tag))

    return cls(
        read_idx=read_idx,
        align_idx=align_idx,
        align_type=align_cat,
        chrom=chrom,
        start=start,
        end=end,
        strand=not align.is_reverse,
        read_name=read_name,
        read_length=read_length,
        read_start=align.query_alignment_start,
        read_end=align.query_alignment_end,
        # mapping_quality is the non-deprecated name for mapq.
        mapping_quality=align.mapping_quality,
        align_score=align_score,
        align_base_qscore=np.rint(align_base_qscore),
        **optional,
    )
def get_qc_fail(rec: pysam.AlignedSegment) -> Optional[Tuple[str, str]]:
    """Gets the tool and reason for why the QC fail flag is set.

    Returns None if the QC fail flag is not set. None is also returned if
    the flag is set but the tool tag or the reason tag is missing, so a
    non-None result always contains both values. Use the
    `pysam.AlignedSegment.is_qcfail` property to check whether the record
    is simply QC failed.

    Args:
        rec: the record to inspect

    Returns:
        A `(tool, reason)` tuple read from the QC-fail SAM tags, or None.
    """
    if not rec.is_qcfail:
        return None
    # Both tags must be present; previously only the tool tag was checked
    # and a missing reason tag surfaced as a KeyError from get_tag.
    if not (rec.has_tag(QcFailToolTag) and rec.has_tag(QcFailReasonTag)):
        return None
    return (rec.get_tag(QcFailToolTag), rec.get_tag(QcFailReasonTag))
def get_introns(sam_record: pysam.AlignedSegment, start, cigar):
    """Return the intron coordinates of a SAM record.

    Reads the jI tag of the record if present; otherwise the jI string is
    computed from the CIGAR string and start position. Note that positions
    refer to start and endpoints of introns, not exons, so adjustments are
    needed to avoid an off-by-one error if you want exons.

    Example jI strings:
        no introns: jI:B:i,-1
        two introns: jI:B:i,167936516,167951806,167951862,167966628

    Args:
        sam_record: a pysam AlignedSegment
        start: The start position of the transcript with respect to the
            forward strand
        cigar: SAM CIGAR string describing match operations to the
            reference genome

    Returns:
        intron_list: intron starts and ends in a list (sorted order).
            An empty list is returned when there are no introns, i.e. the
            first value is -1 or no values are available at all.
    """
    try:
        jI_values = sam_record.get_tag("jI")
    except KeyError:
        # No jI tag on the record: derive it and drop the "jI:B:i" prefix.
        jI = compute_jI(start, cigar)
        intron_list = [int(x) for x in jI.split(",")[1:]]
    else:
        intron_list = jI_values.tolist()

    # -1 is the "no introns" marker; an empty list is treated the same way
    # rather than raising IndexError on intron_list[0].
    if not intron_list or intron_list[0] == -1:
        return []
    return intron_list
def add_tags(alignedSegment: pysam.AlignedSegment) -> pysam.AlignedSegment:
    """
    Takes an AlignedSegment and adds percent identity, query coverage and
    alignment length as tags.

    alignment length ('al') = M + I + D operations of the CIGAR
    mismatches               = NM tag
    percent identity ('id')  = (MID - NM) / MID
    query coverage ('qc')    = (M + I) / inferred read length

    Percent identity and query coverage are fractions, not percentages.

    If the segment is unmapped it is returned with all three tags set to 0.
    A mapped segment whose alignment length (or inferred read length) is
    zero or unavailable gets 0.0 for the corresponding ratio instead of
    raising a division error.

    :param alignedSegment: The pysam AlignedSegment object
    :return: alignedSegment: The updated pysam AlignedSegment object
    :raises KeyError: if a mapped segment has no NM tag
    """
    # Assuming that if the id tag is present that the other tags are also there.
    if alignedSegment.has_tag('id'):
        return alignedSegment

    if alignedSegment.is_unmapped:
        alignedSegment.set_tag('id', 0.0, 'f')
        alignedSegment.set_tag('al', 0, 'i')
        alignedSegment.set_tag('qc', 0.0, 'f')
        return alignedSegment

    # First element of get_cigar_stats() holds per-operation base counts;
    # indices 0, 1, 2 are M, I, D. Fetch it once and reuse it.
    op_counts = alignedSegment.get_cigar_stats()[0]
    alnlength = sum(op_counts[0:3])
    query_covered_bases = sum(op_counts[0:2])
    query_length = alignedSegment.infer_read_length()
    mismatches = alignedSegment.get_tag('NM')

    # Guard both denominators: a degenerate CIGAR can make them 0 / None.
    if alnlength:
        percid = (alnlength - mismatches) / float(alnlength)
    else:
        percid = 0.0
    if query_length:
        qcov = query_covered_bases / float(query_length)
    else:
        qcov = 0.0

    alignedSegment.set_tag('id', percid, 'f')
    alignedSegment.set_tag('qc', qcov, 'f')
    alignedSegment.set_tag('al', alnlength, 'i')
    return alignedSegment
def aggregate(self, record: pysam.AlignedSegment):
    """Fold one aligned record into this object's per-column tallies.

    Raises `self.maxMapQ` to the record's mapping quality if that is
    higher, then walks the record with a `CigarIterator` (after
    `skipClipped()`), visiting one entry of `self.cols` per step. Each
    column is a dict keyed by an (operation, base) pair whose values are
    `self.Op` objects. Finally the record's query name is appended to
    `self.members`.
    """
    if record.mapping_quality > self.maxMapQ:
        self.maxMapQ = record.mapping_quality

    # Use the 'OS' tag as the start position when the record has one,
    # otherwise the record's own reference start.
    try:
        start_pos = record.get_tag('OS')  # type: int
    except KeyError:
        start_pos = record.reference_start

    cursor = CigarIterator(record)
    cursor.skipClipped()

    col_idx = start_pos - record.reference_start
    while cursor.valid:
        if col_idx >= len(self.cols):
            # Index is past the known columns: append a new column that
            # holds this observation, including its base quality.
            entry = self.Op(cursor.op, cursor.seqBase)
            entry += cursor.baseQual or 0
            self.cols.append({(entry.op, entry.allele): entry})
        else:
            column = self.cols[col_idx]
            key = (cursor.op, cursor.seqBase)
            entry = column.get(key)
            if entry:
                entry += cursor.baseQual or 0
            else:
                # NOTE(review): unlike the new-column case above, the base
                # quality is not added to a freshly created entry here.
                # Kept as-is; confirm whether that is intended.
                column[key] = self.Op(cursor.op, cursor.seqBase)
        col_idx += 1
        cursor.next()

    self.members.append(record.query_name)
def get_barcode_for_alignment(alignment: pysam.AlignedSegment,
                              tags: List[str],
                              raise_missing: bool) -> str:
    """
    Look up the barcode of an alignment.

    :param alignment: pysam.AlignedSegment
        An Alignment from pysam.
    :param tags: List[str]
        Candidate tags, tried in order. The value of the first tag that
        is present on the alignment is returned.
    :param raise_missing: bool
        If True, raise a RuntimeError when no barcode is found.
    :return: str
        The barcode, or None if none of the tags is present and
        raise_missing is False.
    """
    barcode = None
    for candidate in tags:
        # A missing tag is expected to be rare, so ask forgiveness rather
        # than checking for the tag first.
        try:
            barcode = alignment.get_tag(candidate)
        except KeyError:
            continue
        # Found one; later tags are not consulted.
        break

    if barcode is None and raise_missing:
        raise RuntimeError(
            "Alignment encountered that is missing {} tag(s).".format(tags))
    return barcode
def get_tag_or_default(alignment: pysam.AlignedSegment,
                       tag_key: str,
                       default: Optional[str] = None) -> Optional[str]:
    """Return the value of tag `tag_key` on `alignment`.

    If the tag lookup raises KeyError (tag not present), `default` is
    returned instead.
    """
    try:
        value = alignment.get_tag(tag_key)
    except KeyError:
        return default
    return value