def softclip_end_of_alignment_by_query(
    rec: AlignedSegment,
    bases_to_clip: int,
    clipped_base_quality: Optional[int] = None,
    tags_to_invalidate: Iterable[str] = TAGS_TO_INVALIDATE,
) -> ClippingInfo:
    """Soft-clips additional bases at the end of a read's alignment.

    The new clipping is placed before any hard or soft clipping that is already present
    at the end of the read.  For example, clipping a read whose cigar is 100M5S with
    bases_to_clip=10 produces a cigar of 90M15S.

    Unmapped reads, and requests with bases_to_clip < 1, are left untouched.  When the
    request is at least as large as the read's aligned query length, the record is handed
    to `_clip_whole_read` (i.e. the read is unmapped rather than partially clipped).

    Args:
        rec: the BAM record to clip
        bases_to_clip: how many further query/read bases should be clipped
        clipped_base_quality: when not None, the quality assigned to bases in the
            clipped region
        tags_to_invalidate: the extended attributes to remove once clipping is applied

    Returns:
        ClippingInfo: a named tuple holding the number of query/read bases and the number
        of target/reference bases that were clipped.
    """
    # Guard clauses: nothing to clip on an unmapped read or for a non-positive request.
    if rec.is_unmapped:
        return ClippingInfo(0, 0)
    if bases_to_clip < 1:
        return ClippingInfo(0, 0)

    # The request consumes every aligned query base, so the whole read is clipped.
    if rec.query_alignment_length <= bases_to_clip:
        return _clip_whole_read(rec, tags_to_invalidate)

    # `_clip` works from the start of the read, so present it with a mirrored view of
    # both the cigar and the qualities (the qualities are flipped in place).
    mirrored_cigar = Cigar.from_cigartuples(rec.cigartuples).reversed()
    qualities = rec.query_qualities
    qualities.reverse()
    clipped_cigar, info = _clip(
        mirrored_cigar, qualities, bases_to_clip, clipped_base_quality
    )

    # Undo the mirroring before writing the results back onto the record.
    qualities.reverse()
    rec.query_qualities = qualities
    rec.cigarstring = str(clipped_cigar.reversed())

    _cleanup(rec, tags_to_invalidate)
    return info
def _set_length_dependent_fields(
    self,
    rec: pysam.AlignedSegment,
    length: int,
    bases: Optional[str] = None,
    quals: Optional[List[int]] = None,
    cigar: Optional[str] = None,
) -> None:
    """Populates the bases, qualities and cigar of a record.

    Whichever of bases, quals and cigar are supplied must agree on a single length
    (for the cigar, its length on the query).  When none of them is supplied, the
    `length` parameter is used instead.  Any value that is missing (or empty) is
    synthesized at the resolved length: bases via `self._bases`, qualities as a list of
    `self.base_quality`, and the cigar as a single `M` element.  The cigar is only
    written when the record is mapped.

    Args:
        rec: a SAM record
        length: the length to use if all of bases/quals/cigar are None
        bases: an optional string of bases for the read
        quals: an optional list of qualities for the read
        cigar: an optional cigar string for the read

    Raises:
        ValueError: if the supplied bases/quals/cigar imply more than one length
    """
    # Gather the length implied by every argument that was actually provided.
    implied_lengths = set()
    if bases is not None:
        implied_lengths.add(len(bases))
    if quals is not None:
        implied_lengths.add(len(quals))
    if cigar is not None:
        parsed_cigar = sam.Cigar.from_cigarstring(cigar)
        implied_lengths.add(sum(elem.length_on_query for elem in parsed_cigar.elements))

    if len(implied_lengths) == 0:
        # Nothing was provided, so fall back to the explicit length.
        implied_lengths.add(length)
    elif len(implied_lengths) > 1:
        raise ValueError(
            "Provided bases/quals/cigar are not length compatible.")

    # Exactly one length remains at this point.
    (resolved_length,) = implied_lengths

    # Sequence first, then qualities, matching the original assignment order.
    # NOTE(review): pysam is documented to drop qualities when the sequence is assigned,
    # so this order presumably matters -- confirm before reordering.
    if bases:
        rec.query_sequence = bases
    else:
        rec.query_sequence = self._bases(resolved_length)

    if quals:
        rec.query_qualities = quals
    else:
        rec.query_qualities = [self.base_quality] * resolved_length

    # Unmapped records keep whatever cigar they already have.
    if rec.is_unmapped:
        return

    if cigar:
        rec.cigarstring = cigar
    else:
        rec.cigarstring = f"{resolved_length}M"