Exemple #1
0
def merge_annotated_clusters(
        biggest: pysam.AlignedSegment,
        other: pysam.AlignedSegment) -> pysam.AlignedSegment:
    """Fold the smaller annotated cluster into the larger one.

    Both arguments are annotated aligned segments that each stand for a
    cluster. ``biggest`` is updated in place: its cluster ID gets a trailing
    "+" (unless it already ends with one) and its read count becomes the sum
    of both clusters' read counts. ``other`` is only read from.

    Args:
        biggest: The larger of the 2 clusters, with a higher read number.
        other: The smaller of the 2 clusters, with a lower read number.

    Returns:
        ``biggest``, now representing the merged cluster.
    """
    cluster_id = biggest.get_tag(CLUSTER_ID_TAG)
    flagged_id = cluster_id if cluster_id.endswith("+") else cluster_id + "+"
    # The ID tag is rewritten even when the ID already carried the "+".
    biggest.set_tag(CLUSTER_ID_TAG, flagged_id, "Z")

    reads_in_biggest = biggest.get_tag(NUM_READS_TAG)
    reads_in_other = other.get_tag(NUM_READS_TAG)
    biggest.set_tag(NUM_READS_TAG, reads_in_biggest + reads_in_other, "i")

    return biggest
def check_read_quality(sam_record: pysam.AlignedSegment, run_info):
    """Evaluate one SAM read against the run's quality thresholds.

    Args:
        sam_record: The read to evaluate. Its 'RG' tag is used as the dataset
            name.
        run_info: Object providing the ``min_length``, ``min_coverage`` and
            ``min_identity`` thresholds.

    Returns:
        A seven-item list:
        ``[dataset, read name, passed, mapped, read length, coverage,
        identity]``. ``passed`` is 1 only when every check succeeded,
        ``mapped`` is 0 only when the SAM flag is neither 0 nor 16, and
        coverage/identity are the string "NA" when the read was rejected
        before they were computed.

    Raises:
        ValueError: If a read that passed the flag and length checks has no
            'MD' tag.
    """
    dataset = sam_record.get_tag('RG')
    name = sam_record.query_name
    length = sam_record.query_length

    def report(passed, mapped, coverage="NA", identity="NA"):
        # Every outcome shares the same leading fields and layout.
        return [dataset, name, passed, mapped, length, coverage, identity]

    # Anything other than a plain forward (0) or reverse (16) flag is rejected.
    if sam_record.flag not in (0, 16):
        return report(0, 0)

    # Reads shorter than the length threshold are rejected.
    if length < run_info.min_length:
        return report(0, 1)

    # The MD tag is required to compute the alignment identity.
    try:
        md_tag = sam_record.get_tag('MD')
    except KeyError:
        raise ValueError("SAM transcript %s lacks an MD tag" % name)

    coverage = compute_alignment_coverage(sam_record.cigarstring)
    identity = compute_alignment_identity(md_tag, sam_record.query)

    # Falling below either cutoff fails the read; the computed values are
    # still reported.
    below_cutoff = (coverage < run_info.min_coverage
                    or identity < run_info.min_identity)
    return report(0 if below_cutoff else 1, 1, coverage, identity)
Exemple #3
0
def parse_chromium_bamread_metadata(alignment: pysam.AlignedSegment):
    """
    Return ``(read name, cell barcode, UMI)`` for a 10x Chromium BAM read.

    The cell barcode is taken from the 'CB' tag and the UMI from the 'UB'
    tag, which hold the error-corrected values according to:
    'https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/output/bam'

    A missing tag is not handled here; the error from ``get_tag`` propagates.
    """
    corrected_barcode, corrected_umi = (
        alignment.get_tag(tag) for tag in ("CB", "UB"))
    return alignment.query_name, corrected_barcode, corrected_umi
Exemple #4
0
    def from_aligned_segment(cls,
                             align: pysam.AlignedSegment) -> "AlignmentRecord":
        """Build a record from a pysam aligned segment.

        The query name must have the form
        ``<read_name>:<read_idx>:<align_idx>`` with integer indices.
        Unmapped segments get placeholder coordinates (chrom "NULL",
        start/end 0, score 0); mapped segments are classified as "secondary",
        "supplementary" or "primary". The 'HP', 'PS' and 'PC' tags, when
        present, are passed on as integer ``haplotype``, ``phase_set`` and
        ``phase_qual`` keyword arguments.
        """
        name_part, read_part, align_part = align.query_name.split(":")
        read_number, align_number = int(read_part), int(align_part)

        if not align.is_unmapped:
            chrom = align.reference_name
            start = align.reference_start
            end = align.reference_end
            read_length = align.infer_read_length()
            align_score = align.get_tag("AS")
            # Mapped reads are scored on the aligned part of the query only.
            base_qscore = mean_qscore(
                np.array(align.query_alignment_qualities))
            if align.is_secondary:
                category = "secondary"
            else:
                category = ("supplementary"
                            if align.is_supplementary else "primary")
        else:
            category = "unmapped"
            chrom, start, end, align_score = "NULL", 0, 0, 0
            read_length = align.query_length
            quals = align.query_qualities
            # TODO: handle this more gracefully
            # Without qualities the base q-score falls back to 0.
            base_qscore = 0 if quals is None else mean_qscore(np.array(quals))

        # Phasing tags are optional; only the ones present are forwarded.
        phasing = {
            field: int(align.get_tag(tag))
            for field, tag in (("haplotype", "HP"), ("phase_set", "PS"),
                               ("phase_qual", "PC"))
            if align.has_tag(tag)
        }

        return cls(
            read_idx=read_number,
            align_idx=align_number,
            align_type=category,
            chrom=chrom,
            start=start,
            end=end,
            strand=not align.is_reverse,
            read_name=name_part,
            read_length=read_length,
            read_start=align.query_alignment_start,
            read_end=align.query_alignment_end,
            mapping_quality=align.mapq,
            align_score=align_score,
            align_base_qscore=np.rint(base_qscore),
            **phasing,
        )
Exemple #5
0
def get_qc_fail(rec: pysam.AlignedSegment) -> Optional[Tuple[str, str]]:
    """Returns the tool and reason recorded for a QC-failed record.

    None is returned when the QC fail flag is not set, and also when the flag is set but the
    tool SAM tag is absent.  Check `rec.is_qcfail` directly to find out whether the record is
    simply QC failed.

    Args:
        rec: the record to inspect

    Returns:
        A `(tool, reason)` tuple read from the QC-fail tool and reason tags, or None.
    """
    if rec.is_qcfail and rec.has_tag(QcFailToolTag):
        # Only the tool tag is checked up front; the reason tag is read unconditionally.
        return (rec.get_tag(QcFailToolTag), rec.get_tag(QcFailReasonTag))
    return None
def get_introns(sam_record: pysam.AlignedSegment, start, cigar):
    """ Locates the jI field of a SAM record, or computes it from the CIGAR
        string and start position if the record has no jI tag.
        Note that positions refer to start and endpoints of introns, not exons,
        so adjustments are needed to avoid an off-by-one error if you want exons.

        Example jI strings:
            no introns: jI:B:i,-1
            two introns: jI:B:i,167936516,167951806,167951862,167966628
        Args:
            sam_record: a pysam AlignedSegment
            start: The start position of the transcript with respect to the
            forward strand
            cigar: SAM CIGAR string describing match operations to the reference
            genome
        Returns:
            intron_list: intron starts and ends in a list (sorted order).
            An empty list is returned when there are no introns, i.e. when
            the jI values are empty or start with -1.
    """
    try:
        jI_tag = sam_record.get_tag("jI")
    except KeyError:
        # No jI tag on the record: derive it and drop the "jI:B:i" prefix.
        jI = compute_jI(start, cigar)
        intron_list = [int(x) for x in jI.split(",")[1:]]
    else:
        intron_list = jI_tag.tolist()

    # An empty list must be checked first so that indexing cannot fail.
    if not intron_list or intron_list[0] == -1:
        return []
    return intron_list
Exemple #7
0
def add_tags(alignedSegment: pysam.AlignedSegment) -> pysam.AlignedSegment:
    """ Takes an AlignedSegment and adds percent identity, alignment length and
    query coverage as tags ('id', 'al' and 'qc').
    alignment length = matched (M, = and X) + inserted (I) + deleted (D) bases
    mismatches = NM
    percent identity = (alignment length - NM) / alignment length
    query coverage = (matched + inserted bases) / inferred read length
    The percent identity is a value between 0.0 and 1.0
    If the segment is unmapped then it is returned with a percent identity of 0,
    an alignment length of 0 and a query coverage of 0.
    A mapped segment with no aligned bases gets a percent identity of 0, and one
    whose read length cannot be inferred (or is 0) gets a query coverage of 0,
    instead of failing on the division.
    A segment that already carries the 'id' tag is returned untouched.
    :param alignedSegment: The pysam AlignedSegment object
    :return: alignedSegment: The updated pysam AlignedSegment object
    """

    # Assuming that if the id tag is present that the other tags are also there.
    if alignedSegment.has_tag('id'):
        return alignedSegment
    if alignedSegment.is_unmapped:
        alignedSegment.set_tag('id', 0.0, 'f')
        alignedSegment.set_tag('al', 0, 'i')
        alignedSegment.set_tag('qc', 0.0, 'f')
        return alignedSegment

    # Base counts per CIGAR operation, indexed by BAM op code:
    # 0=M, 1=I, 2=D, 7='=', 8=X. Aligners that emit extended CIGARs report
    # matches/mismatches as '='/X instead of M, so all three count as matched.
    op_bases = alignedSegment.get_cigar_stats()[0]
    matched = op_bases[0] + op_bases[7] + op_bases[8]
    inserted, deleted = op_bases[1], op_bases[2]

    alnlength = matched + inserted + deleted
    query_covered_bases = matched + inserted

    query_length = alignedSegment.infer_read_length()
    mismatches = alignedSegment.get_tag('NM')

    # Guard both divisions: the denominators can be 0 (or None for the
    # inferred read length) on degenerate records.
    percid = (alnlength - mismatches) / float(alnlength) if alnlength else 0.0
    qcov = query_covered_bases / float(query_length) if query_length else 0.0

    alignedSegment.set_tag('id', percid, 'f')
    alignedSegment.set_tag('qc', qcov, 'f')
    alignedSegment.set_tag('al', alnlength, 'i')
    return alignedSegment
 def aggregate(self, record: pysam.AlignedSegment):
     """Fold one aligned record into this object's per-position columns.

     Raises ``self.maxMapQ`` to the record's mapping quality when that is
     higher, then walks the record with a ``CigarIterator``. For each step
     the entry keyed by ``(op, seqBase)`` in the matching column of
     ``self.cols`` is created or updated. Finally the record's query name
     is appended to ``self.members``.

     Args:
         record: The aligned segment to add.
     """
     # Track the highest mapping quality seen so far.
     if self.maxMapQ < record.mapping_quality:
         self.maxMapQ = record.mapping_quality
     # Prefer the 'OS' tag as the start position; fall back to the record's
     # own reference start when the tag is absent.
     try:
         startPos = record.get_tag('OS')  # type: int
     except KeyError:
         startPos = record.reference_start
     recordItr = CigarIterator(record)
     # Presumably advances past leading clipped operations - confirm against
     # CigarIterator.
     recordItr.skipClipped()
     # Index of the first column to update; 0 when the 'OS' tag is absent.
     # NOTE(review): this is negative if 'OS' is smaller than
     # reference_start, in which case self.cols[i] below would index from
     # the end of the list - confirm that cannot happen.
     i = startPos - record.reference_start
     while recordItr.valid:
         if len(self.cols) <= i:
             # No column at index i yet: start a new one holding a single op
             # seeded with this base's quality (0 when baseQual is falsy).
             # NOTE(review): append() places the column at len(self.cols),
             # which equals i only when i == len(self.cols) - confirm that
             # gaps cannot occur.
             op = self.Op(recordItr.op, recordItr.seqBase)
             op += recordItr.baseQual or 0
             pos = {}
             pos[(op.op, op.allele)] = op
             self.cols.append(pos)
         else:
             # Column exists: look up the entry for this (op, base) pair.
             op = self.cols[i].get((recordItr.op, recordItr.seqBase))
             # NOTE(review): this tests the truthiness of the stored Op, not
             # just its presence, and relies on `+=` mutating the stored
             # object in place - confirm against the Op class.
             if op:
                 op += recordItr.baseQual or 0
             else:
                 # NOTE(review): unlike the new-column branch above, the op
                 # created here does not receive the base quality - confirm
                 # this is intentional.
                 self.cols[i][(recordItr.op, recordItr.seqBase)] = self.Op(
                     recordItr.op, recordItr.seqBase)
         i += 1
         recordItr.next()
     # Remember which reads contributed.
     self.members.append(record.query_name)
Exemple #9
0
def get_barcode_for_alignment(alignment: pysam.AlignedSegment, tags: List[str],
                              raise_missing: bool) -> str:
    """ Look up the barcode of an alignment

    :param alignment: pysam.AlignedSegment
        The alignment to inspect.
    :param tags: List[str]
        Candidate tags, tried in order; the value of the first tag that is present
        on the alignment is returned.
    :param raise_missing: bool
        Whether to raise a RuntimeError when none of the tags is present.
    :return: str
        The barcode, or None when no tag is present and raise_missing is False.
    """
    barcode = None
    for candidate in tags:
        # Missing tags are expected to be rare, so ask forgiveness rather than
        # checking for the tag first.
        try:
            barcode = alignment.get_tag(candidate)
        except KeyError:
            pass  # Not on this alignment, move on to the next candidate
        else:
            break  # First hit wins

    if barcode is None and raise_missing:
        raise RuntimeError(
            "Alignment encountered that is missing {} tag(s).".format(tags))

    return barcode
Exemple #10
0
def get_tag_or_default(alignment: pysam.AlignedSegment,
                       tag_key: str,
                       default: Optional[str] = None) -> Optional[str]:
    """Returns the value stored under `tag_key` on `alignment`, falling back to `default`
    when the lookup raises KeyError (i.e. the tag is not present)."""
    try:
        value = alignment.get_tag(tag_key)
    except KeyError:
        value = default
    return value