Beispiel #1
0
    def __encrypt_unmapped(alignment: pysam.AlignedSegment, secret: bytes):
        """
        Apply the symmetric stream cipher to an unmapped alignment, in place.

        The same call performs both directions:
        alignment + secret => encrypted_alignment
        encrypted_alignment + secret => alignment

        Mapped alignments are left untouched. The visible code modifies
        ``alignment`` directly and does not return a value.

        :param alignment: alignment whose query sequence is rewritten
        :param secret: key bytes; hashed together with the query name
        :raises ValueError: if the alignment is unmapped and secret is None
        """
        if not alignment.is_unmapped:
            return

        if secret is None:
            raise ValueError(
                'Secret key must be present when unmapped alignments are iterated.'
            )

        # SHA-512 yields a 64B long hash (original note: encrypts 256 bases)
        digest = hashlib.sha512(secret + alignment.query_name.encode()).digest()
        ciphered_sequence = cmn.stream_cipher(alignment.query_sequence, digest)

        # Read the qualities before the sequence is replaced and write them
        # back afterwards so they are preserved.
        # TODO: maybe something else with the quality?
        saved_qualities = alignment.query_qualities
        alignment.query_sequence = ciphered_sequence
        alignment.query_qualities = saved_qualities
Beispiel #2
0
def check_read_quality(sam_record: pysam.AlignedSegment, run_info):
    """ Run the quality filters on a single SAM read.

    The result is always a 7-element list:
        [dataset, read_ID, passed, flag_ok, read_length, coverage, identity]
    where `dataset` is the RG tag, `passed` is 1 only when every filter
    succeeded, `flag_ok` is 0 only when the SAM flag is neither 0 nor 16, and
    coverage/identity are "NA" when the read was rejected before they were
    computed.

    Raises:
        ValueError: if a read that reaches the coverage/identity step has no
            MD tag.
    """
    dataset = sam_record.get_tag('RG')
    read_ID = sam_record.query_name
    read_length = sam_record.query_length
    aligned_seq = sam_record.query
    cigar_string = sam_record.cigarstring

    # Only use uniquely mapped transcripts
    if sam_record.flag not in (0, 16):
        return [dataset, read_ID, 0, 0, read_length, "NA", "NA"]

    # Reads shorter than the length threshold are rejected
    if read_length < run_info.min_length:
        return [dataset, read_ID, 0, 1, read_length, "NA", "NA"]

    # The MD tag is required to compute the alignment identity
    try:
        md_tag = sam_record.get_tag('MD')
    except KeyError:
        raise ValueError("SAM transcript %s lacks an MD tag" % read_ID)

    coverage = compute_alignment_coverage(cigar_string)
    identity = compute_alignment_identity(md_tag, aligned_seq)

    # The read passes only when both coverage and identity reach the cutoffs
    below_cutoff = (coverage < run_info.min_coverage
                    or identity < run_info.min_identity)
    passed = 0 if below_cutoff else 1
    return [dataset, read_ID, passed, 1, read_length, coverage, identity]
Beispiel #3
0
def parse_chromium_bamread_metadata(alignment: pysam.AlignedSegment):
    """
    Return the read name plus the values of the 'CB' and 'UB' tags
    (error-corrected cell barcode and error-corrected UMI according to
    'https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/output/bam').

    A KeyError from get_tag propagates if either tag is missing.
    """
    corrected_barcode = alignment.get_tag('CB')
    corrected_umi = alignment.get_tag('UB')
    return alignment.query_name, corrected_barcode, corrected_umi
Beispiel #4
0
    def _new_rec(self, name: str, chrom: str, start: int,
                 attrs: Optional[Dict[str, Any]]) -> AlignedSegment:
        """Generates a new AlignedSegment.  Sets the segment up with the correct
        header and adds the RG attribute if not contained in attrs.

        The caller's ``attrs`` dictionary is never modified; tags are assembled
        in a private copy.

        Args:
            name: the name of the read/template
            chrom: the chromosome to which the read is mapped
            start: the start position of the read on the chromosome
            attrs: an optional dictionary of SAM attributes with two-char keys

        Returns:
            AlignedSegment: an aligned segment with name, chrom, pos, attributes the
                read group, and the unmapped flag all set appropriately.

        Raises:
            ValueError: if chrom is neither the "no reference" name nor a
                sequence known to this builder.
        """
        # Compare by value: an identity check on strings only succeeds when both
        # sides happen to be the very same object, which equal strings need not be.
        if chrom != sam.NO_REF_NAME and chrom not in self._seq_lookup:
            raise ValueError(
                f"{chrom} is not a valid chromosome name in this builder.")

        rec = AlignedSegment(header=self._samheader)
        rec.query_name = name
        rec.reference_name = chrom
        rec.reference_start = start
        rec.mapping_quality = self.mapping_quality

        if chrom == sam.NO_REF_NAME or start == sam.NO_REF_POS:
            rec.is_unmapped = True

        # Copy so that adding the default RG does not leak into the caller's dict.
        tags = dict(attrs) if attrs else {}
        if "RG" not in tags:
            tags["RG"] = self.rg_id()
        rec.set_tags(list(tags.items()))
        return rec
Beispiel #5
0
def test_sam_alignment_to_padded_alignment():
    """A 10M1D25M alignment is padded with a gap in the query at the deletion."""
    reference = Reference('test', 'AGCTTAGCTAAGCTACCTATATCTTGGTCTTGGCCG')

    segment = AlignedSegment()
    segment.reference_start = 0
    segment.query_sequence = 'AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG'
    # (0, n) = match, (2, n) = deletion
    segment.cigartuples = ((0,10), (2,1), (0,25))

    padded = sam_alignment_to_padded_alignment(segment, reference)
    (padded_ref, padded_match, padded_query) = padded

    assert padded_ref == 'AGCTTAGCTAAGCTACCTATATCTTGGTCTTGGCCG'
    assert padded_match == '|||||||||| |||||||||||||||||||||||||'
    assert padded_query == 'AGCTTAGCTA-GCTACCTATATCTTGGTCTTGGCCG'
Beispiel #6
0
def set_qc_fail(rec: pysam.AlignedSegment, tool: Callable[..., Any],
                reason: str) -> None:
    """Flags a record as QC-failed and records which tool failed it and why.

    The tool's ``__name__`` and the reason are stored in the QcFailToolTag and
    QcFailReasonTag tags respectively.

    Args:
        rec: the record to fail
        tool: the tool (as a callable) that failed this record
        reason: the reason for failing; must not contain tab characters
    """
    assert '\t' not in reason, f"Reason may not contain tabs: {reason}"
    rec.is_qcfail = True
    annotations = (
        (QcFailToolTag, tool.__name__),
        (QcFailReasonTag, reason),
    )
    for tag_name, tag_value in annotations:
        rec.set_tag(tag_name, tag_value)
Beispiel #7
0
    def from_aligned_segment(cls,
                             align: pysam.AlignedSegment) -> "AlignmentRecord":
        """Build a record from a pysam aligned segment.

        The query name must have the form ``<read_name>:<read_idx>:<align_idx>``
        with integer indices. Unmapped segments get placeholder coordinates and
        score; mapped segments need an "AS" tag. The HP, PS and PC tags are
        copied (as ints) only when present.
        """
        read_name, read_idx_text, align_idx_text = align.query_name.split(":")
        read_idx = int(read_idx_text)
        align_idx = int(align_idx_text)

        if align.is_unmapped:
            align_cat = "unmapped"
            chrom = "NULL"
            start = end = align_score = 0
            read_length = align.query_length
            quals = align.query_qualities
            # TODO: handle this more gracefully
            align_base_qscore = (0 if quals is None
                                 else mean_qscore(np.array(quals)))
        else:
            chrom = align.reference_name
            start = align.reference_start
            end = align.reference_end
            read_length = align.infer_read_length()
            align_score = align.get_tag("AS")
            align_base_qscore = mean_qscore(
                np.array(align.query_alignment_qualities))
            # Secondary takes precedence over supplementary
            if align.is_secondary:
                align_cat = "secondary"
            elif align.is_supplementary:
                align_cat = "supplementary"
            else:
                align_cat = "primary"

        optional_tags = (("haplotype", "HP"), ("phase_set", "PS"),
                         ("phase_qual", "PC"))
        optional = {
            field: int(align.get_tag(tag))
            for field, tag in optional_tags
            if align.has_tag(tag)
        }

        required = dict(
            read_idx=read_idx,
            align_idx=align_idx,
            align_type=align_cat,
            chrom=chrom,
            start=start,
            end=end,
            strand=not align.is_reverse,
            read_name=read_name,
            read_length=read_length,
            read_start=align.query_alignment_start,
            read_end=align.query_alignment_end,
            mapping_quality=align.mapq,
            align_score=align_score,
            align_base_qscore=np.rint(align_base_qscore),
        )
        return cls(**required, **optional)
Beispiel #8
0
def breakpoint_pos(read: pysam.AlignedSegment, orient: str = ORIENT.NS) -> int:
    """
    Compute the breakpoint position implied by a read's soft-clipping.

    When no orientation is given (ORIENT.NS) the side is chosen from the
    clipping itself: the left side is used if only the left end is
    soft-clipped, or if both ends are and the left clip is strictly longer;
    otherwise the right side is used.

    Args:
        read: the read object
        orient: the orientation

    Returns:
        ``read.reference_start`` for ORIENT.RIGHT, ``read.reference_end - 1``
        otherwise

    Raises:
        AttributeError: if the read has no soft-clipping at either end, or not
            on the end required by the orientation
    """
    first_op, first_len = read.cigar[0]
    last_op, last_len = read.cigar[-1]
    ORIENT.enforce(orient)

    clipped_at_start = first_op == CIGAR.S
    clipped_at_end = last_op == CIGAR.S

    if not (clipped_at_start or clipped_at_end):
        raise AttributeError(
            'cannot compute breakpoint for a read without soft-clipping', read.cigar
        )

    if orient == ORIENT.NS:
        # soft clipped to the left => RIGHT, soft clipped to the right => LEFT
        prefer_start = clipped_at_start and (
            not clipped_at_end or first_len > last_len
        )
        orient = ORIENT.RIGHT if prefer_start else ORIENT.LEFT

    if orient == ORIENT.RIGHT:
        if not clipped_at_start:
            raise AttributeError(
                'soft clipping doesn\'t support input orientation for a breakpoint',
                repr(orient),
                read.cigar,
                read.get_tags(),
            )
        return read.reference_start

    if not clipped_at_end:
        raise AttributeError(
            'soft clipping doesn\'t support input orientation for a breakpoint',
            orient,
            read.cigar,
            read.get_tags(),
        )
    return read.reference_end - 1
Beispiel #9
0
def get_qc_fail(rec: pysam.AlignedSegment) -> Optional[Tuple[str, str]]:
    """Returns the (tool, reason) pair recorded for a QC-failed record, or None.

    None is returned when the QC fail flag is not set, and also when the flag is
    set but the tool tag is absent.  Check ``rec.is_qcfail`` directly to learn
    whether the record is simply QC failed.

    Args:
        rec: the record to inspect
    """
    if rec.is_qcfail and rec.has_tag(QcFailToolTag):
        return (rec.get_tag(QcFailToolTag), rec.get_tag(QcFailReasonTag))
    return None
Beispiel #10
0
def softclip_end_of_alignment_by_query(
        rec: AlignedSegment,
        bases_to_clip: int,
        clipped_base_quality: Optional[int] = None,
        tags_to_invalidate: Iterable[str] = TAGS_TO_INVALIDATE
) -> ClippingInfo:
    """
    Soft-clips additional bases at the end of a read's alignment.

    Clipping is applied before any existing hard or soft clipping.  E.g. a read with cigar 100M5S
    that is clipped with bases_to_clip=10 will yield a cigar of 90M15S.

    Nothing happens for unmapped reads or when bases_to_clip < 1.  When the request covers
    every aligned query base (or more), the whole read is handed to _clip_whole_read.

    Args:
        rec: the BAM record to clip
        bases_to_clip: the number of additional bases of clipping desired in the read/query
        clipped_base_quality: if not None, set bases in the clipped region to this quality
        tags_to_invalidate: the set of extended attributes to remove upon clipping

    Returns:
        ClippingInfo: a named tuple containing the number of query/read bases and the number
            of target/reference bases clipped.
    """
    nothing_to_do = rec.is_unmapped or bases_to_clip < 1
    if nothing_to_do:
        return ClippingInfo(0, 0)

    if rec.query_alignment_length <= bases_to_clip:
        return _clip_whole_read(rec, tags_to_invalidate)

    # Work on the mirrored cigar/qualities so that the end of the read can be
    # clipped with the clip-from-start routine.
    mirrored_cigar = Cigar.from_cigartuples(rec.cigartuples).reversed()
    qualities = rec.query_qualities
    qualities.reverse()
    clipped_cigar, info = _clip(mirrored_cigar, qualities, bases_to_clip,
                                clipped_base_quality)

    # Undo the mirroring before writing the results back to the record
    qualities.reverse()
    rec.query_qualities = qualities
    rec.cigarstring = str(clipped_cigar.reversed())

    _cleanup(rec, tags_to_invalidate)
    return info
Beispiel #11
0
def get_barcode_for_alignment(alignment: pysam.AlignedSegment, tags: List[str],
                              raise_missing: bool) -> str:
    """ Look up the barcode of an alignment

    :param alignment: pysam.AlignedSegment
        An Alignment from pysam.
    :param tags: List[str]
        Candidate tags, tried in order; the value of the first tag present on the
        alignment is returned.
    :param raise_missing: bool
        Raise a RuntimeError if none of the tags is present.
    :return: str
        The barcode, or None if no tag was found and raise_missing is False.
    """
    barcode = None
    for candidate in tags:
        # A missing tag is expected to be rare, so rely on the exception
        # instead of testing for the tag first.
        try:
            barcode = alignment.get_tag(candidate)
        except KeyError:
            continue  # fall through to the next candidate tag
        break  # first tag found wins

    if barcode is None and raise_missing:
        raise RuntimeError(
            "Alignment encountered that is missing {} tag(s).".format(tags))

    return barcode
 def aggregate(self, record: pysam.AlignedSegment):
     """Fold one record into the per-column operation tallies held in self.cols.

     Updates self.maxMapQ with the record's mapping quality when it is larger,
     walks the record with a CigarIterator (after skipClipped()), and for each
     step either appends a new column or updates the entry keyed by
     (op, base) in the existing column. Finally the record's query name is
     appended to self.members.
     """
     # Keep the largest mapping quality seen across aggregated records.
     if self.maxMapQ < record.mapping_quality:
         self.maxMapQ = record.mapping_quality
     # Prefer the 'OS' tag as the start position; fall back to reference_start.
     try:
         startPos = record.get_tag('OS')  # type: int
     except KeyError:
         startPos = record.reference_start
     recordItr = CigarIterator(record)
     recordItr.skipClipped()
     # Column index of the first step, offset by the difference between the
     # chosen start position and the record's reference_start.
     i = startPos - record.reference_start
     while recordItr.valid:
         if len(self.cols) <= i:
             # No column at index i yet: append a new one holding this op.
             op = self.Op(recordItr.op, recordItr.seqBase)
             op += recordItr.baseQual or 0
             pos = {}
             pos[(op.op, op.allele)] = op
             self.cols.append(pos)
         else:
             op = self.cols[i].get((recordItr.op, recordItr.seqBase))
             # NOTE(review): this branch relies on Op truthiness and on `+=`
             # updating the stored object in place — confirm against Op.
             if op:
                 op += recordItr.baseQual or 0
             else:
                 # NOTE(review): unlike the append branch above, the new Op is
                 # stored without adding baseQual — confirm this is intended.
                 self.cols[i][(recordItr.op, recordItr.seqBase)] = self.Op(
                     recordItr.op, recordItr.seqBase)
         i += 1
         recordItr.next()
     self.members.append(record.query_name)
Beispiel #13
0
def get_introns(sam_record: pysam.AlignedSegment, start, cigar):
    """ Locates the jI field in a list of SAM fields or computes
        it from the CIGAR string and start position if it isn't found.
        Note that positions refer to start and endpoints of introns, not exons,
        so adjustments are needed to avoid an off-by-one error if you want exons.

        Example jI strings:
            no introns: jI:B:i,-1
            two introns: jI:B:i,167936516,167951806,167951862,167966628
        Args:
            sam_record: a pysam AlignedSegment
            start: The start position of the transcript with respect to the
            forward strand
            cigar: SAM CIGAR string describing match operations to the reference
            genome
        Returns:
            intron_list: intron starts and ends in a list (sorted order).
            An empty list is returned when there are no introns, i.e. when
            the jI values start with the -1 sentinel or are empty.
    """
    try:
        intron_list = sam_record.get_tag("jI").tolist()
    except KeyError:
        # No jI tag on the record: derive it and drop the leading "jI:B:i"
        # field before converting the remaining values to ints.
        jI = compute_jI(start, cigar)
        intron_list = [int(x) for x in jI.split(",")[1:]]

    # Guard against an empty list before looking at the first element, so an
    # empty jI is reported as "no introns" instead of raising IndexError.
    if not intron_list or intron_list[0] == -1:
        return []
    return intron_list
    def test_recordShouldBeFilteredOut_maskSpannedByRecordReturnsTrue(self, *mock):
        # NOTE(review): the record is empty, so the outcome presumably comes
        # from the patches passed in via *mock — confirm against the decorators.
        tree = IntervalTree([Interval(10, 20, "chrom1")])
        masker = Masker(tree=tree)
        record = AlignedSegment()

        expected = True
        actual = masker.record_should_be_filtered_out(record)

        assert actual == expected
    def test_recordShouldBeFilteredOut_recordDoesNotOverlapReturnsFalse(self, *mock):
        # NOTE(review): the record is empty, so the outcome presumably comes
        # from the patches passed in via *mock — confirm against the decorators.
        tree = IntervalTree([Interval(10, 20, "chrom1")])
        masker = Masker(tree=tree)
        record = AlignedSegment()

        expected = False
        actual = masker.record_should_be_filtered_out(record)

        assert actual == expected
Beispiel #16
0
 def assess_alignment(alignment: pysam.AlignedSegment, alignment_info: Dict):
     """ Compare an alignment against the expected (reference) alignment.

     Returns a pair: whether the reference name equals alignment_info['chrom'],
     and the number of aligned reference positions lying within
     [alignment_info['start'], alignment_info['end']] divided by
     len(alignment_info['cigar']).
     """
     ref_positions = np.array(alignment.get_reference_positions(full_length=False))
     # flag the aligned reference positions that fall inside the expected span
     within_expected = np.logical_and(ref_positions >= alignment_info['start'],
                                      ref_positions <= alignment_info['end'])
     matching_prop = sum(within_expected) / len(alignment_info['cigar'])
     return alignment.reference_name == alignment_info['chrom'], matching_prop
def is_snp_called_correctly(record: pysam.AlignedSegment) -> bool:
    """Report whether the reference base paired at reference position 100 is not lower-case.

    NOTE(review): pysam presumably lower-cases mismatching reference bases when
    ``with_seq=True`` — confirm against the pysam documentation.
    If no aligned pair has reference position 100 the function falls off the
    end and returns None.
    """
    aligned_pairs = record.get_aligned_pairs(with_seq=True)
    for _query_pos, ref_pos, ref_base in aligned_pairs:
        if ref_pos != 100:
            continue
        return not ref_base.islower()
Beispiel #18
0
def merge_annotated_clusters(
        biggest: pysam.AlignedSegment,
        other: pysam.AlignedSegment) -> pysam.AlignedSegment:
    """Folds a smaller annotated cluster into a larger one.

    Both segments represent clusters. The larger one is updated in place: its
    cluster id gets a trailing "+" (unless it already ends with one) and its
    read count becomes the sum of both clusters' read counts.

    Args:
        biggest: The larger of the 2 clusters, with a higher read number.
        other: The smaller of the 2 clusters, with a lower read number.

    Returns:
        The annotated aligned segment representing the merged cluster
        (the same object as `biggest`).
    """
    cluster_id = biggest.get_tag(CLUSTER_ID_TAG)
    marker = "" if cluster_id.endswith("+") else "+"
    biggest.set_tag(CLUSTER_ID_TAG, cluster_id + marker, "Z")

    reads_in_biggest = biggest.get_tag(NUM_READS_TAG)
    reads_in_other = other.get_tag(NUM_READS_TAG)
    biggest.set_tag(NUM_READS_TAG, reads_in_biggest + reads_in_other, "i")

    return biggest
Beispiel #19
0
    def _set_length_dependent_fields(
        self,
        rec: pysam.AlignedSegment,
        length: int,
        bases: Optional[str] = None,
        quals: Optional[List[int]] = None,
        cigar: Optional[str] = None,
    ) -> None:
        """Populates the bases, qualities and cigar of a record.

        Whichever of bases, quals and cigar are supplied must agree on a single
        length (for the cigar, its length on the query).  When none is supplied
        the length parameter is used.  Missing values are synthesized at that
        length.  The cigar is only written for mapped records.

        Args:
            rec: a SAM record
            length: the length to use if all of bases/quals/cigar are None
            bases: an optional string of bases for the read
            quals: an optional list of qualities for the read
            cigar: an optional cigar string for the read

        Raises:
            ValueError: if the supplied values imply different lengths
        """
        # Collect the length implied by every supplied value
        implied = []
        if bases is not None:
            implied.append(len(bases))
        if quals is not None:
            implied.append(len(quals))
        if cigar is not None:
            parsed = sam.Cigar.from_cigarstring(cigar)
            implied.append(sum(elem.length_on_query for elem in parsed.elements))

        distinct = set(implied) or {length}
        if len(distinct) != 1:
            raise ValueError(
                "Provided bases/quals/cigar are not length compatible.")
        (length,) = distinct

        # Write the record, synthesizing whatever was not supplied
        rec.query_sequence = bases or self._bases(length)
        rec.query_qualities = quals or [self.base_quality] * length
        if not rec.is_unmapped:
            rec.cigarstring = cigar or f"{length}M"
def record_contains_expected_snp(record: pysam.AlignedSegment) -> bool:
    """Check the base at query position REF_PANEL_FLANK_WIDTH against the query name.

    The expected base is the last character of the record's query name; it is
    compared with the reference base that get_aligned_pairs(with_seq=True)
    reports for the first pair at that query position. Returns False when no
    pair has that query position.
    """
    expected_base = record.query_name[-1]
    aligned_pairs = record.get_aligned_pairs(with_seq=True)

    for query_pos, _ref_pos, ref_base in aligned_pairs:
        if query_pos != REF_PANEL_FLANK_WIDTH:
            continue
        return expected_base == ref_base

    return False
Beispiel #21
0
def get_tag_or_default(alignment: pysam.AlignedSegment,
                       tag_key: str,
                       default: Optional[str] = None) -> Optional[str]:
    """Returns the value of tag `tag_key` on `alignment`, or `default` when
    looking the tag up raises KeyError."""
    try:
        value = alignment.get_tag(tag_key)
    except KeyError:
        value = default
    return value
Beispiel #22
0
def set_pair_info(r1: AlignedSegment,
                  r2: AlignedSegment,
                  proper_pair: bool = True) -> None:
    """Rewrites the mate-pair information on both reads of a pair. Both r1 and
    r2 must be mapped.  Works for reads that already carry pairing flags as well
    as for independent R1 and R2 records currently flagged as SE reads.

    Args:
        r1: read 1
        r2: read 2 with the same queryname as r1
        proper_pair: value written to the proper-pair flag of both reads
    """
    assert not r1.is_unmapped, f"Cannot process unmapped mate {r1.query_name}/1"
    assert not r2.is_unmapped, f"Cannot process unmapped mate {r2.query_name}/2"
    assert r1.query_name == r2.query_name, f"Attempting to pair reads with different qnames."

    # Pairing flags: r1 becomes read 1, r2 becomes read 2
    for rec, is_first in ((r1, True), (r2, False)):
        rec.is_paired = True
        rec.is_proper_pair = proper_pair
        rec.is_read1 = is_first
        rec.is_read2 = not is_first

    # Copy each read's alignment details into its mate's "next"/mate fields
    for rec, mate in ((r2, r1), (r1, r2)):
        rec.next_reference_id = mate.reference_id
        rec.next_reference_start = mate.reference_start
        rec.mate_is_reverse = mate.is_reverse
        rec.mate_is_unmapped = False
        rec.set_tag("MC", mate.cigarstring)

    template_len = isize(r1, r2)
    r1.template_length = template_len
    r2.template_length = -template_len
Beispiel #23
0
 def select_snps_from_single_read(
         read: pysam.AlignedSegment,
         snp_positions: Set[int],
         region_start: int
 ) -> Tuple[List[int], List[str]]:
     """Collect the read's bases at the requested SNP positions.

     Reference positions are shifted by ``region_start`` before being looked up
     in ``snp_positions``. Returns two parallel lists: the shifted positions
     that were hit and the corresponding bases passed through SNP.process_nucl.
     """
     # TODO use indels
     bases = read.query_sequence
     hits = [
         (ref_pos - region_start, query_pos)
         for query_pos, ref_pos in read.get_aligned_pairs(matches_only=True)
         if ref_pos - region_start in snp_positions
     ]
     positions = [relative_pos for relative_pos, _ in hits]
     nucls = [SNP.process_nucl(bases[query_pos]) for _, query_pos in hits]
     return positions, nucls
Beispiel #24
0
def sam_string_to_aligned_segment(sam_string, header=None):
    """Build a pysam AlignedSegment from a correctly formatted SAM line

    When no header is supplied, one is created with a single reference named
    after the third tab-separated field of the line and a length of 100000000.

    :param sam_string: correctly formatted SAM string
    :param header: AlignmentHeader object

    :return AlignedSegment
    """
    header = header or AlignmentHeader.from_references(
        [sam_string.split("\t")[2]], [100000000])
    return AlignedSegment.fromstring(sam_string, header)
Beispiel #25
0
    def _set_flags(self, rec: pysam.AlignedSegment, is_r1: bool,
                   strand: str) -> None:
        """Sets the pairing, QC and strand flags on the given read.

        The read is marked as paired, as R1 or R2, and as neither QC-failed,
        duplicate, secondary nor supplementary.  The reverse flag is only
        written for mapped reads.

        Args:
            rec: the read to set the flags on
            is_r1: True if the read is a R1, False if it is an R2
            strand: Either "+" or "-" to indicate strand of the read
        """
        rec.is_paired = True
        rec.is_read1 = is_r1
        rec.is_read2 = not is_r1

        cleared_flags = ("is_qcfail", "is_duplicate", "is_secondary",
                         "is_supplementary")
        for flag_name in cleared_flags:
            setattr(rec, flag_name, False)

        if rec.is_unmapped:
            return
        # anything other than "+" is treated as the reverse strand
        rec.is_reverse = strand != "+"
Beispiel #26
0
def _read_pos_at_ref_pos(rec: AlignedSegment,
                         ref_pos: int,
                         previous: Optional[bool] = None) -> Optional[int]:
    """
    Looks up the read (query) position aligned to a reference position.

    A reference position outside the reference span of the read raises an
    exception.  A position inside the span that has no aligned read base (i.e. it is
    deleted in the read) is resolved according to the "previous" argument.

    Args:
        rec: the AlignedSegment within which to find the read position
        ref_pos: the reference position to be found
        previous: Controls behavior when the reference position is not aligned to any
            read position.  True indicates to return the previous read position, False
            indicates to return the next read position and None indicates to return None.

    Returns:
        The read position at the reference position, or None.

    Raises:
        ValueError: if ref_pos lies outside [reference_start, reference_end)
    """
    if not (rec.reference_start <= ref_pos < rec.reference_end):
        raise ValueError(
            f"{ref_pos} is not within the reference span for read {rec.query_name}"
        )

    pairs = rec.get_aligned_pairs()

    # Find the first pair carrying the requested reference position
    hit_index = len(pairs)
    query_pos = None
    for i, (pair_query, pair_ref) in enumerate(pairs):
        if pair_ref == ref_pos:
            hit_index = i
            query_pos = pair_query
            break

    if query_pos is None and previous is not None:
        # Walk away from the hit until a pair with a read position is found
        if previous:
            candidates = reversed(pairs[:hit_index])
        else:
            candidates = pairs[hit_index:]
        query_pos = next(
            (pair[0] for pair in candidates if pair[0] is not None), None)

    return query_pos
Beispiel #27
0
    def run(self):
        """Consume self.iterator and write every non-empty result.

        For each (read, result) pair the sequence is written either as FASTQ
        (mode 'wfq') or as a SAM record to self.output, a row is appended to
        the summary file, and (read_id, samples) is appended to self.log.
        Results with an empty sequence are skipped with a warning.
        """
        with CSVLogger(summary_file(), sep='\t') as summary:
            for read, res in self.iterator:

                seq = res['sequence']
                qstring = res.get('qstring', '*')
                mean_qscore = res.get('mean_qscore', mean_qscore_from_qstring(qstring))
                mapping = res.get('mapping', False)
                mods_tags = res.get('mods', [])

                # In duplex mode `read` is indexed as a pair and the two read
                # ids / sample counts are combined.
                if self.duplex:
                    samples = len(read[0].signal) + len(read[1].signal)
                    read_id = '%s;%s' % (read[0].read_id, read[1].read_id)
                else:
                    samples = len(read.signal)
                    read_id = read.read_id

                # NOTE(review): `read.run_id` / `read.tagdata()` are accessed on
                # `read` itself even in duplex mode, where `read` is indexed as
                # a pair above — confirm the duplex object provides them.
                tags = [
                    f'RG:Z:{read.run_id}_{self.group_key}',
                    f'qs:i:{round(mean_qscore)}',
                    *read.tagdata(),
                    *mods_tags,
                ]

                if len(seq):
                    if self.mode == 'wfq':
                        write_fastq(read_id, seq, qstring, fd=self.fd, tags=tags)
                    else:
                        self.output.write(
                            AlignedSegment.fromstring(
                                sam_record(read_id, seq, qstring, mapping, tags=tags),
                                self.output.header
                            )
                        )
                    if self.duplex:
                        summary.append(duplex_summary_row(read[0], read[1], len(seq), mean_qscore, alignment=mapping))
                    else:
                        summary.append(summary_row(read, len(seq), mean_qscore, alignment=mapping))

                    self.log.append((read_id, samples))

                else:
                    # `warn` is a deprecated alias of `warning` in the logging module
                    logger.warning("> skipping empty sequence %s", read_id)
Beispiel #28
0
 def _ref_pos2seq_pos(alignment: pysam.AlignedSegment, ref_pos: int) -> int:
     """
     Find the position in the query sequence that is aligned to a reference position.
     Alignment and ref_pos are assumed to be of the same reference.
     :param alignment: pysam.AlignedSegment
     :param ref_pos: reference position of base
     :return: AlignedSegment.query_sequence position paired with ref_pos.
     None is returned if no pair carries ref_pos (or the first such pair has no
     query position).
     """
     # TODO optimize: (try matches_only=True)
     # TODO optimize: case when alignment is full matched based on CIGAR (e.g. 30M)
     aligned_pairs = alignment.get_aligned_pairs(matches_only=False, with_seq=False)
     # take the query position of the first pair at the requested reference position
     return next(
         (query_pos for query_pos, pair_ref_pos in aligned_pairs
          if pair_ref_pos == ref_pos),
         None,
     )
Beispiel #29
0
def _cleanup(rec: AlignedSegment, tags_to_invalidate: Iterable[str]) -> None:
    """Clears extended tags that clipping may have made invalid.

    Each listed tag is set to None on the record.
    """
    for stale_tag in tags_to_invalidate:
        rec.set_tag(stale_tag, value=None)
Beispiel #30
0
def _make_read_unmapped(rec: AlignedSegment) -> None:
    """Strips the mapping information from a read and flags it as unmapped.

    A reverse-strand read is first flipped: its bases are reverse-complemented
    and its qualities reversed.
    """
    if rec.is_reverse:
        # Fetch the qualities before the sequence is replaced, then write the
        # reversed qualities back afterwards.
        flipped_quals = rec.query_qualities
        flipped_quals.reverse()
        rec.query_sequence = dnautils.reverse_complement(rec.query_sequence)
        rec.query_qualities = flipped_quals
        rec.is_reverse = False

    # Reset position and alignment fields
    rec.reference_id = sam.NO_REF_INDEX
    rec.reference_start = sam.NO_REF_POS
    rec.cigar = None
    rec.mapping_quality = 0
    rec.template_length = 0

    # Clear the flags that only make sense for mapped reads
    for flag_name in ("is_duplicate", "is_secondary", "is_supplementary",
                      "is_proper_pair"):
        setattr(rec, flag_name, False)
    rec.is_unmapped = True
Beispiel #31
0
            AEQ(BZ.hqRegion, VZ.hqRegion)
            AEQ(BZ.insertRegions, VZ.insertRegions)

    def testRead(self):
        """The basecalls of the reads from self.BZ and self.VZ must be equal."""
        bz_read = self.BZ.read()
        vz_read = self.VZ.read()
        bz_basecalls = bz_read.basecalls()
        vz_basecalls = vz_read.basecalls()
        EQ(bz_basecalls, vz_basecalls)


# Mockup some bam records reflecting the internal "pulse BAM" spec
from mock import Mock
from pysam import AlignedSegment
from pbcore.io import BamAlignment
from PRmm.io.ZmwReadStitcherIO import StitchedZmw, FeatureDesc

pulsePeer = AlignedSegment()
pulsePeer.is_unmapped=True
pulsePeer.seq = "GATTACAGATTACA"
pulsePeer.qname = "FakePulseRead"
tags = dict(
    RG="00000000",
    np=1,
    qs=0,
    qe=14,
    rq=0.80,
    sn=[2.0, 3.0, 5.0, 6.0],
    ip=[15]*14,
    pw=[16]*14,
    zm=42,
    cx=2,
    # Now, the pulse stuff