Ejemplo n.º 1
0
 def __lt__(lhs, rhs):
     assert type(lhs) == type(
         rhs), "Cannot compare coordinates of different representations"
     if lhs.uncertain or rhs.uncertain:
         raise HGVSUnsupportedOperationError(
             "Cannot compare coordinates of uncertain positions")
     return lhs.base < rhs.base
Ejemplo n.º 2
0
 def __lt__(lhs, rhs):
     assert type(lhs) == type(rhs), "Cannot compare coordinates of different representations"
     if lhs.uncertain or rhs.uncertain:
         raise HGVSUnsupportedOperationError("Cannot compare coordinates of uncertain positions")
     if lhs.datum == rhs.datum:
         if lhs.base == rhs.base:
             return lhs.offset < rhs.offset
         else:
             if ((rhs.base - lhs.base == 1 and lhs.offset > 0 and rhs.offset < 0)
                     or (lhs.base - rhs.base == 1 and rhs.offset > 0 and lhs.offset < 0)):
                 raise HGVSUnsupportedOperationError(
                     "Cannot compare coordinates in the same intron with one based on end of exon and the other based on start of next exon"
                 )
             else:
                 return lhs.base < rhs.base
     else:
         if lhs.datum == Datum.SEQ_START or rhs.datum == Datum.SEQ_START:
             raise HGVSUnsupportedOperationError(
                 "Cannot compare coordinates of datum SEQ_START with CDS_START or CDS_END")
         else:
             return lhs.datum < rhs.datum
Ejemplo n.º 3
0
    def _replace_reference(self, var):
        """fetch reference sequence for variant and update (in-place) if necessary"""

        if var.type not in "cgmnr":
            raise HGVSUnsupportedOperationError("Can only update references for type c, g, m, n, r")

        if var.posedit.edit.type in ("ins", "con"):
            # these types have no reference sequence (zero-width), so return as-is
            return var

        pos = var.posedit.pos
        if ((isinstance(pos.start, hgvs.location.BaseOffsetPosition) and pos.start.offset != 0)
                or (isinstance(pos.end, hgvs.location.BaseOffsetPosition) and pos.end.offset != 0)):
            _logger.info("Can't update reference sequence for intronic variant {}".format(var))
            return var

        # For c. variants, we need coords on underlying sequences
        if var.type == "c":
            mapper = self._fetch_AlignmentMapper(
                tx_ac=var.ac, alt_ac=var.ac, alt_aln_method="transcript")
            pos = mapper.c_to_n(var.posedit.pos)
        else:
            pos = var.posedit.pos

        seq_start = pos.start.base - 1
        seq_end = pos.end.base
        
        # When strict_bounds is False and an error occurs, return
        # variant as-is

        if seq_start < 0:
            # this is an out-of-bounds variant
            return var

        seq = self.hdp.get_seq(var.ac, seq_start, seq_end)

        if len(seq) != seq_end - seq_start:
            # tried to read beyond seq end; this is an out-of-bounds variant
            return var

        edit = var.posedit.edit
        if edit.ref != seq:
            _logger.debug("Replaced reference sequence in {var} with {seq}".format(
                var=var, seq=seq))
            edit.ref = seq

        return var
Ejemplo n.º 4
0
    def _replace_reference(self, var):
        """fetch reference sequence for variant and update (in-place) if necessary"""

        if var.type not in "cgmnr":
            raise HGVSUnsupportedOperationError("Can only update references for type c, g, m, n, r")

        if var.posedit.edit.type in ("ins", "con"):
            # these types have no reference sequence (zero-width), so return as-is
            return var

        pos = var.posedit.pos
        if ((isinstance(pos.start, hgvs.location.BaseOffsetPosition) and pos.start.offset != 0)
                or (isinstance(pos.end, hgvs.location.BaseOffsetPosition) and pos.end.offset != 0)):
            _logger.info("Can't update reference sequence for intronic variant {}".format(var))
            return var

        # For c. variants, we need coords on underlying sequences
        if var.type == "c":
            mapper = self._fetch_AlignmentMapper(
                tx_ac=var.ac, alt_ac=var.ac, alt_aln_method="transcript")
            pos = mapper.c_to_n(var.posedit.pos)
        else:
            pos = var.posedit.pos

        seq_start = pos.start.base - 1
        seq_end = pos.end.base
        
        # When strict_bounds is False and an error occurs, return
        # variant as-is

        try:
            seq = self.hdp.get_seq(var.ac, seq_start, seq_end)
        except HGVSDataNotAvailableError as e:
            if (seq_start < 0 or len(seq) != seq_end - seq_start):
                assert not hgvs.global_config.mapping.strict_bounds, f"{var}: Got out of bounds variant with strict_bounds enabled"
                assert var.type in "cnr", f"Should not see out of bounds variant on type {var.type}"
                _logger.info(f"{var}: variant outside sequence bounds; reference sequence can't be validated")
                return var

        edit = var.posedit.edit
        if edit.ref != seq:
            _logger.debug("Replaced reference sequence in {var} with {seq}".format(
                var=var, seq=seq))
            edit.ref = seq

        return var
Ejemplo n.º 5
0
    def _replace_reference(self, var):
        """fetch reference sequence for variant and update (in-place) if necessary"""

        if var.type not in "cgmnr":
            raise HGVSUnsupportedOperationError(
                "Can only update references for type c, g, m, n, r")

        if var.posedit.edit.type == "ins":
            # insertions have no reference sequence (zero-width), so return as-is
            return var
        if var.posedit.edit.type == "con":
            # conversions have no reference sequence (zero-width), so return as-is
            return var

        pos = var.posedit.pos
        if ((isinstance(pos.start, hgvs.location.BaseOffsetPosition)
             and pos.start.offset != 0)
                or (isinstance(pos.end, hgvs.location.BaseOffsetPosition)
                    and pos.end.offset != 0)):
            _logger.info(
                "Can't update reference sequence for intronic variant {}".
                format(var))
            return var

        # For c. variants, we need coords on underlying sequences
        if var.type == "c":
            tm = self._fetch_AlignmentMapper(tx_ac=var.ac,
                                             alt_ac=var.ac,
                                             alt_aln_method="transcript")
            pos = tm.c_to_n(var.posedit.pos)
        else:
            pos = var.posedit.pos
        seq = self.hdp.get_seq(var.ac, pos.start.base - 1, pos.end.base)

        edit = var.posedit.edit
        if edit.ref != seq:
            _logger.debug(
                "Replaced reference sequence in {var} with {seq}".format(
                    var=var, seq=seq))
            edit.ref = seq

        return var
Ejemplo n.º 6
0
 def _del_ins_lengths(self, ilen):
     """Placeholder that always raises; ``ilen`` is not used.

     :raises HGVSUnsupportedOperationError: always
     """
     message = "internal function _del_ins_lengths not implemented for this variant type"
     raise HGVSUnsupportedOperationError(message)
Ejemplo n.º 7
0
    def normalize(self, var):
        """Perform sequence variants normalization for single variant

        The variant is validated (when ``self.validator`` is set), its
        reference is filled via ``var.fill_ref(self.hdp)``, the alleles are
        normalized by ``self._normalize_alleles`` and a new variant carrying
        the resulting edit and coordinates is built.  c. variants are
        converted to n. for normalization and converted back afterwards.

        :param var: parsed HGVS sequence variant
        :returns: ``var`` itself when it has no posedit, an uncertain
            posedit or no position; otherwise a deep copy carrying the
            normalized edit and interval
        :raises HGVSUnsupportedOperationError: for protein variants,
            conversion edits, and n./r. variants with a non-zero offset
        """
        assert isinstance(
            var, hgvs.sequencevariant.SequenceVariant
        ), "variant must be a parsed HGVS sequence variant object"

        if self.validator:
            self.validator.validate(var)

        if var.posedit is None or var.posedit.uncertain or var.posedit.pos is None:
            return var

        # Remember the input type: `var` may be rebound to an n. variant below.
        var_type = var.type

        if var_type == "p":
            raise HGVSUnsupportedOperationError(
                "Unsupported normalization of protein level variants: {0}".
                format(var))
        if var.posedit.edit.type == "con":
            # The message must be formatted with str.format; passing
            # format(var) as a second argument left "{0}" unfilled.
            raise HGVSUnsupportedOperationError(
                "Unsupported normalization of conversion variants: {0}".
                format(var))

        var.fill_ref(self.hdp)

        if var.posedit.edit.type == "identity":
            var_norm = copy.deepcopy(var)
            return var_norm

        # For c. variants normalization, first convert to n. variant
        # and perform normalization at the n. level, then convert the
        # normalized n. variant back to c. variant.
        if var_type == "c":
            var = self.hm.c_to_n(var)

        if var.type in "nr":
            if var.posedit.pos.start.offset != 0 or var.posedit.pos.end.offset != 0:
                raise HGVSUnsupportedOperationError(
                    "Normalization of intronic variants is not supported")

        # g, m, n, r sequences all use sequence start as the datum
        # That's an essential assumption herein
        # (this is why we may have converted from c to n above)
        assert var.type in "gmnr", "Internal Error: variant must be of type g, m, n, r"

        bound_s, bound_e = self._get_boundary(var)
        boundary = (bound_s, bound_e)
        start, end, (ref, alt) = self._normalize_alleles(var, boundary)

        ref_len = len(ref)
        alt_len = len(alt)

        # Generate normalized variant; exactly one of the three length
        # relations holds, so ref_start / ref_end / edit are always set.
        if alt_len == ref_len:
            ref_start = start
            ref_end = end - 1
            # inversion
            if ref_len > 1 and ref == reverse_complement(alt):
                edit = hgvs.edit.Inv(ref=ref)
            # ident
            elif ref_len == 0 and alt_len == 0:
                ref_start = ref_end
                edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)
            # substitution or delins
            else:
                edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)
        elif alt_len < ref_len:
            # del or delins
            ref_start = start
            ref_end = end - 1
            edit = hgvs.edit.NARefAlt(ref=ref,
                                      alt=None if alt_len == 0 else alt)
        else:
            # alt_len > ref_len: ins or dup
            if ref_len == 0:
                # Fetch the sequence adjacent to the insertion point; which
                # side is read depends on shuffle_direction.
                if self.shuffle_direction == 3:
                    adj_seq = self._fetch_bounded_seq(var, start - alt_len - 1,
                                                      end - 1, 0, boundary)
                else:
                    adj_seq = self._fetch_bounded_seq(var, start - 1,
                                                      start + alt_len - 1, 0,
                                                      boundary)
                # ins
                if alt != adj_seq:
                    ref_start = start - 1
                    ref_end = end
                    edit = hgvs.edit.NARefAlt(ref=None, alt=alt)
                # dup: the inserted bases repeat the adjacent sequence
                else:
                    if self.shuffle_direction == 3:
                        ref_start = start - alt_len
                        ref_end = end - 1
                        edit = hgvs.edit.Dup(ref=alt)
                    else:
                        ref_start = start
                        ref_end = start + alt_len - 1
                        edit = hgvs.edit.Dup(ref=alt)
            # delins
            else:
                ref_start = start
                ref_end = end - 1
                edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)

        # ensure the start is not 0
        if ref_start == 0:
            ref = self._fetch_bounded_seq(var, 0, 1, 0, boundary)
            alt = alt + ref
            edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)
            ref_start = 1
            ref_end = 1

        # ensure the end is not outside of reference sequence
        tgt_len = self._get_tgt_length(var)
        if ref_end == tgt_len + 1:
            ref = self._fetch_bounded_seq(var, tgt_len - 1, tgt_len, 0,
                                          boundary)
            alt = ref + alt
            edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)
            ref_start = tgt_len
            ref_end = tgt_len

        var_norm = copy.deepcopy(var)
        var_norm.posedit.edit = edit
        var_norm.posedit.pos.start.base = ref_start
        var_norm.posedit.pos.end.base = ref_end

        if var_type == "c":
            var_norm = self.hm.n_to_c(var_norm)

        return var_norm
Ejemplo n.º 8
0
    def _get_boundary(self, var):
        """Get the position of exon-intron boundary for current variant
        """
        if var.type == "r" or var.type == "n":
            if self.cross_boundaries:
                return 0, float("inf")
            else:
                # Get genomic sequence access number for this transcript
                map_info = self.hdp.get_tx_mapping_options(var.ac)
                if not map_info:
                    raise HGVSDataNotAvailableError(
                        "No mapping info available for {ac}".format(ac=var.ac))
                map_info = [
                    item for item in map_info
                    if item["alt_aln_method"] == self.alt_aln_method
                ]
                alt_ac = map_info[0]["alt_ac"]

                # Get tx info
                tx_info = self.hdp.get_tx_info(var.ac, alt_ac,
                                               self.alt_aln_method)
                cds_start = tx_info["cds_start_i"]
                cds_end = tx_info["cds_end_i"]

                # Get exon info
                exon_info = self.hdp.get_tx_exons(var.ac, alt_ac,
                                                  self.alt_aln_method)
                exon_starts = [exon["tx_start_i"] for exon in exon_info]
                exon_ends = [exon["tx_end_i"] for exon in exon_info]
                exon_starts.sort()
                exon_ends.sort()
                exon_starts.append(exon_ends[-1])
                exon_ends.append(float("inf"))

                # Find the end pos of the exon where the var locates
                left = 0
                right = float("inf")

                # TODO: #242: implement methods to find tx regions
                for i in range(0, len(exon_starts)):
                    if (var.posedit.pos.start.base - 1 >= exon_starts[i]
                            and var.posedit.pos.start.base - 1 < exon_ends[i]):
                        break

                for j in range(0, len(exon_starts)):
                    if (var.posedit.pos.end.base - 1 >= exon_starts[j]
                            and var.posedit.pos.end.base - 1 < exon_ends[j]):
                        break

                if i != j:
                    raise HGVSUnsupportedOperationError(
                        "Unsupported normalization of variants spanning the exon-intron boundary ({var})"
                        .format(var=var))

                left = exon_starts[i]
                right = exon_ends[i]

                if cds_start is None:
                    pass
                elif var.posedit.pos.end.base - 1 < cds_start:
                    right = min(right, cds_start)
                elif var.posedit.pos.start.base - 1 >= cds_start:
                    left = max(left, cds_start)
                else:
                    raise HGVSUnsupportedOperationError(
                        "Unsupported normalization of variants spanning the UTR-exon boundary ({var})"
                        .format(var=var))

                if cds_end is None:
                    pass
                elif var.posedit.pos.start.base - 1 >= cds_end:
                    left = max(left, cds_end)
                elif var.posedit.pos.end.base - 1 < cds_end:
                    right = min(right, cds_end)
                else:
                    raise HGVSUnsupportedOperationError(
                        "Unsupported normalization of variants spanning the exon-UTR boundary ({var})"
                        .format(var=var))

                return left, right
        else:
            # For variant type of g and m etc.
            return 0, float("inf")
Ejemplo n.º 9
0
    def normalize(self, var):
        """Perform sequence variants normalization for single variant

        The variant is validated (when ``self.validator`` is set), its
        reference is filled via ``var.fill_ref(self.hdp)``, the alleles are
        normalized by ``self._normalize_alleles`` and a new variant carrying
        the resulting edit and coordinates is built.  c. variants are
        converted to n. for normalization and converted back afterwards.

        :param var: parsed HGVS sequence variant
        :returns: the input variant itself when it has no posedit, an
            uncertain posedit, ``init_met`` set or no position, or when its
            coordinates are out of bounds and ``strict_bounds`` is off;
            otherwise a deep copy carrying the normalized edit and interval
        :raises HGVSUnsupportedOperationError: for protein variants,
            conversion edits, and n./r. variants with a non-zero offset
        :raises HGVSInvalidVariantError: for out-of-bounds coordinates when
            ``hgvs.global_config.mapping.strict_bounds`` is set
        """
        assert isinstance(
            var, hgvs.sequencevariant.SequenceVariant
        ), "variant must be a parsed HGVS sequence variant object"

        # keep a shallow reference to the original variant, to be returned
        # as-is under certain circumstances
        orig_var = var

        if self.validator:
            self.validator.validate(var)

        # NOTE(review): this tests var.posedit itself against an edit class
        # and reads init_met from it; confirm whether var.posedit.edit was
        # intended.
        init_met = False
        if var.posedit is not None and isinstance(var.posedit,
                                                  hgvs.edit.AARefAlt):
            init_met = var.posedit.init_met

        if var.posedit is None or var.posedit.uncertain or init_met or var.posedit.pos is None:
            return var

        # Remember the input type: `var` may be rebound to an n. variant below.
        var_type = var.type

        if var_type == "p":
            raise HGVSUnsupportedOperationError(
                "Unsupported normalization of protein level variants: {0}".
                format(var))
        if var.posedit.edit.type == "con":
            # The message must be formatted with str.format; passing
            # format(var) as a second argument left "{0}" unfilled.
            raise HGVSUnsupportedOperationError(
                "Unsupported normalization of conversion variants: {0}".
                format(var))

        var.fill_ref(self.hdp)

        if var.posedit.edit.type == "identity":
            var_norm = copy.deepcopy(var)
            return var_norm

        # For c. variants normalization, first convert to n. variant
        # and perform normalization at the n. level, then convert the
        # normalized n. variant back to c. variant.
        if var_type == "c":
            var = self.vm.c_to_n(var)

        if var.type in "nr":
            if var.posedit.pos.start.offset != 0 or var.posedit.pos.end.offset != 0:
                raise HGVSUnsupportedOperationError(
                    "Normalization of intronic variants is not supported")

        def is_valid_pos(ac, pos):
            # tests whether the sequence position actually exists
            # This is *way* janky.
            # TODO: push functionality to hdp which can implement differently
            # based on capabilities of sequence backend
            try:
                s = self.hdp.get_seq(ac, pos - 1, pos)  # 0-based!
                return s != ""
            except HGVSDataNotAvailableError as e:
                # Bad Request indicates that we got to NCBI, but the request
                # was invalid.
                return "Bad Request" not in str(e)

        if var.posedit.pos.start.base < 0 or not is_valid_pos(
                var.ac, var.posedit.pos.end.base):
            if hgvs.global_config.mapping.strict_bounds:
                raise HGVSInvalidVariantError(
                    f"{var}: coordinates are out-of-bounds")
            _logger.warning(
                f"{var}: coordinates are out-of-bounds; returning as-is")
            return orig_var

        # restrict var types to those that use sequence start (i.e., not c.)
        assert var.type in "gmnr", "Internal Error: variant must be of type g, m, n, r"

        bound_s, bound_e = self._get_boundary(var)
        boundary = (bound_s, bound_e)
        start, end, (ref, alt) = self._normalize_alleles(var, boundary)

        ref_len = len(ref)
        alt_len = len(alt)

        # Generate normalized variant; exactly one of the three length
        # relations holds, so ref_start / ref_end / edit are always set.
        if alt_len == ref_len:
            ref_start = start
            ref_end = end - 1
            # inversion
            if ref_len > 1 and ref == reverse_complement(alt):
                edit = hgvs.edit.Inv(ref=ref)
            # ident
            elif ref_len == 0 and alt_len == 0:
                ref_start = ref_end
                edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)
            # substitution or delins
            else:
                edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)
        elif alt_len < ref_len:
            # del or delins
            ref_start = start
            ref_end = end - 1
            edit = hgvs.edit.NARefAlt(ref=ref,
                                      alt=None if alt_len == 0 else alt)
        else:
            # alt_len > ref_len: ins or dup
            if ref_len == 0:
                # Fetch the sequence adjacent to the insertion point; which
                # side is read depends on shuffle_direction.
                if self.shuffle_direction == 3:
                    adj_seq = self._fetch_bounded_seq(var, start - alt_len - 1,
                                                      end - 1, 0, boundary)
                else:
                    adj_seq = self._fetch_bounded_seq(var, start - 1,
                                                      start + alt_len - 1, 0,
                                                      boundary)
                # ins
                if alt != adj_seq:
                    ref_start = start - 1
                    ref_end = end
                    edit = hgvs.edit.NARefAlt(ref=None, alt=alt)
                # dup: the inserted bases repeat the adjacent sequence
                else:
                    if self.shuffle_direction == 3:
                        ref_start = start - alt_len
                        ref_end = end - 1
                        edit = hgvs.edit.Dup(ref=alt)
                    else:
                        ref_start = start
                        ref_end = start + alt_len - 1
                        edit = hgvs.edit.Dup(ref=alt)
            # delins
            else:
                ref_start = start
                ref_end = end - 1
                edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)

        # ensure the start is not 0
        if ref_start == 0:
            ref = self._fetch_bounded_seq(var, 0, 1, 0, boundary)
            alt = alt + ref
            edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)
            ref_start = 1
            ref_end = 1

        # ensure the end is not outside of reference sequence
        tgt_len = self._get_tgt_length(var)
        if ref_end == tgt_len + 1:
            ref = self._fetch_bounded_seq(var, tgt_len - 1, tgt_len, 0,
                                          boundary)
            alt = ref + alt
            edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)
            ref_start = tgt_len
            ref_end = tgt_len

        var_norm = copy.deepcopy(var)
        var_norm.posedit.edit = edit
        var_norm.posedit.pos.start.base = ref_start
        var_norm.posedit.pos.end.base = ref_end

        if var_type == "c":
            var_norm = self.vm.n_to_c(var_norm)

        return var_norm