def validate(self, var, strict=None):
    """Validate a parsed sequence variant against transcript data.

    For ``n.`` variants (and ``c.`` variants, after projection to ``n.`` via
    ``self.vm.c_to_n``) the transcript bounds are checked first.  When that
    check is not VALID, either an error is raised (global ``strict_bounds``
    enabled) or a warning is logged and validation stops early.

    :param var: a parsed variant (must be a ``SequenceVariant``)
    :param strict: when true, warnings are treated as failures; when ``None``
        the instance-level ``self.strict`` setting is used
    :returns: ``True`` when no check reached the failure level
    :raises HGVSInvalidVariantError: if a check reaches the failure level, or
        the variant is out of transcript bounds with ``strict_bounds`` on
    """
    assert isinstance(var, hgvs.sequencevariant.SequenceVariant), (
        "variant must be a parsed HGVS sequence variant object")

    if strict is None:
        strict = self.strict
    if strict:
        threshold = ValidationLevel.WARNING
    else:
        threshold = ValidationLevel.ERROR

    # Obtain an n. representation when the variant lives on a transcript.
    if var.type == "n":
        tx_var = var
    elif var.type == "c":
        tx_var = self.vm.c_to_n(var)
    else:
        tx_var = None

    if tx_var is not None:
        level, message = self._n_within_transcript_bounds(tx_var)
        if level != ValidationLevel.VALID:
            if hgvs.global_config.mapping.strict_bounds:
                raise HGVSInvalidVariantError(message)
            _logger.warning("{}: Variant outside transcript bounds;"
                            " no validation provided".format(var))
            # no other checking performed
            return True

    level, message = self._c_within_cds_bound(var)
    if level >= threshold:
        raise HGVSInvalidVariantError(message)

    level, message = self._ref_is_valid(var)
    if level >= threshold:
        raise HGVSInvalidVariantError(message)

    return True
def validate(self, var, strict=None):
    """Validate a parsed sequence variant's reference and CDS bounds.

    The reference check runs first, then the CDS-bound check; the first one
    whose result reaches the failure level aborts validation.

    :param var: a parsed variant (must be a ``SequenceVariant``)
    :param strict: when true, warnings are treated as failures; when ``None``
        the instance-level ``self.strict`` setting is used
    :returns: ``True`` when both checks stay below the failure level
    :raises HGVSInvalidVariantError: if either check reaches the failure level
    """
    assert isinstance(var, hgvs.sequencevariant.SequenceVariant), (
        "variant must be a parsed HGVS sequence variant object")

    use_strict = self.strict if strict is None else strict
    threshold = ValidationLevel.WARNING if use_strict else ValidationLevel.ERROR

    # Order matters: reference validity is reported before CDS bounds.
    for check in (self._ref_is_valid, self._c_within_cds_bound):
        level, message = check(var)
        if level >= threshold:
            raise HGVSInvalidVariantError(message)
    return True
def n_to_c(self, var_n):
    """Project a parsed n. variant to a c. variant on the same transcript.

    The transcript is aligned to itself (alignment method "transcript").

    :param hgvs.sequencevariant.SequenceVariant var_n: a variant object
    :returns: variant object (:class:`hgvs.sequencevariant.SequenceVariant`)
    :raises HGVSInvalidVariantError: if var_n is not of type "n"
    :raises HGVSUnsupportedOperationError: if the edit is not a
        NARefAlt, Dup or Inv

    """
    if var_n.type != "n":
        raise HGVSInvalidVariantError("Expected n. variant; got " + str(var_n))
    if self._validator:
        self._validator.validate(var_n)
    var_n.fill_ref(self.hdp)

    mapper = self._fetch_TranscriptMapper(
        tx_ac=var_n.ac, alt_ac=var_n.ac, alt_aln_method="transcript")
    pos_c = mapper.n_to_c(var_n.posedit.pos)

    source_edit = var_n.posedit.edit
    supported = (hgvs.edit.NARefAlt, hgvs.edit.Dup, hgvs.edit.Inv)
    if not isinstance(source_edit, supported):
        raise HGVSUnsupportedOperationError("Only NARefAlt/Dup/Inv types are currently implemented")
    # The edit itself is coordinate-independent, so an independent copy is reused.
    edit_c = copy.deepcopy(source_edit)

    var_c = hgvs.sequencevariant.SequenceVariant(
        ac=var_n.ac, type="c", posedit=hgvs.posedit.PosEdit(pos_c, edit_c))
    if self.replace_reference:
        self._replace_reference(var_c)
    return var_c
def c_to_g(self, var_c, alt_ac, alt_aln_method=hgvs.global_config.mapping.alt_aln_method):
    """Project a parsed c. variant onto a genomic reference as a g. variant.

    :param hgvs.sequencevariant.SequenceVariant var_c: a variant object
    :param str alt_ac: a reference sequence accession (e.g., NC_000001.11)
    :param str alt_aln_method: the alignment method; valid values depend on
        data source (defaults to the globally configured method)
    :returns: variant object (:class:`hgvs.sequencevariant.SequenceVariant`)
    :raises HGVSInvalidVariantError: if var_c is not of type "c"

    """
    if var_c.type != "c":
        raise HGVSInvalidVariantError("Expected a cDNA (c.); got " + str(var_c))
    if self._validator:
        self._validator.validate(var_c)
    var_c.fill_ref(self.hdp)

    mapper = self._fetch_TranscriptMapper(
        tx_ac=var_c.ac, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
    genomic_pos = mapper.c_to_g(var_c.posedit.pos)
    # The edit conversion depends on the strand of the alignment.
    genomic_edit = self._convert_edit_check_strand(mapper.strand, var_c.posedit.edit)

    genomic_posedit = hgvs.posedit.PosEdit(genomic_pos, genomic_edit)
    var_g = hgvs.sequencevariant.SequenceVariant(ac=alt_ac, type="g", posedit=genomic_posedit)
    if self.replace_reference:
        self._replace_reference(var_g)
    return var_g
def g_to_c(self, var_g, tx_ac, alt_aln_method=hgvs.global_config.mapping.alt_aln_method):
    """Project a parsed g. variant onto a transcript as a c. variant.

    :param hgvs.sequencevariant.SequenceVariant var_g: a variant object
    :param str tx_ac: a transcript accession (e.g., NM_012345.6 or ENST012345678)
    :param str alt_aln_method: the alignment method; valid values depend on
        data source (defaults to the globally configured method)
    :returns: variant object (:class:`hgvs.sequencevariant.SequenceVariant`) using CDS coordinates
    :raises HGVSInvalidVariantError: if var_g is not of type "g"

    """
    if var_g.type != "g":
        raise HGVSInvalidVariantError("Expected a g. variant; got " + str(var_g))
    if self._validator:
        self._validator.validate(var_g)
    var_g.fill_ref(self.hdp)

    mapper = self._fetch_TranscriptMapper(
        tx_ac=tx_ac, alt_ac=var_g.ac, alt_aln_method=alt_aln_method)
    cds_pos = mapper.g_to_c(var_g.posedit.pos)
    # The edit conversion depends on the strand of the alignment.
    cds_edit = self._convert_edit_check_strand(mapper.strand, var_g.posedit.edit)

    cds_posedit = hgvs.posedit.PosEdit(cds_pos, cds_edit)
    var_c = hgvs.sequencevariant.SequenceVariant(ac=tx_ac, type="c", posedit=cds_posedit)
    if self.replace_reference:
        self._replace_reference(var_c)
    return var_c
def c_to_p(self, var_c, pro_ac=None):
    """
    Converts a c. SequenceVariant to a p. SequenceVariant on the specified protein accession
    Author: Rudy Rico

    :param SequenceVariant var_c: hgvsc tag
    :param str pro_ac: protein accession
    :rtype: hgvs.sequencevariant.SequenceVariant
    :raises HGVSInvalidVariantError: if var_c is not of type "c"

    """
    if var_c.type != "c":
        raise HGVSInvalidVariantError("Expected a cDNA (c.) variant; got " + str(var_c))
    if self._validator:
        self._validator.validate(var_c)

    reference_data = RefTranscriptData(self.hdp, var_c.ac, pro_ac)
    alt_builder = altseqbuilder.AltSeqBuilder(var_c, reference_data)

    # TODO: handle case where you get 2+ alt sequences back;
    # every alt sequence is converted, but only the first result is used.
    protein_variants = [
        altseq_to_hgvsp.AltSeqToHgvsp(reference_data, alt_data).build_hgvsp()
        for alt_data in alt_builder.build_altseq()
    ]
    var_p = protein_variants[0]

    if self.add_gene_symbol:
        self._update_gene_symbol(var_p, var_c.gene)
    return var_p
def c_to_n(self, var_c):
    """Project a parsed c. variant to an n. variant on the same transcript.

    The transcript is aligned to itself (alignment method "transcript").

    :param hgvs.sequencevariant.SequenceVariant var_c: a variant object
    :returns: variant object (:class:`hgvs.sequencevariant.SequenceVariant`)
    :raises HGVSInvalidVariantError: if var_c is not of type "c"
    :raises HGVSUnsupportedOperationError: if the edit is not a
        NARefAlt, Dup or Inv

    """
    if var_c.type != "c":
        raise HGVSInvalidVariantError("Expected a cDNA (c.); got " + str(var_c))
    if self._validator:
        self._validator.validate(var_c)
    var_c.fill_ref(self.hdp)

    aln = self._fetch_AlignmentMapper(
        tx_ac=var_c.ac, alt_ac=var_c.ac, alt_aln_method="transcript")
    pos_n = aln.c_to_n(var_c.posedit.pos)

    source_edit = var_c.posedit.edit
    supported = (hgvs.edit.NARefAlt, hgvs.edit.Dup, hgvs.edit.Inv)
    if not isinstance(source_edit, supported):
        raise HGVSUnsupportedOperationError(
            "Only NARefAlt/Dup/Inv types are currently implemented")
    # The edit itself is coordinate-independent, so an independent copy is reused.
    edit_n = copy.deepcopy(source_edit)

    var_n = hgvs.sequencevariant.SequenceVariant(
        ac=var_c.ac, type="n", posedit=hgvs.posedit.PosEdit(pos_n, edit_n))
    if self.replace_reference:
        self._replace_reference(var_n)
    if self.add_gene_symbol:
        self._update_gene_symbol(var_n, var_c.gene)
    return var_n
def g_to_c(self, var_g, tx_ac, alt_aln_method=hgvs.global_config.mapping.alt_aln_method):
    """Project a parsed g. variant onto a transcript as a c. variant.

    When the mapped position is uncertain (variant at an alignment gap), the
    edit is rebuilt as a NARefAlt with an empty ref and the altered sequence
    as alt.  Otherwise the edit is strand-converted; an insertion whose
    mapped exonic endpoints are more than one apart is narrowed by one base
    on each side and its ref is cleared.

    :param hgvs.sequencevariant.SequenceVariant var_g: a variant object
    :param str tx_ac: a transcript accession (e.g., NM_012345.6 or ENST012345678)
    :param str alt_aln_method: the alignment method; valid values depend on
        data source (defaults to the globally configured method)
    :returns: variant object (:class:`hgvs.sequencevariant.SequenceVariant`) using CDS coordinates
    :raises HGVSInvalidVariantError: if var_g is not of type "g"

    """
    if var_g.type != "g":
        raise HGVSInvalidVariantError("Expected a g. variant; got " + str(var_g))
    if self._validator:
        self._validator.validate(var_g)
    var_g.fill_ref(self.hdp)

    aln = self._fetch_AlignmentMapper(
        tx_ac=tx_ac, alt_ac=var_g.ac, alt_aln_method=alt_aln_method)
    pos_c = aln.g_to_c(var_g.posedit.pos)

    if pos_c.uncertain:
        # variant at alignment gap
        gap_pos_g = aln.c_to_g(pos_c)
        gap_alt = self._get_altered_sequence(aln.strand, gap_pos_g, var_g)
        edit_c = hgvs.edit.NARefAlt(ref='', alt=gap_alt)
    else:
        edit_c = self._convert_edit_check_strand(aln.strand, var_g.posedit.edit)
        exonic_ins = (edit_c.type == 'ins'
                      and pos_c.start.offset == 0
                      and pos_c.end.offset == 0)
        if exonic_ins and pos_c.end - pos_c.start > 1:
            pos_c.start.base += 1
            pos_c.end.base -= 1
            edit_c.ref = ''

    # The projected interval inherits the uncertainty of the input interval.
    pos_c.uncertain = var_g.posedit.pos.uncertain

    var_c = hgvs.sequencevariant.SequenceVariant(
        ac=tx_ac, type="c", posedit=hgvs.posedit.PosEdit(pos_c, edit_c))
    if self.replace_reference:
        self._replace_reference(var_c)
    return var_c
def n_to_g(self, var_n, alt_ac, alt_aln_method=hgvs.global_config.mapping.alt_aln_method):
    """Project a parsed n. variant onto a genomic reference as a g. variant.

    When the mapped position is uncertain (variant at an alignment gap), the
    edit is rebuilt as a NARefAlt with an empty ref and the altered sequence
    as alt.  Otherwise the edit is strand-converted; an insertion whose
    mapped endpoints are more than one apart is narrowed by one base on each
    side and its ref is cleared.

    :param hgvs.sequencevariant.SequenceVariant var_n: a variant object
    :param str alt_ac: a reference sequence accession (e.g., NC_000001.11)
    :param str alt_aln_method: the alignment method; valid values depend on
        data source (defaults to the globally configured method)
    :returns: variant object (:class:`hgvs.sequencevariant.SequenceVariant`)
    :raises HGVSInvalidVariantError: if var_n is not of type "n"

    """
    if var_n.type != "n":
        raise HGVSInvalidVariantError("Expected a n. variant; got " + str(var_n))
    if self._validator:
        self._validator.validate(var_n)
    var_n.fill_ref(self.hdp)

    aln = self._fetch_AlignmentMapper(
        tx_ac=var_n.ac, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
    pos_g = aln.n_to_g(var_n.posedit.pos)

    if pos_g.uncertain:
        # variant at alignment gap
        gap_pos_n = aln.g_to_n(pos_g)
        gap_alt = self._get_altered_sequence(aln.strand, gap_pos_n, var_n)
        edit_g = hgvs.edit.NARefAlt(ref='', alt=gap_alt)
    else:
        edit_g = self._convert_edit_check_strand(aln.strand, var_n.posedit.edit)
        if edit_g.type == 'ins' and pos_g.end - pos_g.start > 1:
            pos_g.start.base += 1
            pos_g.end.base -= 1
            edit_g.ref = ''

    # The projected interval inherits the uncertainty of the input interval.
    pos_g.uncertain = var_n.posedit.pos.uncertain

    var_g = hgvs.sequencevariant.SequenceVariant(
        ac=alt_ac, type="g", posedit=hgvs.posedit.PosEdit(pos_g, edit_g))
    if self.replace_reference:
        self._replace_reference(var_g)
    return var_g
def g_to_t(self, var_g, tx_ac, alt_aln_method=hgvs.global_config.mapping.alt_aln_method):
    """Project a parsed g. variant onto a transcript as a c. or n. variant.

    The result is a c. variant when the transcript mapper reports a coding
    transcript, and an n. variant otherwise.

    :param hgvs.sequencevariant.SequenceVariant var_g: a variant object
    :param str tx_ac: a transcript accession
    :param str alt_aln_method: the alignment method (defaults to the globally
        configured method)
    :returns: the projected variant, as produced by ``VariantMapper.g_to_c``
        or ``VariantMapper.g_to_n``
    :raises HGVSInvalidVariantError: if var_g is not of type "g"
    """
    if var_g.type != "g":
        raise HGVSInvalidVariantError("Expected a g. variant; got " + str(var_g))
    if self._validator:
        self._validator.validate(var_g)
    var_g.fill_ref(self.hdp)

    mapper = self._fetch_TranscriptMapper(
        tx_ac=tx_ac, alt_ac=var_g.ac, alt_aln_method=alt_aln_method)

    # Call the base-class implementations explicitly, bypassing any override.
    if mapper.is_coding_transcript:
        project = VariantMapper.g_to_c
    else:
        project = VariantMapper.g_to_n
    return project(self, var_g=var_g, tx_ac=tx_ac, alt_aln_method=alt_aln_method)
def t_to_g(self, var_t, alt_ac, alt_aln_method=hgvs.global_config.mapping.alt_aln_method):
    """Project a parsed transcript variant (c. or n.) onto a genomic reference.

    :param hgvs.sequencevariant.SequenceVariant var_t: a c. or n. variant object
    :param str alt_ac: a reference sequence accession
    :param str alt_aln_method: the alignment method (defaults to the globally
        configured method)
    :returns: the g. variant produced by ``VariantMapper.c_to_g`` (for c.
        input) or ``VariantMapper.n_to_g`` (for n. input)
    :raises HGVSInvalidVariantError: if var_t is not of type "c" or "n"
    """
    # Tuple membership rather than substring membership: with `in "cn"`,
    # the strings "" and "cn" would wrongly pass this guard.
    if var_t.type not in ("c", "n"):
        raise HGVSInvalidVariantError("Expected a c. or n. variant; got " + str(var_t))
    if self._validator:
        self._validator.validate(var_t)
    var_t.fill_ref(self.hdp)

    # Dispatch on the variant's own coordinate type, not on whether the
    # transcript is coding: c_to_g only accepts c. variants and n_to_g only
    # accepts n. variants, so an n. variant on a coding transcript must go
    # through n_to_g.  The delegates fetch the alignment mapper themselves.
    if var_t.type == "c":
        return VariantMapper.c_to_g(
            self, var_c=var_t, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
    return VariantMapper.n_to_g(
        self, var_n=var_t, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
def g_to_n(self, var_g, tx_ac, alt_aln_method=hgvs.global_config.mapping.alt_aln_method):
    """Project a parsed g. variant onto a transcript as an n. variant.

    A minus-strand variant whose genomic interval is out of the alignment's
    bounds is first re-normalized with ``self.left_normalizer`` (only when
    global ``strict_bounds`` is off).  When the mapped position is uncertain
    (variant at an alignment gap), the edit is rebuilt as a NARefAlt with an
    empty ref and the altered sequence as alt.

    :param hgvs.sequencevariant.SequenceVariant var_g: a variant object
    :param str tx_ac: a transcript accession (e.g., NM_012345.6 or ENST012345678)
    :param str alt_aln_method: the alignment method; valid values depend on
        data source (defaults to the globally configured method)
    :returns: variant object (:class:`hgvs.sequencevariant.SequenceVariant`) using transcript (n.) coordinates
    :raises HGVSInvalidVariantError: if var_g is not of type "g"

    """
    if var_g.type != "g":
        raise HGVSInvalidVariantError("Expected a g. variant; got " + str(var_g))
    if self._validator:
        self._validator.validate(var_g)

    aln = self._fetch_AlignmentMapper(
        tx_ac=tx_ac, alt_ac=var_g.ac, alt_aln_method=alt_aln_method)

    out_of_bounds_minus = (
        aln.strand == -1
        and not hgvs.global_config.mapping.strict_bounds
        and not aln.g_interval_is_inbounds(var_g.posedit.pos)
    )
    if out_of_bounds_minus:
        _logger.info("Renormalizing out-of-bounds minus strand variant on genomic sequence")
        # From here on, var_g refers to the re-normalized variant.
        var_g = self.left_normalizer.normalize(var_g)

    var_g.fill_ref(self.hdp)
    pos_n = aln.g_to_n(var_g.posedit.pos)

    if pos_n.uncertain:
        # variant at alignment gap
        gap_pos_g = aln.n_to_g(pos_n)
        gap_alt = self._get_altered_sequence(aln.strand, gap_pos_g, var_g)
        edit_n = hgvs.edit.NARefAlt(ref='', alt=gap_alt)
    else:
        edit_n = self._convert_edit_check_strand(aln.strand, var_g.posedit.edit)
        exonic_ins = (edit_n.type == 'ins'
                      and pos_n.start.offset == 0
                      and pos_n.end.offset == 0)
        if exonic_ins and pos_n.end - pos_n.start > 1:
            pos_n.start.base += 1
            pos_n.end.base -= 1
            edit_n.ref = ''

    # The projected interval inherits the uncertainty of the input interval.
    pos_n.uncertain = var_g.posedit.pos.uncertain

    var_n = hgvs.sequencevariant.SequenceVariant(
        ac=tx_ac, type="n", posedit=hgvs.posedit.PosEdit(pos_n, edit_n))

    # The reference is only replaced when the interval lies inside the transcript.
    if self.replace_reference:
        n_interval = var_n.posedit.pos
        if n_interval.start.base >= 0 and n_interval.end.base < aln.tgt_len:
            self._replace_reference(var_n)
    if self.add_gene_symbol:
        self._update_gene_symbol(var_n, var_g.gene)
    return var_n
def _fetch_bounded_seq(self, var, start, end, boundary):
    """Fetch reference sequence from the hgvs data provider, clamped to a boundary.

    The start position is 0 and the interval is half open.

    :param var: variant whose accession (``var.ac``) identifies the sequence
    :param start: requested start; raised to ``boundary[0]`` if smaller
    :param end: requested end; lowered to ``boundary[1]`` if larger
    :param boundary: ``(lower, upper)`` pair limiting the fetched interval
    :returns: the sequence for the clamped interval, or ``""`` when the
        clamped interval is empty
    :raises HGVSInvalidVariantError: if the provider returns fewer bases than
        the clamped interval spans
    """
    lower, upper = boundary[0], boundary[1]
    start = max(start, lower)
    end = min(end, upper)
    if end <= start:
        return ""

    fetched = self.hdp.get_seq(var.ac, start, end)
    expected_len = end - start
    if len(fetched) < expected_len:
        raise HGVSInvalidVariantError("Variant span is outside sequence bounds ({var})".format(var=var))
    return fetched
def t_to_p(self, var_t):
    """Return a protein variant, or "non-coding" for non-coding variant types

    CAUTION: Unlike other x_to_y methods that always return
    SequenceVariant instances, this method returns a string when
    the variant type is ``n``.  This is intended as a convenience,
    particularly when looping over ``relevant_transcripts``,
    projecting with ``g_to_t``, then desiring a protein
    representation for coding transcripts.

    :param var_t: a c. or n. variant object
    :returns: the result of ``self.c_to_p`` for c. variants, or the string
        ``"non-coding"`` for n. variants
    :raises HGVSInvalidVariantError: if var_t is neither type "c" nor "n"
    """
    variant_type = var_t.type
    if variant_type == "c":
        return self.c_to_p(var_t)
    if variant_type == "n":
        return "non-coding"
    raise HGVSInvalidVariantError("Expected a coding (c.) or non-coding (n.) variant; got " + str(var_t))
def c_to_p(self, var_c, pro_ac=None):
    """
    Converts a c. SequenceVariant to a p. SequenceVariant on the specified protein accession
    Author: Rudy Rico

    :param SequenceVariant var_c: hgvsc tag
    :param str pro_ac: protein accession; when ``None`` it is looked up from
        the data provider
    :rtype: hgvs.sequencevariant.SequenceVariant
    :raises HGVSInvalidVariantError: if var_c is not of type "c"
    :raises HGVSDataNotAvailableError: if transcript info or sequence is missing

    """

    @attr.s(slots=True)
    class RefTranscriptData(object):
        transcript_sequence = attr.ib()
        aa_sequence = attr.ib()
        cds_start = attr.ib()
        cds_stop = attr.ib()
        protein_accession = attr.ib()

        @classmethod
        def setup_transcript_data(cls, hdp, tx_ac, pro_ac):
            """helper for generating RefTranscriptData from for c_to_p"""
            # Use the accession passed in, not the enclosing var_c, so the
            # helper is self-contained.
            tx_info = hdp.get_tx_identity_info(tx_ac)
            tx_seq = hdp.get_seq(tx_ac)

            if tx_info is None or tx_seq is None:
                raise HGVSDataNotAvailableError("Missing transcript data for accession: {}".format(tx_ac))

            # use 1-based hgvs coords
            cds_start = tx_info["cds_start_i"] + 1
            cds_stop = tx_info["cds_end_i"]

            # Pad a partial trailing codon with "N" so biopython won't
            # complain during the conversion.  (The previous code called
            # "".join() on the None returned by list.extend(), which raised
            # TypeError and discarded the result anyway.)
            tx_seq_to_translate = tx_seq[cds_start - 1:cds_stop]
            pad_len = -len(tx_seq_to_translate) % 3
            if pad_len:
                tx_seq_to_translate += "N" * pad_len

            tx_seq_cds = Seq(tx_seq_to_translate)
            protein_seq = str(tx_seq_cds.translate())

            if pro_ac is None:
                # get_acs... will always return at least the MD5_ accession
                pro_ac = (hdp.get_pro_ac_for_tx_ac(tx_ac) or hdp.get_acs_for_protein_seq(protein_seq)[0])

            return cls(tx_seq, protein_seq, cds_start, cds_stop, pro_ac)

    if not (var_c.type == "c"):
        raise HGVSInvalidVariantError("Expected a cDNA (c.); got " + str(var_c))
    if self._validator:
        self._validator.validate(var_c)

    reference_data = RefTranscriptData.setup_transcript_data(self.hdp, var_c.ac, pro_ac)
    builder = altseqbuilder.AltSeqBuilder(var_c, reference_data)

    # TODO: handle case where you get 2+ alt sequences back;
    # every alt sequence is converted, but only the first result is returned.
    all_alt_data = builder.build_altseq()
    var_ps = [
        altseq_to_hgvsp.AltSeqToHgvsp(reference_data, alt_data).build_hgvsp()
        for alt_data in all_alt_data
    ]
    return var_ps[0]
def normalize(self, var):
    """Perform sequence variants normalization for single variant

    c. variants are projected to n., normalized there, and projected back.
    Variants without a posedit or position, or with an uncertain posedit,
    are returned unchanged.  Out-of-bounds variants are returned as-is
    unless global ``strict_bounds`` is enabled.

    :param var: a parsed variant (must be a ``SequenceVariant``)
    :returns: the normalized variant (a new object), or the input variant
        itself in the pass-through cases above
    :raises HGVSUnsupportedOperationError: for protein variants, conversion
        edits, and intronic n./r. variants
    :raises HGVSInvalidVariantError: for out-of-bounds coordinates when
        ``strict_bounds`` is enabled
    """
    assert isinstance(
        var, hgvs.sequencevariant.SequenceVariant
    ), "variant must be a parsed HGVS sequence variant object"

    # keep a shallow reference to the original variant, to be returned
    # as-is under certain circumstances
    orig_var = var

    if self.validator:
        self.validator.validate(var)

    # NOTE(review): this tests var.posedit itself rather than
    # var.posedit.edit; if posedit is never an AARefAlt, init_met is always
    # False here.  Left unchanged pending confirmation of the intent.
    init_met = False
    if var.posedit is not None and isinstance(var.posedit, hgvs.edit.AARefAlt):
        init_met = var.posedit.init_met

    if var.posedit is None or var.posedit.uncertain or init_met or var.posedit.pos is None:
        return var

    # Remember the input type: `var` may be rebound to an n. projection below.
    var_type = var.type

    if var_type == "p":
        raise HGVSUnsupportedOperationError(
            "Unsupported normalization of protein level variants: {0}".format(var))
    if var.posedit.edit.type == "con":
        # Previously written as `"...{0}", format(var)` (comma, not dot),
        # which left the placeholder unformatted in the message.
        raise HGVSUnsupportedOperationError(
            "Unsupported normalization of conversion variants: {0}".format(var))

    var.fill_ref(self.hdp)

    if var.posedit.edit.type == "identity":
        return copy.deepcopy(var)

    # For c. variants normalization, first convert to n. variant
    # and perform normalization at the n. level, then convert the
    # normalized n. variant back to c. variant.
    if var_type == "c":
        var = self.vm.c_to_n(var)

    if var.type in ("n", "r"):
        if var.posedit.pos.start.offset != 0 or var.posedit.pos.end.offset != 0:
            raise HGVSUnsupportedOperationError(
                "Normalization of intronic variants is not supported")

    def is_valid_pos(ac, pos):
        # tests whether the sequence position actually exists
        # This is *way* janky.
        # TODO: push functionality to hdp which can implement differently
        # based on capabilities of sequence backend
        try:
            s = self.hdp.get_seq(ac, pos - 1, pos)    # 0-based!
            return s != ""
        except HGVSDataNotAvailableError as e:
            # Bad Request indicates that we got to NCBI, but the request
            # was invalid.
            return "Bad Request" not in str(e)

    if var.posedit.pos.start.base < 0 or not is_valid_pos(
            var.ac, var.posedit.pos.end.base):
        if hgvs.global_config.mapping.strict_bounds:
            raise HGVSInvalidVariantError(
                f"{var}: coordinates are out-of-bounds")
        _logger.warning(
            f"{var}: coordinates are out-of-bounds; returning as-is")
        return orig_var

    # restrict var types to those that use sequence start (i.e., not c.)
    assert var.type in ("g", "m", "n", "r"), "Internal Error: variant must be of type g, m, n, r"

    bound_s, bound_e = self._get_boundary(var)
    boundary = (bound_s, bound_e)
    start, end, (ref, alt) = self._normalize_alleles(var, boundary)

    ref_len = len(ref)
    alt_len = len(alt)

    # Generate normalized variant
    if alt_len == ref_len:
        ref_start = start
        ref_end = end - 1
        if ref_len > 1 and ref == reverse_complement(alt):
            # inversion
            edit = hgvs.edit.Inv(ref=ref)
        elif ref_len == 0 and alt_len == 0:
            # ident
            ref_start = ref_end
            edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)
        else:
            # substitution or delins
            edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)
    elif alt_len < ref_len:
        # del or delins
        ref_start = start
        ref_end = end - 1
        edit = hgvs.edit.NARefAlt(ref=ref, alt=None if alt_len == 0 else alt)
    else:
        # alt_len > ref_len: ins or dup
        if ref_len == 0:
            # Compare the inserted bases with the adjacent reference bases
            # (on the side given by the shuffle direction) to detect a dup.
            if self.shuffle_direction == 3:
                adj_seq = self._fetch_bounded_seq(var, start - alt_len - 1, end - 1, 0, boundary)
            else:
                adj_seq = self._fetch_bounded_seq(var, start - 1, start + alt_len - 1, 0, boundary)
            if alt != adj_seq:
                # ins
                ref_start = start - 1
                ref_end = end
                edit = hgvs.edit.NARefAlt(ref=None, alt=alt)
            else:
                # dup
                if self.shuffle_direction == 3:
                    ref_start = start - alt_len
                    ref_end = end - 1
                else:
                    ref_start = start
                    ref_end = start + alt_len - 1
                edit = hgvs.edit.Dup(ref=alt)
        else:
            # delins
            ref_start = start
            ref_end = end - 1
            edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)

    # ensure the start is not 0
    if ref_start == 0:
        ref = self._fetch_bounded_seq(var, 0, 1, 0, boundary)
        alt = alt + ref
        edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)
        ref_start = 1
        ref_end = 1

    # ensure the end is not outside of reference sequence
    tgt_len = self._get_tgt_length(var)
    if ref_end == tgt_len + 1:
        ref = self._fetch_bounded_seq(var, tgt_len - 1, tgt_len, 0, boundary)
        alt = ref + alt
        edit = hgvs.edit.NARefAlt(ref=ref, alt=alt)
        ref_start = tgt_len
        ref_end = tgt_len

    var_norm = copy.deepcopy(var)
    var_norm.posedit.edit = edit
    var_norm.posedit.pos.start.base = ref_start
    var_norm.posedit.pos.end.base = ref_end

    if var_type == "c":
        var_norm = self.vm.n_to_c(var_norm)

    return var_norm