def test_repr(self) -> None:
    """Check that ``repr()`` of a parsed position reproduces the input string."""
    round_trip_cases = (
        "8",
        "92380",
        "*8",
        "-80",
        "122-6",
        "78+10",
        "*89+67",
        "-127+6",
        "*73-105",
        "-45-1",
        "Cys234",
        "Ala9",
    )
    for text in round_trip_cases:
        # one sub-test per string so a failure names the offending input
        with self.subTest(s=text):
            parsed = VariantPosition(text)
            self.assertEqual(text, repr(parsed))
def _target_validate_substitution(pos: VariantPosition, ref: str, target: str) -> None: """Determine whether the target portion of a substitution matches the target sequence. Note that variants using extended syntax cannot be validated with this method. If an extended syntax variant is encountered, it will be interpreted as valid/matching. Parameters ---------- pos : VariantPosition Position of the substitution. ref : str Reference base or amino acid from the variant. target : str Target sequence. This must be an amino acid sequence for protein variants or a nucleotide sequence for coding/noncoding/genomic variants. RNA sequences should be in lowercase, DNA sequences should be in uppercase. Returns ------- None Raises ------ MaveHgvsParseError If the reference base or amino acid does not match the target at the given position MaveHgvsParseError If the position is outside the bounds of the target. """ if pos.is_extended(): return elif pos.position > len(target): raise MaveHgvsParseError("variant coordinate out of bounds") elif target[pos.position - 1] != ref: raise MaveHgvsParseError( "substitution reference does not match target") else: return
def test_non_adjacent_pairs(self) -> None:
    """Check that no ordered pair of the listed positions is reported adjacent."""
    # every position is parsed up front, before any sub-test starts
    parsed = list(
        map(
            VariantPosition,
            (
                "-45-1",
                "-12",
                "8",
                "99",
                "99+88",
                "99+122",
                "100-12",
                "103",
                "202-12",
                "202-1",
                "205",
                "*1",
                "*12",
                "*73-105",
            ),
        )
    )
    # permutations() yields both (a, b) and (b, a), so each direction is checked
    for first, second in itertools.permutations(parsed, 2):
        with self.subTest(v1=first, v2=second):
            self.assertFalse(first.is_adjacent(second))
def test_not_adjacent_to_self(self) -> None:
    """Check that a position is never reported as adjacent to itself."""
    # every position is parsed up front, before any sub-test starts
    parsed = list(
        map(
            VariantPosition,
            (
                "-45-1",
                "-12",
                "8",
                "99",
                "99+88",
                "99+122",
                "100-12",
                "100",
                "103",
                "202-12",
                "202-1",
                "205",
                "*1",
                "*12",
                "*73-105",
            ),
        )
    )
    for candidate in parsed:
        with self.subTest(v=candidate):
            self.assertFalse(candidate.is_adjacent(candidate))
def test_invalid_strings(self) -> None:
    """Check that each malformed position string raises MaveHgvsParseError."""
    malformed = (
        "08",
        "+12",
        "*-99",
        "A",
        "TCGA",
        "g",
        "*",
        "-",
        "+",
        "**6",
        "800 + 12",
        "-12*5",
        "Glu-12",
        "*5Trp",
        "Xyz12",
        "ALA12",
    )
    for text in malformed:
        # a single ``with`` enters subTest first, then assertRaises
        with self.subTest(s=text), self.assertRaises(MaveHgvsParseError):
            VariantPosition(text)
def setUp(self) -> None:
    """Build the position fixtures used by the tests in this class.

    Sets two attributes:

    * ``sorted_variants`` - the positions below, parsed in the order listed.
    * ``sorted_variant_pairs`` - each position paired with the one that
      follows it in ``sorted_variants``.
    """
    sorted_position_strings = (
        "-45-1",
        "-12",
        "8",
        "99",
        "99+88",
        "99+122",
        "100-12",
        "100",
        "101",
        "202-12",
        "202-1",
        "202",
        "*1",
        "*73-105",
    )
    self.sorted_variants = [VariantPosition(p) for p in sorted_position_strings]

    # Neighbouring pairs (n, n + 1). Stored as a list rather than a bare zip():
    # zip() is a single-pass iterator, so a test that looped over the pairs a
    # second time would silently iterate over nothing.
    self.sorted_variant_pairs = list(
        zip(self.sorted_variants, self.sorted_variants[1:])
    )
def test_adjacent_pairs(self) -> None:
    """Check that each listed pair is reported adjacent in both directions."""
    adjacent_pairs = (
        ("-45-2", "-45-1"),
        ("-45-1", "-45"),
        ("-12", "-13"),
        ("-1", "1"),
        ("8", "9"),
        ("202-1", "202"),
        ("99", "99+1"),
        ("99+88", "99+89"),
        ("100-12", "100-11"),
        ("100", "101"),
        ("*1", "*2"),
        ("*73-1", "*73"),
    )
    for left_text, right_text in adjacent_pairs:
        v1, v2 = VariantPosition(left_text), VariantPosition(right_text)
        # forward direction first, then the reverse; each in its own sub-test
        for caller, other in ((v1, v2), (v2, v1)):
            with self.subTest(v1=v1, v2=v2):
                self.assertTrue(caller.is_adjacent(other))
def _process_string_variant( self, match_dict: Dict[str, str], relaxed_ordering: bool ) -> Tuple[str, Optional[Union[VariantPosition, Tuple[ VariantPosition, VariantPosition]]], Optional[Union[str, Tuple[ str, str]]], ]: """Process the match dictionary from a single variant into its components. Parameters ---------- match_dict : Dict[str, str] Match dictionary from the MAVE-HGVS regular expression. relaxed_ordering : bool If True, variants that do not observe the 3-prime rule for variant position ordering are allowed. Returns ------- Tuple[str, Optional[Union[VariantPosition, Tuple[VariantPosition, VariantPosition]]], Optional[Union[str, Tuple[str, str]]]] Returns a 3-tuple containing the variant type, optional position (or start/end positions), and optional before/after substitution sequences or inserted sequence. """ variant_type = None positions = None sequences = None # determine which named groups to check if self._prefix == "p": pattern_group_tuples = [(f"pro_{t}", t) for t in self.VTYPES] elif self._prefix == "r": pattern_group_tuples = [(f"rna_{t}", t) for t in self.VTYPES] elif self._prefix in tuple("cn"): pattern_group_tuples = [(f"dna_{t}_{self._prefix}", t) for t in self.VTYPES] elif self._prefix in tuple("gmo"): pattern_group_tuples = [(f"dna_{t}_gmo", t) for t in self.VTYPES] else: # pragma: no cover raise ValueError("unexpected prefix") # set the variant type vtype_set = False pattern_group = None for pg, vtype in pattern_group_tuples: if match_dict[pg] is not None: if vtype_set: # pragma: no cover raise ValueError( f"ambiguous match: '{pg}' and '{pattern_group}'") variant_type = vtype pattern_group = pg vtype_set = True # set the position and sequence if variant_type == "sub": positions = VariantPosition( match_dict[f"{pattern_group}_position"]) if self._prefix == "p": sequences = (positions.amino_acid, match_dict[f"{pattern_group}_new"]) elif self._prefix in tuple("gmo" "cn" "r"): sequences = ( match_dict[f"{pattern_group}_ref"], 
match_dict[f"{pattern_group}_new"], ) else: # pragma: no cover raise ValueError("unexpected prefix") elif variant_type in ("del", "dup", "ins", "delins", "equal"): # set position if (match_dict.get(f"{pattern_group}_position") is not None ): # use get() since ins pattern doesn't have pos positions = VariantPosition( match_dict[f"{pattern_group}_position"]) elif (match_dict.get(f"{pattern_group}_start") is not None and match_dict.get(f"{pattern_group}_end") is not None): positions = ( VariantPosition(match_dict[f"{pattern_group}_start"]), VariantPosition(match_dict[f"{pattern_group}_end"]), ) # extra validation on positions if positions[0] >= positions[1]: if relaxed_ordering: positions = (positions[1], positions[0]) else: raise MaveHgvsParseError( "start position must be before end position") if variant_type == "ins": if not positions[0].is_adjacent(positions[1]): raise MaveHgvsParseError( "insertion positions must be adjacent") else: # pragma: no cover if variant_type != "equal": raise MaveHgvsParseError("variant position not found") # set sequence if needed if variant_type in ("ins", "delins"): sequences = match_dict[f"{pattern_group}_seq"] elif variant_type == "equal": if (match_dict[f"{pattern_group}_equal"] is not None): # special case for target identity sequences = match_dict[f"{pattern_group}_equal"] elif match_dict[f"pro_equal_equal_sy"] is not None: sequences = match_dict[f"pro_equal_equal_sy"] return variant_type, positions, sequences
def test_intron(self) -> None:
    """Check the parsed fields and classification flags of intronic positions."""
    # (input string, expected (position, amino_acid, intronic_position, utr))
    cases = (
        ("122-6", (122, None, -6, None)),
        ("78+10", (78, None, 10, None)),
    )
    for text, expected_fields in cases:
        v = VariantPosition(text)
        self.assertTupleEqual(
            (v.position, v.amino_acid, v.intronic_position, v.utr),
            expected_fields,
        )
        self.assertFalse(v.is_utr())
        self.assertTrue(v.is_intronic())
        self.assertFalse(v.is_protein())
        self.assertTrue(v.is_extended())
def test_position_only(self) -> None:
    """Check the parsed fields and classification flags of bare numeric positions."""
    # (input string, expected (position, amino_acid, intronic_position, utr))
    cases = (
        ("8", (8, None, None, None)),
        ("92380", (92380, None, None, None)),
    )
    for text, expected_fields in cases:
        v = VariantPosition(text)
        self.assertTupleEqual(
            (v.position, v.amino_acid, v.intronic_position, v.utr),
            expected_fields,
        )
        # a bare number sets none of the classification flags
        self.assertFalse(v.is_utr())
        self.assertFalse(v.is_intronic())
        self.assertFalse(v.is_protein())
        self.assertFalse(v.is_extended())
def test_amino_acid(self) -> None:
    """Check the parsed fields and classification flags of protein positions."""
    # (input string, expected (position, amino_acid, intronic_position, utr))
    cases = (
        ("Gly8", (8, "Gly", None, None)),
        ("Cys92380", (92380, "Cys", None, None)),
    )
    for text, expected_fields in cases:
        v = VariantPosition(text)
        self.assertTupleEqual(
            (v.position, v.amino_acid, v.intronic_position, v.utr),
            expected_fields,
        )
        self.assertFalse(v.is_utr())
        self.assertFalse(v.is_intronic())
        self.assertTrue(v.is_protein())
        self.assertFalse(v.is_extended())
def test_utr_intron(self) -> None:
    """Check positions that carry both a UTR marker and an intronic offset."""
    # (input string, expected (position, amino_acid, intronic_position, utr))
    cases = (
        ("*89+67", (89, None, 67, True)),
        ("-127+6", (-127, None, 6, True)),
        ("*73-105", (73, None, -105, True)),
        ("-45-1", (-45, None, -1, True)),
    )
    for text, expected_fields in cases:
        v = VariantPosition(text)
        self.assertTupleEqual(
            (v.position, v.amino_acid, v.intronic_position, v.utr),
            expected_fields,
        )
        self.assertTrue(v.is_utr())
        self.assertTrue(v.is_intronic())
        self.assertFalse(v.is_protein())
        self.assertTrue(v.is_extended())
def test_position(self) -> None:
    """Check that ``Variant.positions`` matches the expected position(s).

    Each entry pairs the expected value with the variant string to parse. The
    expected value is a single ``VariantPosition``, a ``(start, end)`` tuple,
    or - for a multi-variant - a list of either.
    """
    variant_tuples = [
        (VariantPosition("Glu27"), "p.Glu27Trp"),
        (VariantPosition("122-6"), "c.122-6T>A"),
        (VariantPosition("44"), "g.44del"),
        ((VariantPosition("78+5"), VariantPosition("78+10")), "c.78+5_78+10del"),
        (VariantPosition("77"), "c.77dup"),
        ((VariantPosition("Pro12"), VariantPosition("Gly18")), "p.Pro12_Gly18dup"),
        (
            (VariantPosition("Ala12"), VariantPosition("Pro13")),
            "p.Ala12_Pro13insGlyProCys",
        ),
        ((VariantPosition("22"), VariantPosition("23")), "r.22_23insauc"),
        (
            (VariantPosition("43-6"), VariantPosition("595+12")),
            "c.43-6_595+12delinsCTT",
        ),
        (
            (VariantPosition("Ile71"), VariantPosition("Cys80")),
            "p.Ile71_Cys80delinsSer",
        ),
    ]

    for p, s in variant_tuples:
        with self.subTest(p=p, s=s):
            v = Variant(s)
            # The three shapes are mutually exclusive, hence if/elif/else: a
            # list expectation must not fall through into the single-position
            # comparison after its elements have been checked.
            if isinstance(p, list):  # multi-variant
                self.assertEqual(len(p), len(v.positions))
                for q, vp in zip(p, v.positions):
                    if isinstance(q, tuple):
                        self.assertTupleEqual(q, vp)
                    else:
                        self.assertEqual(q, vp)
            elif isinstance(p, tuple):
                self.assertTupleEqual(p, v.positions)
            else:
                self.assertEqual(p, v.positions)