def test_reference_sequence_key_hash_and_equality_different_objects():
    # Two keys built from identical fields except for the strand must
    # compare unequal and must also differ in their str, repr and hash.
    common = dict(
        sequence_before_variant_locus="GAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAG",
        offset_to_first_complete_codon=0,
        contains_start_codon=False,
        overlaps_start_codon=False,
        contains_five_prime_utr=False,
        amino_acids_before_variant="E")
    minus_key = ReferenceCodingSequenceKey(strand="-", **common)
    plus_key = ReferenceCodingSequenceKey(strand="+", **common)

    assert minus_key != plus_key
    assert str(minus_key) != str(plus_key)
    assert repr(minus_key) != repr(plus_key)
    assert hash(minus_key) != hash(plus_key)
def test_sequence_key_with_reading_frame_deletion_with_five_prime_utr():
    # Deletion of the second codon of TP53-001 where the requested context
    # reaches back into the 5' UTR. TP53 is on the negative strand, so the
    # variant is expressed as the reverse complement: 'CTC'>''.
    deletion = Variant("17", 7676589, "CTC", "", ensembl_grch38)
    matches = ensembl_grch38.transcripts_by_name("TP53-001")
    transcript = matches[0]

    # Transcript sequence around the second codon with 10 context
    # nucleotides (t.sequence[193-10:193+13]) is
    # 'CACTGCCATGGAGGAGCCGCAGT', which breaks down as:
    #   CACTGCC  last 7 nt of the 5' UTR
    #   ATG      start codon (translates to M)
    #   GAG      2nd codon   <---- variant occurs here
    #   GAG      3rd codon
    #   CCG      4th codon
    #   CAG      5th codon
    #   T        first nt of the 6th codon
    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="CACTGCCATG",
        sequence_at_variant_locus="GAG",
        sequence_after_variant_locus="GAGCCGCAGT",
        offset_to_first_complete_codon=7,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=True,
        amino_acids_before_variant="M")

    result = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=deletion,
        transcript=transcript,
        context_size=10)
    expected = ReferenceCodingSequenceKey(**expected_fields)
    eq_(result, expected)
def test_reference_coding_sequence_key_around_TP53_201_variant():
    # TP53-201 is an isoform of TP53 which seems to lack untranslated
    # regions, so its sequence starts directly with the coding sequence:
    #   First exon: chr17 7,676,594 - 7,676,521
    #   ATG|GAG|GAG|CCG|CAG|TCA|GAT...
    #   -M-|-E-|-E-|-P-|-Q-|-S-|-D-
    #
    # The assumed variant is chr17 7,676,591 C>T, which changes the
    # second codon GAG (E) into AAG (K).
    snv = Variant("chr17", 7676591, "C", "T", "GRCh38")
    tp53_201 = snv.ensembl.transcripts_by_name("TP53-201")[0]

    # Sanity-check the predicted protein effect before looking at the key.
    effect = snv.effect_on_transcript(tp53_201)
    eq_(effect.__class__.__name__, "Substitution")
    for attribute, amino_acid in (("aa_ref", "E"), ("aa_alt", "K")):
        eq_(getattr(effect, attribute), amino_acid)

    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="ATG",
        sequence_at_variant_locus="G",
        sequence_after_variant_locus="AGG",
        offset_to_first_complete_codon=0,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=False,
        amino_acids_before_variant="M")
    expected = ReferenceCodingSequenceKey(**expected_fields)
    actual = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=snv,
        transcript=tp53_201,
        context_size=3)
    eq_(expected, actual)
def test_sequence_key_with_reading_frame_insertion_context_3nt_no_start():
    # Insert nucleotide "T" after the second codon of TP53-001, keeping only
    # enough context to see the second codon (and no nucleotides from the
    # start codon). In the reverse complement this variant becomes
    # CTC>CTCA.
    insertion = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    matches = ensembl_grch38.transcripts_by_name("TP53-001")
    transcript = matches[0]

    # Sequence of TP53 around the boundary of the 2nd/3rd codons with
    # 3 context nucleotides on each side:
    #   GAG  2nd codon (translates to E)
    #        <---- insertion variant occurs between these two codons
    #   GAG  3rd codon
    result = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=3)

    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="GAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAG",
        offset_to_first_complete_codon=0,
        contains_start_codon=False,
        overlaps_start_codon=False,
        contains_five_prime_utr=False,
        amino_acids_before_variant="E")
    expected = ReferenceCodingSequenceKey(**expected_fields)
    eq_(result, expected)
def test_sequence_key_with_reading_frame_insertion_context_5nt_overlaps_start(
):
    # Insert nucleotide "T" after the second codon of TP53-001, keeping only
    # enough context to see part of the start codon. The result therefore
    # should not "contain" the start codon but does "overlap" it. In the
    # reverse complement this variant becomes CTC>CTCA.
    insertion = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    matches = ensembl_grch38.transcripts_by_name("TP53-001")
    transcript = matches[0]

    # Sequence of TP53 around the boundary of the 2nd/3rd codons with
    # 5 context nucleotides on each side:
    #   TG   last two nt of the start codon
    #   GAG  2nd codon (translates to E)
    #        <---- insertion variant occurs between these two codons
    #   GAG  3rd codon
    #   CC   first two nt of the 4th codon
    result = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=5)

    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="TGGAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAGCC",
        offset_to_first_complete_codon=2,
        contains_start_codon=False,
        overlaps_start_codon=True,
        contains_five_prime_utr=False,
        amino_acids_before_variant="E")
    expected = ReferenceCodingSequenceKey(**expected_fields)
    eq_(result, expected)
def test_sequence_key_with_reading_frame_insertion():
    # Insert nucleotide "T" after the second codon of TP53-001 with enough
    # surrounding context to include nucleotides from the 5' UTR. Since TP53
    # is on the negative strand the variant is given as its reverse
    # complement, which turns it into 'CTC'>'CTCA'.
    insertion = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    matches = ensembl_grch38.transcripts_by_name("TP53-001")
    transcript = matches[0]

    # Sequence of TP53 around the boundary of the 2nd/3rd codons with
    # 10 context nucleotides:
    #   TGCC  last 4 nt of the 5' UTR
    #   ATG   start codon (translates to M)
    #   GAG   2nd codon (translates to E)
    #         <---- insertion variant occurs between these two codons
    #   GAG   3rd codon
    #   CCG   4th codon
    #   CAG   5th codon
    #   T     first nt of the 6th codon
    result = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=10)

    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="TGCCATGGAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAGCCGCAGT",
        offset_to_first_complete_codon=4,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=True,
        amino_acids_before_variant="ME")
    expected = ReferenceCodingSequenceKey(**expected_fields)
    eq_(result, expected)
def test_reference_sequence_key_hash_and_equality_same_objects():
    # Two separately constructed keys with identical field values must be
    # equal and must agree on their str, repr and hash.
    fields = dict(
        strand="-",
        sequence_before_variant_locus="GAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAG",
        offset_to_first_complete_codon=0,
        contains_start_codon=False,
        overlaps_start_codon=False,
        contains_five_prime_utr=False,
        amino_acids_before_variant="E")
    first_key = ReferenceCodingSequenceKey(**fields)
    second_key = ReferenceCodingSequenceKey(**fields)

    eq_(first_key, second_key)
    eq_(str(first_key), str(second_key))
    eq_(repr(first_key), repr(second_key))
    eq_(hash(first_key), hash(second_key))