Esempio n. 1
0
def test_mitochondrial_MTND5_translation_from_cdna():
    # Translate the coding sequence of MT-ND5-201 with the mitochondrial
    # flag set; the result must end in a stop codon and match the
    # transcript's own protein_sequence.
    transcript = ensembl_grch38.transcripts_by_name("MT-ND5-201")[0]
    translation = translate_cdna(
        transcript.coding_sequence,
        first_codon_is_start=True,
        mitochondrial=True)
    protein, has_stop = translation
    assert has_stop
    eq_(protein, transcript.protein_sequence)
def test_sequence_key_with_reading_frame_deletion_with_five_prime_utr():
    # Delete the second codon of TP53-001 with enough context to reach into
    # the 5' UTR. TP53 is on the negative strand, so the variant is written
    # as the reverse complement of the codon: 'CTC'>''.
    deletion = Variant("17", 7676589, "CTC", "", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # TP53-001 around its second codon with 10 context nucleotides
    # (t.sequence[193-10:193+13] gave 'CACTGCCATGGAGGAGCCGCAGT'):
    #   CACTGCC  last 7 nt of the 5' UTR
    #   ATG      start codon (translates to M)
    #   GAG      2nd codon    <---- variant occurs here
    #   GAG      3rd codon
    #   CCG      4th codon
    #   CAG      5th codon
    #   T        first nt of the 6th codon
    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="CACTGCCATG",
        sequence_at_variant_locus="GAG",
        sequence_after_variant_locus="GAGCCGCAGT",
        offset_to_first_complete_codon=7,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=True,
        amino_acids_before_variant="M")

    actual = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=deletion, transcript=transcript, context_size=10)
    eq_(actual, ReferenceCodingSequenceKey(**expected_fields))
def test_sequence_key_for_variant_on_transcript_insertion_reverse_strand():
    # Insert 'CCC' after the start codon of TP53-001; on the reverse
    # complement that means inserting "GGG" between "CTC" and "CAT".
    insertion = Variant("17", 7676589, "CTC", "CTCGGG", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # cDNA of TP53-001 around the start codon with 10 context nucleotides:
    #    GGTCACTGCC_ATG_GAGGAGCCGC
    # whereas the reverse complement genomic sequence is:
    #    GCGGCTCCTC_CAT_GGCAGTGACC
    start_codon_offset = 190
    window = transcript.sequence[
        start_codon_offset - 10:start_codon_offset + 13]
    eq_(window, "GGTCACTGCCATGGAGGAGCCGC")

    # Request 10 nucleotides of context around the insertion point.
    actual_key = ReferenceSequenceKey.from_variant_and_transcript(
        variant=insertion, transcript=transcript, context_size=10)

    # The expected key has an empty sequence at the variant locus.
    eq_(
        actual_key,
        ReferenceSequenceKey(
            strand="-",
            sequence_before_variant_locus="CACTGCCATG",
            sequence_at_variant_locus="",
            sequence_after_variant_locus="GAGGAGCCGC"))
def test_sequence_key_for_variant_on_transcript_insertion_reverse_strand():
    # 'CCC' is inserted right after TP53-001's start codon. Expressed on the
    # reverse complement, "GGG" goes in between "CTC" and "CAT".
    tp53 = ensembl_grch38
    inserted_ggg = Variant("17", 7676589, "CTC", "CTCGGG", tp53)
    tp53_transcript = tp53.transcripts_by_name("TP53-001")[0]

    # Transcript (cDNA) sequence around the start codon, 10 nt of context:
    #    GGTCACTGCC_ATG_GAGGAGCCGC
    # Reverse complement genomic sequence of the same region:
    #    GCGGCTCCTC_CAT_GGCAGTGACC
    around_start_codon = tp53_transcript.sequence[190 - 10:190 + 13]
    eq_(around_start_codon, "GGTCACTGCCATGGAGGAGCCGC")

    # Build the key with a context size of 10.
    observed = ReferenceSequenceKey.from_variant_and_transcript(
        variant=inserted_ggg,
        transcript=tp53_transcript,
        context_size=10)

    prefix = "CACTGCCATG"
    suffix = "GAGGAGCCGC"
    wanted = ReferenceSequenceKey(
        strand="-",
        sequence_before_variant_locus=prefix,
        sequence_at_variant_locus="",
        sequence_after_variant_locus=suffix)
    eq_(observed, wanted)
def test_sequence_key_with_reading_frame_insertion_context_3nt_no_start():
    # Insert nucleotide "T" after the second codon of TP53-001, keeping only
    # enough context to see the second codon and no nucleotides from the
    # start codon. In the reverse complement this variant becomes CTC>CTCA.
    insertion = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # TP53-001 around the boundary of its 2nd/3rd codons with 3 context
    # nucleotides:
    #   GAG  2nd codon (translates to E)
    #        <---- insertion variant occurs between these two codons
    #   GAG  3rd codon
    actual = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion, transcript=transcript, context_size=3)

    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="GAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAG",
        offset_to_first_complete_codon=0,
        contains_start_codon=False,
        overlaps_start_codon=False,
        contains_five_prime_utr=False,
        amino_acids_before_variant="E")
    eq_(actual, ReferenceCodingSequenceKey(**expected_fields))
def test_sequence_key_with_reading_frame_insertion():
    # Insert nucleotide "T" after the second codon of TP53-001 with enough
    # context to include nucleotides from the 5' UTR. TP53 is on the negative
    # strand, so the variant is given as its reverse complement 'CTC'>'CTCA'.
    insertion = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # TP53-001 around the boundary of its 2nd/3rd codons with 10 context
    # nucleotides:
    #   TGCC  last 4 nt of the 5' UTR
    #   ATG   start codon (translates to M)
    #   GAG   2nd codon (translates to E)
    #         <---- insertion variant occurs between these two codons
    #   GAG   3rd codon
    #   CCG   4th codon
    #   CAG   5th codon
    #   T     first nt of the 6th codon
    actual = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion, transcript=transcript, context_size=10)

    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="TGCCATGGAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAGCCGCAGT",
        offset_to_first_complete_codon=4,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=True,
        amino_acids_before_variant="ME")
    eq_(actual, ReferenceCodingSequenceKey(**expected_fields))
def test_sequence_key_with_reading_frame_deletion_with_five_prime_utr():
    # The second codon of TP53-001 is deleted and the surrounding context
    # extends into the 5' UTR. Because TP53 lies on the negative strand the
    # variant is the reverse complement of that codon, i.e. 'CTC'>''.
    genome = ensembl_grch38
    codon_deletion = Variant("17", 7676589, "CTC", "", genome)
    tp53_transcript = genome.transcripts_by_name("TP53-001")[0]

    # Layout of 'CACTGCCATGGAGGAGCCGCAGT', which is what
    # t.sequence[193-10:193+13] returned for this transcript:
    #   last 7 nt of 5' UTR:   CACTGCC
    #   start codon:           ATG (translates to M)
    #   2nd codon:             GAG    <---- variant occurs here
    #   3rd codon:             GAG
    #   4th codon:             CCG
    #   5th codon:             CAG
    #   first nt of 6th codon: T
    observed = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=codon_deletion,
        transcript=tp53_transcript,
        context_size=10)

    before_locus = "CACTGCCATG"
    at_locus = "GAG"
    after_locus = "GAGCCGCAGT"
    wanted = ReferenceCodingSequenceKey(
        strand="-",
        sequence_before_variant_locus=before_locus,
        sequence_at_variant_locus=at_locus,
        sequence_after_variant_locus=after_locus,
        offset_to_first_complete_codon=7,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=True,
        amino_acids_before_variant="M")
    eq_(observed, wanted)
def test_sequence_key_with_reading_frame_insertion_context_5nt_overlaps_start():
    # Insert nucleotide "T" after the second codon of TP53-001, keeping only
    # enough context to see part of the start codon. The expected key
    # therefore does not "contain" the start codon but does "overlap" it.
    # In the reverse complement this variant becomes CTC>CTCA.
    insertion = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # TP53-001 around the boundary of its 2nd/3rd codons with 5 context
    # nucleotides:
    #   TG   last two nt of the start codon
    #   GAG  2nd codon (translates to E)
    #        <---- insertion variant occurs between these two codons
    #   GAG  3rd codon
    #   CC   first two nt of the 4th codon
    actual = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion, transcript=transcript, context_size=5)

    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="TGGAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAGCC",
        offset_to_first_complete_codon=2,
        contains_start_codon=False,
        overlaps_start_codon=True,
        contains_five_prime_utr=False,
        amino_acids_before_variant="E")
    eq_(actual, ReferenceCodingSequenceKey(**expected_fields))
def test_sequence_key_with_reading_frame_insertion_context_5nt_overlaps_start():
    # A "T" is inserted after TP53-001's second codon while the context is
    # too short to cover the whole start codon: the key is expected to
    # "overlap" the start codon without "containing" it. On the reverse
    # complement the variant reads CTC>CTCA.
    genome = ensembl_grch38
    inserted_nt = Variant("17", 7676586, "CTC", "CTCA", genome)
    tp53_transcript = genome.transcripts_by_name("TP53-001")[0]

    # With 5 context nucleotides around the 2nd/3rd codon boundary:
    #   last two nt of start codon:  TG
    #   2nd codon:                   GAG (translates to E)
    #   <---- insertion variant occurs between these two codons
    #   3rd codon:                   GAG
    #   first two nt of 4th codon:   CC
    observed = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=inserted_nt,
        transcript=tp53_transcript,
        context_size=5)

    before_locus = "TGGAG"
    after_locus = "GAGCC"
    wanted = ReferenceCodingSequenceKey(
        strand="-",
        sequence_before_variant_locus=before_locus,
        sequence_at_variant_locus="",
        sequence_after_variant_locus=after_locus,
        offset_to_first_complete_codon=2,
        contains_start_codon=False,
        overlaps_start_codon=True,
        contains_five_prime_utr=False,
        amino_acids_before_variant="E")
    eq_(observed, wanted)
def test_sequence_key_with_reading_frame_insertion():
    # A "T" is inserted after TP53-001's second codon and the context is
    # long enough to reach the 5' UTR. TP53 is on the negative strand, hence
    # the variant is supplied as the reverse complement 'CTC'>'CTCA'.
    genome = ensembl_grch38
    inserted_nt = Variant("17", 7676586, "CTC", "CTCA", genome)
    tp53_transcript = genome.transcripts_by_name("TP53-001")[0]

    # With 10 context nucleotides around the 2nd/3rd codon boundary:
    #   last 4 nt of 5' UTR:   TGCC
    #   start codon:           ATG (translates to M)
    #   2nd codon:             GAG (translates to E)
    #   <---- insertion variant occurs between these two codons
    #   3rd codon:             GAG
    #   4th codon:             CCG
    #   5th codon:             CAG
    #   first nt of 6th codon: T
    observed = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=inserted_nt,
        transcript=tp53_transcript,
        context_size=10)

    before_locus = "TGCCATGGAG"
    after_locus = "GAGCCGCAGT"
    wanted = ReferenceCodingSequenceKey(
        strand="-",
        sequence_before_variant_locus=before_locus,
        sequence_at_variant_locus="",
        sequence_after_variant_locus=after_locus,
        offset_to_first_complete_codon=4,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=True,
        amino_acids_before_variant="ME")
    eq_(observed, wanted)
def test_sequence_key_with_reading_frame_insertion_context_3nt_no_start():
    # A "T" is inserted after TP53-001's second codon, with a context so
    # short that only the second codon is visible and nothing of the start
    # codon. On the reverse complement the variant reads CTC>CTCA.
    genome = ensembl_grch38
    inserted_nt = Variant("17", 7676586, "CTC", "CTCA", genome)
    tp53_transcript = genome.transcripts_by_name("TP53-001")[0]

    # With 3 context nucleotides around the 2nd/3rd codon boundary:
    #   2nd codon: GAG (translates to E)
    #   <---- insertion variant occurs between these two codons
    #   3rd codon: GAG
    observed = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=inserted_nt,
        transcript=tp53_transcript,
        context_size=3)

    codon = "GAG"
    wanted = ReferenceCodingSequenceKey(
        strand="-",
        sequence_before_variant_locus=codon,
        sequence_at_variant_locus="",
        sequence_after_variant_locus=codon,
        offset_to_first_complete_codon=0,
        contains_start_codon=False,
        overlaps_start_codon=False,
        contains_five_prime_utr=False,
        amino_acids_before_variant="E")
    eq_(observed, wanted)
Esempio n. 12
0
def test_TP53_translation_from_cdna():
    # Translate the coding sequence of TP53-001; the result must end in a
    # stop codon and match the transcript's own protein_sequence.
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]
    translation = translate_cdna(
        transcript.coding_sequence,
        first_codon_is_start=True)
    protein, has_stop = translation
    assert has_stop
    eq_(protein, transcript.protein_sequence)
Esempio n. 13
0
def test_interbase_range_for_brca2_utr_insertion():
    # T>TC insertion after the 6th nucleotide of BRCA2-001's 5' UTR; the
    # affected interbase range on the transcript is expected to be (6, 6).
    insertion = Variant("13", 32315479, "T", "TC", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("BRCA2-001")[0]
    affected_range = interbase_range_affected_by_variant_on_transcript(
        variant=insertion,
        transcript=transcript)
    print(affected_range)
    eq_(affected_range, (6, 6))
Esempio n. 14
0
def test_mitochondrial_MTND5_translation_from_cdna():
    # MT-ND5-201's coding sequence, translated with mitochondrial=True,
    # is expected to end with a stop codon and to equal the transcript's
    # protein_sequence.
    mt_nd5 = ensembl_grch38.transcripts_by_name("MT-ND5-201")[0]
    coding_sequence = mt_nd5.coding_sequence
    translated, stops = translate_cdna(
        coding_sequence, first_codon_is_start=True, mitochondrial=True)
    assert stops
    eq_(translated, mt_nd5.protein_sequence)
Esempio n. 15
0
def test_interbase_range_for_brca2_utr_insertion():
    # An insertion (T>TC) placed after the 6th nucleotide of the 5' UTR of
    # BRCA2-001 should map to the interbase range (6, 6).
    genome = ensembl_grch38
    utr_insertion = Variant("13", 32315479, "T", "TC", genome)
    brca2_transcript = genome.transcripts_by_name("BRCA2-001")[0]
    observed = interbase_range_affected_by_variant_on_transcript(
        variant=utr_insertion, transcript=brca2_transcript)
    print(observed)
    wanted = (6, 6)
    eq_(observed, wanted)
def test_sequence_key_with_reading_frame_insertion_before_start_codon():
    # Insert nucleotide "T" before the start codon of TP53-001; no coding
    # sequence key is expected for such a variant.
    insertion = Variant("17", 7676593, "C", "CT", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=1)
    assert key is None, \
        "Expected result to be None when variant before start codon"
def test_sequence_key_with_reading_frame_insertion_before_start_codon():
    # A "T" inserted before TP53-001's start codon should produce None
    # instead of a ReferenceCodingSequenceKey.
    genome = ensembl_grch38
    upstream_insertion = Variant("17", 7676593, "C", "CT", genome)
    tp53_transcript = genome.transcripts_by_name("TP53-001")[0]

    observed = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=upstream_insertion, transcript=tp53_transcript, context_size=1)
    failure_message = (
        "Expected result to be None when variant before start codon")
    assert observed is None, failure_message
Esempio n. 18
0
def test_interbase_range_for_brca2_utr_substitution():
    # rs769125639 is a simple T>A substitution in the 6th nucleotide of
    # BRCA2-001's 5' UTR; the affected interbase range is expected to be
    # (5, 6).
    substitution = Variant("13", 32315479, "T", "A", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("BRCA2-001")[0]
    affected_range = interbase_range_affected_by_variant_on_transcript(
        variant=substitution,
        transcript=transcript)
    print(affected_range)
    eq_(affected_range, (5, 6))
def test_reference_coding_sequence_key_insertion_inside_start_codon():
    # Insert nucleotide "C" in the middle of the start codon of TP53-001,
    # keeping only 1 nucleotide of context. In the reverse complement this
    # becomes 'T'>'TG'. No coding sequence key is expected.
    insertion = Variant("17", 7676592, "T", "TG", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=1)
    assert key is None, \
        "Expected result to be None when variant affects start codon"
Esempio n. 20
0
def test_interbase_range_for_brca2_utr_substitution():
    # The T>A substitution rs769125639 hits the 6th nucleotide of the 5' UTR
    # of BRCA2-001 and should map to the interbase range (5, 6).
    genome = ensembl_grch38
    rs769125639 = Variant("13", 32315479, "T", "A", genome)
    brca2_transcript = genome.transcripts_by_name("BRCA2-001")[0]
    observed = interbase_range_affected_by_variant_on_transcript(
        variant=rs769125639, transcript=brca2_transcript)
    print(observed)
    wanted = (5, 6)
    eq_(observed, wanted)
def test_reference_coding_sequence_key_insertion_inside_start_codon():
    # A "C" is inserted in the middle of TP53-001's start codon (written on
    # the reverse complement as 'T'>'TG') with a single nucleotide of
    # context; the result should be None.
    genome = ensembl_grch38
    start_codon_insertion = Variant("17", 7676592, "T", "TG", genome)
    tp53_transcript = genome.transcripts_by_name("TP53-001")[0]

    observed = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=start_codon_insertion,
        transcript=tp53_transcript,
        context_size=1)
    failure_message = (
        "Expected result to be None when variant affects start codon")
    assert observed is None, failure_message
Esempio n. 22
0
def test_sequence_key_for_variant_on_transcript_deletion():
    # Delete the 6th nucleotide of BRCA2-001's 5' UTR
    deletion = Variant("13", 32315479, "T", "", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("BRCA2-001")[0]

    # Check the first 50 nucleotides of BRCA2-001 before slicing the
    # expected values out of them.
    prefix_50 = transcript.sequence[:50]
    eq_(prefix_50, "GGGCTTGTGGCGCGAGCTTCTGAAACTAGGCGGCAGAGGCGGAGCCGCTG")
    print(prefix_50)

    # A context of 10 is requested, but the expected key holds only the 5
    # nucleotides before the deleted one, plus the 10 after it.
    actual_key = ReferenceSequenceKey.from_variant_and_transcript(
        variant=deletion,
        transcript=transcript,
        context_size=10)
    eq_(
        actual_key,
        ReferenceSequenceKey(
            strand="+",
            sequence_before_variant_locus=prefix_50[:5],
            sequence_at_variant_locus="T",
            sequence_after_variant_locus=prefix_50[6:16]))
Esempio n. 23
0
def test_sequence_key_for_variant_on_transcript_deletion_reverse_strand():
    # Delete the start codon of TP53-001, which in reverse complement means
    # deleting the sequence "CAT".
    deletion = Variant("17", 7676592, "CAT", "", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # TP53-001 around the start codon with 10 context nucleotides:
    #    GGTCACTGCC_ATG_GAGGAGCCGC
    start_codon_offset = 190
    window = transcript.sequence[
        start_codon_offset - 10:start_codon_offset + 13]
    eq_(window, "GGTCACTGCCATGGAGGAGCCGC")

    # Request 10 nucleotides of context on each side of the deleted codon.
    actual_key = ReferenceSequenceKey.from_variant_and_transcript(
        variant=deletion,
        transcript=transcript,
        context_size=10)
    eq_(
        actual_key,
        ReferenceSequenceKey(
            strand="-",
            sequence_before_variant_locus="GGTCACTGCC",
            sequence_at_variant_locus="ATG",
            sequence_after_variant_locus="GAGGAGCCGC"))
Esempio n. 24
0
def test_sequence_key_for_variant_on_transcript_substitution_reverse_strand():
    # Replace the start codon of TP53-001 with 'CCC'; since this is on the
    # reverse strand the variant becomes "CAT">"GGG".
    substitution = Variant("17", 7676592, "CAT", "GGG", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # TP53-001 around the start codon with 10 context nucleotides:
    #    GGTCACTGCC_ATG_GAGGAGCCGC
    start_codon_offset = 190
    window = transcript.sequence[
        start_codon_offset - 10:start_codon_offset + 13]
    eq_(window, "GGTCACTGCCATGGAGGAGCCGC")

    # Request 10 nucleotides of context on each side of the replaced codon.
    actual_key = ReferenceSequenceKey.from_variant_and_transcript(
        variant=substitution,
        transcript=transcript,
        context_size=10)
    eq_(
        actual_key,
        ReferenceSequenceKey(
            strand="-",
            sequence_before_variant_locus="GGTCACTGCC",
            sequence_at_variant_locus="ATG",
            sequence_after_variant_locus="GAGGAGCCGC"))
def test_sequence_key_for_variant_on_transcript_deletion():
    # The 6th nucleotide of BRCA2-001's 5' UTR is deleted.
    genome = ensembl_grch38
    utr_deletion = Variant("13", 32315479, "T", "", genome)
    brca2_transcript = genome.transcripts_by_name("BRCA2-001")[0]

    # The expected key is sliced out of the transcript's first 50
    # characters, so verify those first.
    leading_sequence = brca2_transcript.sequence[:50]
    eq_(leading_sequence, "GGGCTTGTGGCGCGAGCTTCTGAAACTAGGCGGCAGAGGCGGAGCCGCTG")
    print(leading_sequence)

    # Context size is 10; the expected prefix is the 5 nucleotides ahead of
    # the deleted "T" and the expected suffix is the 10 that follow it.
    observed = ReferenceSequenceKey.from_variant_and_transcript(
        variant=utr_deletion, transcript=brca2_transcript, context_size=10)

    prefix = leading_sequence[:5]
    suffix = leading_sequence[6:16]
    wanted = ReferenceSequenceKey(
        strand="+",
        sequence_before_variant_locus=prefix,
        sequence_at_variant_locus="T",
        sequence_after_variant_locus=suffix)
    eq_(observed, wanted)
def test_sequence_key_for_variant_on_transcript_substitution_reverse_strand():
    # TP53-001's start codon is replaced with 'CCC'. The gene is on the
    # reverse strand, so the variant is expressed as "CAT">"GGG".
    genome = ensembl_grch38
    codon_substitution = Variant("17", 7676592, "CAT", "GGG", genome)
    tp53_transcript = genome.transcripts_by_name("TP53-001")[0]

    # Transcript sequence around the start codon, 10 nt of context:
    #    GGTCACTGCC_ATG_GAGGAGCCGC
    around_start_codon = tp53_transcript.sequence[190 - 10:190 + 13]
    eq_(around_start_codon, "GGTCACTGCCATGGAGGAGCCGC")

    # Build the key with a context size of 10.
    observed = ReferenceSequenceKey.from_variant_and_transcript(
        variant=codon_substitution,
        transcript=tp53_transcript,
        context_size=10)

    prefix = "GGTCACTGCC"
    start_codon = "ATG"
    suffix = "GAGGAGCCGC"
    wanted = ReferenceSequenceKey(
        strand="-",
        sequence_before_variant_locus=prefix,
        sequence_at_variant_locus=start_codon,
        sequence_after_variant_locus=suffix)
    eq_(observed, wanted)
def test_sequence_key_for_variant_on_transcript_deletion_reverse_strand():
    # TP53-001's start codon is deleted; on the reverse complement that is
    # a deletion of the sequence "CAT".
    genome = ensembl_grch38
    codon_deletion = Variant("17", 7676592, "CAT", "", genome)
    tp53_transcript = genome.transcripts_by_name("TP53-001")[0]

    # Transcript sequence around the start codon, 10 nt of context:
    #    GGTCACTGCC_ATG_GAGGAGCCGC
    around_start_codon = tp53_transcript.sequence[190 - 10:190 + 13]
    eq_(around_start_codon, "GGTCACTGCCATGGAGGAGCCGC")

    # Build the key with a context size of 10.
    observed = ReferenceSequenceKey.from_variant_and_transcript(
        variant=codon_deletion,
        transcript=tp53_transcript,
        context_size=10)

    prefix = "GGTCACTGCC"
    start_codon = "ATG"
    suffix = "GAGGAGCCGC"
    wanted = ReferenceSequenceKey(
        strand="-",
        sequence_before_variant_locus=prefix,
        sequence_at_variant_locus=start_codon,
        sequence_after_variant_locus=suffix)
    eq_(observed, wanted)
Esempio n. 28
0
def test_TP53_translation_from_cdna():
    # TP53-001's coding sequence is expected to translate to the
    # transcript's protein_sequence and to end with a stop codon.
    tp53_transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]
    coding_sequence = tp53_transcript.coding_sequence
    translated, stops = translate_cdna(
        coding_sequence,
        first_codon_is_start=True)
    assert stops
    eq_(translated, tp53_transcript.protein_sequence)
Esempio n. 29
0
def test_sequence_key_with_reading_frame_substitution_on_negative_strand():
    # Replace the second codon of TP53-001 with 'CCC' (given on the genomic
    # strand as 'CTC'>'GGG').
    substitution = Variant("17", 7676589, "CTC", "GGG", ensembl_grch38)
    variants = VariantCollection([substitution])

    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # TP53-001 around its second codon with 10 context nucleotides
    # (t.sequence[193-10:193+13] gave 'CACTGCCATGGAGGAGCCGCAGT'):
    #   CACTGCC  last 7 nt of the 5' UTR
    #   ATG      start codon (translates to M)
    #   GAG      2nd codon    <---- variant occurs here
    #   GAG      3rd codon
    #   CCG      4th codon
    #   CAG      5th codon
    #   T        first nt of the 6th codon

    # First call without a transcript ID whitelist to check that multiple
    # contexts come back for the single variant.
    contexts_by_variant = reference_contexts_for_variants(
        variants=variants,
        context_size=10,
        transcript_id_whitelist=None)

    num_variants = len(contexts_by_variant)
    assert num_variants == 1, \
        "Dictionary should have only one variant but got %d keys" % (
            num_variants,)

    all_contexts = contexts_by_variant[substitution]
    num_contexts = len(all_contexts)
    assert num_contexts > 1, \
        "Expected multiple reference contexts for %s but got %d: %s" % (
            substitution,
            num_contexts,
            all_contexts)

    # Second call, restricted to TP53-001 through the whitelist.
    whitelisted_contexts_by_variant = reference_contexts_for_variants(
        variants=variants,
        context_size=10,
        transcript_id_whitelist={transcript.id})

    # still only expect one variant key
    eq_(len(whitelisted_contexts_by_variant), 1)

    whitelisted_contexts = whitelisted_contexts_by_variant[substitution]

    # with the whitelist limited to one transcript ID, only a single
    # reference context is expected
    eq_(len(whitelisted_contexts), 1)

    actual = whitelisted_contexts[0]

    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="CACTGCCATG",
        sequence_at_variant_locus="GAG",
        sequence_after_variant_locus="GAGCCGCAGT",
        offset_to_first_complete_codon=7,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=True,
        amino_acids_before_variant="M",
        variant=substitution,
        transcripts=[transcript])
    eq_(actual, ReferenceContext(**expected_fields))
Esempio n. 30
0
def test_sequence_key_with_reading_frame_substitution_on_negative_strand():
    # TP53-001's second codon is replaced with 'CCC'; the variant itself is
    # written as 'CTC'>'GGG'.
    genome = ensembl_grch38
    codon_substitution = Variant("17", 7676589, "CTC", "GGG", genome)
    collection = VariantCollection([codon_substitution])

    tp53_transcript = genome.transcripts_by_name("TP53-001")[0]

    # Layout of 'CACTGCCATGGAGGAGCCGCAGT', which is what
    # t.sequence[193-10:193+13] returned for this transcript:
    #   last 7 nt of 5' UTR:   CACTGCC
    #   start codon:           ATG (translates to M)
    #   2nd codon:             GAG    <---- variant occurs here
    #   3rd codon:             GAG
    #   4th codon:             CCG
    #   5th codon:             CAG
    #   first nt of 6th codon: T

    # Without a transcript ID whitelist we expect one dictionary entry
    # holding several reference contexts.
    unrestricted = reference_contexts_for_variants(
        variants=collection,
        context_size=10,
        transcript_id_whitelist=None)

    key_count = len(unrestricted)
    assert key_count == 1, \
        "Dictionary should have only one variant but got %d keys" % (
            key_count,)

    unrestricted_contexts = unrestricted[codon_substitution]
    context_count = len(unrestricted_contexts)
    assert context_count > 1, \
        "Expected multiple reference contexts for %s but got %d: %s" % (
            codon_substitution, context_count, unrestricted_contexts)

    # With the whitelist narrowed to TP53-001's ID we expect one dictionary
    # entry holding exactly one reference context.
    only_tp53_001 = {tp53_transcript.id}
    restricted = reference_contexts_for_variants(
        variants=collection,
        context_size=10,
        transcript_id_whitelist=only_tp53_001)
    eq_(len(restricted), 1)

    restricted_contexts = restricted[codon_substitution]
    eq_(len(restricted_contexts), 1)

    observed = restricted_contexts[0]

    before_locus = "CACTGCCATG"
    at_locus = "GAG"
    after_locus = "GAGCCGCAGT"
    wanted = ReferenceContext(
        strand="-",
        sequence_before_variant_locus=before_locus,
        sequence_at_variant_locus=at_locus,
        sequence_after_variant_locus=after_locus,
        offset_to_first_complete_codon=7,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=True,
        amino_acids_before_variant="M",
        variant=codon_substitution,
        transcripts=[tp53_transcript])
    eq_(observed, wanted)
Esempio n. 31
0
def test_protein_protein_sequence():
    # EGFR-001's protein_sequence must equal the EGFR_001_protein_sequence
    # constant.
    egfr_001 = ensembl_grch38.transcripts_by_name("EGFR-001")[0]
    actual_sequence = egfr_001.protein_sequence
    eq_(actual_sequence, EGFR_001_protein_sequence)
Esempio n. 32
0
def test_protein_id():
    # EGFR-001 must report ENSP00000275493 as its protein ID.
    egfr_001 = ensembl_grch38.transcripts_by_name("EGFR-001")[0]
    actual_id = egfr_001.protein_id
    eq_(actual_id, "ENSP00000275493")
def test_protein_id():
    # The protein ID of the first transcript named EGFR-001 is expected to
    # be ENSP00000275493.
    matches = ensembl_grch38.transcripts_by_name("EGFR-001")
    first_match = matches[0]
    eq_(first_match.protein_id, "ENSP00000275493")
def test_protein_protein_sequence():
    # The protein sequence of the first transcript named EGFR-001 is
    # expected to equal EGFR_001_protein_sequence.
    matches = ensembl_grch38.transcripts_by_name("EGFR-001")
    first_match = matches[0]
    eq_(first_match.protein_sequence, EGFR_001_protein_sequence)