def test_reference_coding_sequence_key_around_TP53_201_variant():
    # TP53-201 is a TP53 isoform that appears to have no untranslated
    # regions, so its transcript starts directly with the coding sequence:
    #   First exon: chr17 7,676,594 - 7,676,521
    #   ATG|GAG|GAG|CCG|CAG|TCA|GAT...
    #   -M-|-E-|-E-|-P-|-Q-|-S-|-D-
    #
    # The variant under test is chr17:7,676,591 C>T, which turns the
    # second codon GAG (E) into AAG (K).
    snv = Variant("chr17", 7676591, "C", "T", "GRCh38")
    tp53_201 = snv.ensembl.transcripts_by_name("TP53-201")[0]

    # Sanity check the predicted effect before looking at the sequence key.
    predicted_effect = snv.effect_on_transcript(tp53_201)
    eq_(predicted_effect.__class__.__name__, "Substitution")
    eq_(predicted_effect.aa_ref, "E")
    eq_(predicted_effect.aa_alt, "K")

    # With 3 nucleotides of context the key spans the start codon, the
    # mutated first base of codon 2 and the three bases that follow it.
    expected_fields = {
        "strand": "-",
        "sequence_before_variant_locus": "ATG",
        "sequence_at_variant_locus": "G",
        "sequence_after_variant_locus": "AGG",
        "offset_to_first_complete_codon": 0,
        "contains_start_codon": True,
        "overlaps_start_codon": True,
        "contains_five_prime_utr": False,
        "amino_acids_before_variant": "M",
    }
    expected = ReferenceCodingSequenceKey(**expected_fields)
    observed = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=snv, transcript=tp53_201, context_size=3)
    eq_(expected, observed)
def validate_transcript_mutation(ensembl_transcript_id, chrom, dna_position,
                                 dna_ref, dna_alt, aa_pos, aa_alt):
    """
    Assert that a DNA variant yields the expected amino acid substitution.

    A Variant is built from (chrom, dna_position, dna_ref, dna_alt) using the
    module-level `ensembl` object. Its effect on `ensembl_transcript_id` must
    be a Substitution whose mutant protein carries `aa_alt` at the 1-based
    protein position `aa_pos`.

    Raises AssertionError when the transcript has no transcript-level effect,
    when the effect is not a Substitution, or when the position / amino acid
    does not match.
    """
    variant = Variant(chrom, dna_position, dna_ref, dna_alt, ensembl)

    # Index only the transcript-level effects by their transcript ID.
    effects_by_transcript_id = {}
    for candidate in variant.effects():
        if isinstance(candidate, TranscriptMutationEffect):
            effects_by_transcript_id[candidate.transcript.id] = candidate

    assert ensembl_transcript_id in effects_by_transcript_id, \
        "%s not found in %s" % (
            ensembl_transcript_id, effects_by_transcript_id)
    observed = effects_by_transcript_id[ensembl_transcript_id]

    # An exonic splice site effect wraps the coding effect it would
    # otherwise have had; that wrapped effect is what gets compared against
    # dbNSFP (which seemed to ignore exonic splicing mutations).
    if isinstance(observed, ExonicSpliceSite):
        observed = observed.alternate_effect

    assert isinstance(observed, Substitution), \
        "Expected substitution (aa_pos=%d, aa_alt=%s) but got %s" % (
            aa_pos, aa_alt, observed)

    # aa_mutation_start_offset is used as a 0-based index into the protein,
    # whereas aa_pos is 1-based.
    offset = observed.aa_mutation_start_offset
    mutant_residue = observed.mutant_protein_sequence[offset]
    position_matches = (offset + 1 == aa_pos)
    residue_matches = (mutant_residue == aa_alt)
    assert position_matches and residue_matches, \
        "Mutant amino acid %s not found at %d for chr%s:%s %s>%s : %s" % (
            aa_alt, aa_pos, chrom, dna_position, dna_ref, dna_alt, observed)
Exemple #3
0
def test_specific_variant_mouse_with_ensembl_genome():
    # Variant located in Exon #2 as listed at
    # http://useast.ensembl.org/Mus_musculus/Transcript/Exons?
    # db=core;g=ENSMUSG00000017167;r=11:101170523-101190724;t=ENSMUST00000103109
    mouse_snv = Variant(
        contig=11, start=101177240, ref="G", alt="T",
        ensembl=ensembl_mouse_genome)

    all_effects = mouse_snv.effects()
    eq_(len(all_effects), 2)

    substitutions = list(
        filter(lambda e: isinstance(e, Substitution), all_effects))
    eq_(len(substitutions), 1)
    substitution = substitutions[0]

    # Coding sequence up to and including the mutated codon (78 nt):
    #   ATGATGAGTCTCCGGCTCTTCAGCATCCTGCTCGCCACG
    #   GTGGTCTCTGGAGCTTGGGGCTGGGGCTACTACGGTTGC
    # The last G (nucleotide 77) is substituted, so the final codon
    # TGC (C) becomes TTC (F). That is codon 78 / 3 = 26, i.e. index 25
    # when counting from zero.
    mutated_codon_index = 25
    eq_(substitution.mutant_protein_sequence[mutated_codon_index], "F")
    eq_(substitution.original_protein_sequence[mutated_codon_index], "C")
Exemple #4
0
def test_specific_variant_mouse_with_ensembl_genome():
    # The variant falls in Exon #2 shown at
    # http://useast.ensembl.org/Mus_musculus/Transcript/Exons?
    # db=core;g=ENSMUSG00000017167;r=11:101170523-101190724;t=ENSMUST00000103109
    variant_fields = dict(
        contig=11,
        start=101177240,
        ref="G",
        alt="T",
        ensembl=ensembl_mouse_genome)
    mouse_variant = Variant(**variant_fields)

    predicted = mouse_variant.effects()
    eq_(len(predicted), 2)

    # Exactly one of the two effects is expected to be a substitution.
    found_substitutions = []
    for candidate in predicted:
        if isinstance(candidate, Substitution):
            found_substitutions.append(candidate)
    eq_(len(found_substitutions), 1)
    sub = found_substitutions[0]

    # The first 78 coding nucleotides, ending with the affected codon:
    #   ATGATGAGTCTCCGGCTCTTCAGCATCCTGCTCGCCACG
    #   GTGGTCTCTGGAGCTTGGGGCTGGGGCTACTACGGTTGC
    # Nucleotide 77 (the final G) is replaced: TGC (C) -> TTC (F).
    # 78 / 3 = 26th codon, which is index 25 in 0-based terms.
    zero_based_codon = 25
    eq_(sub.mutant_protein_sequence[zero_based_codon], "F")
    eq_(sub.original_protein_sequence[zero_based_codon], "C")
Exemple #5
0
def generate_random_missense_variants(num_variants=10,
                                      max_search=100000,
                                      reference="GRCh37"):
    """
    Generate a random collection of missense variants by trying random
    variants repeatedly.

    Random single-nucleotide variants are drawn on contigs "1" through "5"
    at positions 1..1,000,000; a variant is kept when at least one of its
    predicted effects is a Substitution.

    Parameters
    ----------
    num_variants : int
        Number of missense variants to collect before stopping.

    max_search : int
        Maximum number of random variants to try.

    reference : str or genome object
        Passed through as the `ensembl` argument of Variant.

    Returns a VariantCollection holding at most `num_variants` variants
    (fewer if `max_search` attempts were not enough).
    """
    bases = ("A", "C", "T", "G")
    contigs = ("1", "2", "3", "4", "5")
    variants = []
    for _ in range(max_search):
        # Checked up front so that num_variants <= 0 yields an empty
        # collection instead of searching (and possibly overshooting).
        if len(variants) >= num_variants:
            break
        random_ref = choice(bases)
        # The alt allele must differ from the ref allele.
        random_alt = choice([base for base in bases if base != random_ref])
        random_contig = choice(contigs)
        random_variant = Variant(contig=random_contig,
                                 start=randint(1, 1000000),
                                 ref=random_ref,
                                 alt=random_alt,
                                 ensembl=reference)
        try:
            effects = random_variant.effects()
        except Exception:
            # Best effort: a random locus may not be annotatable, so skip
            # it. Catching Exception (rather than a bare except) lets
            # KeyboardInterrupt / SystemExit stop a long search.
            continue
        if any(isinstance(effect, Substitution) for effect in effects):
            variants.append(random_variant)
    return VariantCollection(variants)
def test_reference_coding_sequence_key_around_TP53_201_variant():
    # The TP53-201 isoform seems to lack untranslated regions, which means
    # the transcript sequence is the coding sequence itself:
    #   First exon: chr17 7,676,594 - 7,676,521
    #   ATG|GAG|GAG|CCG|CAG|TCA|GAT...
    #   -M-|-E-|-E-|-P-|-Q-|-S-|-D-
    #
    # Assumed variant: chr17:7,676,591 C>T, changing GAG (E) > AAG (K).
    tp53_snv = Variant("chr17", 7676591, "C", "T", "GRCh38")
    isoform = tp53_snv.ensembl.transcripts_by_name("TP53-201")[0]

    effect_on_isoform = tp53_snv.effect_on_transcript(isoform)
    eq_(effect_on_isoform.__class__.__name__, "Substitution")
    eq_(effect_on_isoform.aa_ref, "E")
    eq_(effect_on_isoform.aa_alt, "K")

    # Expected key for a context of 3 nt on either side of the variant.
    expected = ReferenceCodingSequenceKey(
        strand="-",
        sequence_before_variant_locus="ATG",
        sequence_at_variant_locus="G",
        sequence_after_variant_locus="AGG",
        offset_to_first_complete_codon=0,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=False,
        amino_acids_before_variant="M",
    )
    build_key = ReferenceCodingSequenceKey.from_variant_and_transcript
    actual = build_key(variant=tp53_snv, transcript=isoform, context_size=3)
    eq_(expected, actual)
Exemple #7
0
def test_serialization():
    # Round-trip an indel, a SNV and a pure insertion through pickle and JSON.
    candidates = [
        Variant(1, start=10, ref="AA", alt="AAT", ensembl=ensembl77),
        Variant(10, start=15, ref="A", alt="G"),
        Variant(20, start=150, ref="", alt="G"),
    ]
    compared_fields = ("contig", "ref", "alt", "start", "end")

    for original in candidates:
        # Computing effects makes the variant's ensembl object open a SQL
        # connection, after which that object can no longer be serialized.
        # Doing it first verifies that serialization never tries to store
        # the ensembl object directly.
        original.effects()

        # Pickle round trip.
        unpickled = pickle.loads(pickle.dumps(original))
        assert original == unpickled
        for field in compared_fields:
            assert getattr(original, field) == getattr(unpickled, field)

        # JSON round trip.
        from_json = Variant.from_json(original.to_json())
        assert original == from_json
Exemple #8
0
def test_serialization():
    # Round-trip a VariantCollection (with per-variant metadata) through
    # pickle and JSON.
    original = VariantCollection([
        Variant(1, start=10, ref="AA", alt="AAT", ensembl=77),
        Variant(10, start=15, ref="A", alt="G"),
        Variant(20, start=150, ref="", alt="G"),
    ])
    original.metadata[original[0]] = {"a": "b"}
    original.metadata[original[2]] = {"bar": 2}

    # Computing effects forces the variants' ensembl objects to open a SQL
    # connection, making them non-serializable; this ensures serialization
    # does not attempt to store the ensembl objects directly.
    original.effects()

    def check_round_trip(restored):
        # The collection, its first variant and that variant's metadata
        # must all survive the round trip.
        eq_(original, restored)
        eq_(restored[0], original[0])
        eq_(restored.metadata[original[0]], original.metadata[original[0]])

    # Pickle round trip.
    check_round_trip(pickle.loads(pickle.dumps(original)))

    # JSON round trip.
    check_round_trip(VariantCollection.from_json(original.to_json()))
Exemple #9
0
def test_serialization():
    # Round-trip several variants through pickle and JSON, including the
    # "original_*" fields.
    candidates = [
        Variant(1, start=10, ref="AA", alt="AAT", genome=ensembl_grch38),
        Variant(10, start=15, ref="A", alt="G"),
        Variant(20, start=150, ref="", alt="G"),
    ]
    compared_fields = (
        "contig",
        "ref",
        "alt",
        "start",
        "end",
        "original_ref",
        "original_alt",
        "original_start",
    )

    for original in candidates:
        # Computing effects makes the variant's ensembl object open a SQL
        # connection, after which it is non-serializable. Calling it first
        # checks that we never try to serialize the ensembl object itself.
        original.effects()

        # Pickle round trip.
        unpickled = pickle.loads(pickle.dumps(original))
        eq_(original, unpickled)
        for field in compared_fields:
            eq_(getattr(original, field), getattr(unpickled, field))

        # JSON round trip.
        from_json = Variant.from_json(original.to_json())
        eq_(original, from_json)
Exemple #10
0
def test_drop_duplicates():
    # The same object twice, an equal copy and one distinct variant should
    # collapse to two entries in the collection.
    release = EnsemblRelease(78)
    first = Variant("1", 3000, "A", "G", ensembl=release)
    first_copy = Variant("1", 3000, "A", "G", ensembl=release)
    second = Variant("2", 10, "G", "T", ensembl=release)
    redundant_inputs = [first, first, first_copy, second]
    deduplicated = VariantCollection(variants=redundant_inputs)
    n_unique = len(deduplicated)
    assert n_unique == 2
Exemple #11
0
def test_contig_name_normalization():
    # An integer contig becomes a string only when normalization is on.
    normalized_int = Variant(1, 1, "A", "G", normalize_contig_names=True)
    eq_(normalized_int.contig, "1")
    raw_int = Variant(1, 1, "A", "G", normalize_contig_names=False)
    eq_(raw_int.contig, 1)

    # Normalization upper-cases the mitochondrial contig name ("chrm" ->
    # "chrM"); UCSC name conversion is switched off in both cases.
    normalized_mito = Variant(
        "chrm", 1, "A", "G",
        normalize_contig_names=True,
        convert_ucsc_contig_names=False)
    eq_(normalized_mito.contig, "chrM")
    raw_mito = Variant(
        "chrm", 1, "A", "G",
        normalize_contig_names=False,
        convert_ucsc_contig_names=False)
    eq_(raw_mito.contig, "chrm")
Exemple #12
0
def test_multiple_alleles_per_line():
    # A VCF line with two alt alleles should load as two separate variants.
    variants = load_vcf(data_path("multiallelic.vcf"))
    assert len(variants) == 2, "Expected 2 variants but got %s" % variants
    observed = set(variants)
    expected = {
        Variant(1, 1431105, "A", alt, genome="GRCh37")
        for alt in ("C", "G")
    }
    eq_(observed, expected)
def test_STAT1_stop_gain_at_exon_boundary():
    # This variant is both an ExonicSpliceSite and a PrematureStop; the
    # premature stop is expected to win as the top priority effect.
    variant = Variant("2", "191872291", "G", "A", "GRCh37")
    predicted = variant.effects()
    print(predicted)
    effect_classes = [effect.__class__ for effect in predicted]
    assert any(cls is ExonicSpliceSite for cls in effect_classes)
    highest_priority = predicted.top_priority_effect()
    print(highest_priority)
    assert highest_priority.__class__ is PrematureStop
def test_silent_stop_codons():
    # Generator test: each C>T variant below is expected to be Silent on
    # the listed transcript.
    loci_by_transcript_id = [
        ("ENST00000290524", 1, 151314663),
        ("ENST00000368725", 1, 153409535),
        ("ENST00000353479", 10, 105791994),
    ]
    # Build every variant up front, before the first case is yielded.
    cases = [
        (transcript_id,
         Variant(contig, start=start, ref="C", alt="T", genome=ensembl_grch37))
        for transcript_id, contig, start in loci_by_transcript_id
    ]
    for transcript_id, variant in cases:
        yield (expect_effect, variant, transcript_id, Silent)
def test_HRAS_G13V_in_cancer_driver_genes_and_variants():
    # HRAS p.G13V should be flagged as both a driver gene and a driver
    # variant, but not as IFN-gamma response or class I MHC.
    hras_variant = Variant("11", 534285, "C", "A", "GRCh37")
    top_effect = hras_variant.effects().top_priority_effect()
    eq_(top_effect.gene.name, "HRAS")
    eq_(top_effect.short_description, "p.G13V")

    checker = GenePathwayCheck()
    variant_info = checker.make_variant_dict(hras_variant)
    for unset_column in (_IFNG_RESPONSE_COLUMN_NAME,
                         _CLASS_I_MHC_COLUMN_NAME):
        assert not variant_info[unset_column]
    for set_column in (_DRIVER_VARIANT_COLUMN_NAME,
                       _DRIVER_GENE_COLUMN_NAME):
        assert variant_info[set_column]
Exemple #16
0
def test_snv_transition_transversion():
    def make_variant(alt):
        # All cases share contig 1, position 100 and reference base C.
        return Variant(1, start=100, ref="C", alt=alt)

    # ref == alt is not a single nucleotide variant at all.
    unchanged = make_variant("C")
    assert not unchanged.is_snv

    # C>T is a transition.
    transition = make_variant("T")
    assert transition.is_snv
    assert transition.is_transition
    assert not transition.is_transversion

    # C>A is a transversion.
    transversion = make_variant("A")
    assert transversion.is_snv
    assert not transversion.is_transition
    assert transversion.is_transversion
def test_HRAS_G13C_in_cancer_driver_genes():
    # HRAS p.G13C should be flagged as a driver gene only.
    hras_variant = Variant("11", 534286, "C", "A", "GRCh37")
    top_effect = hras_variant.effects().top_priority_effect()
    eq_(top_effect.gene.name, "HRAS")
    eq_(top_effect.short_description, "p.G13C")

    checker = GenePathwayCheck()
    variant_info = checker.make_variant_dict(hras_variant)
    # Although this is a RAS G13 variant, it is not actually that common
    # and so did not make the threshold of the source dataset; hence the
    # driver *variant* column is expected to be unset too.
    for unset_column in (_IFNG_RESPONSE_COLUMN_NAME,
                         _CLASS_I_MHC_COLUMN_NAME,
                         _DRIVER_VARIANT_COLUMN_NAME):
        assert not variant_info[unset_column]
    assert variant_info[_DRIVER_GENE_COLUMN_NAME]
def test_maf():
    # The module-level `tcga_ov_variants` collection must contain exactly
    # these variants, in this order, and each variant must overlap the gene
    # named in its MAF 'Hugo_Symbol' metadata.
    expected_loci = [
        (1, 1650797, "A", "G"),
        (1, 23836447, "C", "A"),
        (1, 231401797, "A", "C"),
        (11, 124617502, "C", "G"),
    ]
    expected_variants = [
        Variant(contig, start, ref, alt, ensembl)
        for contig, start, ref, alt in expected_loci
    ]
    eq_(len(tcga_ov_variants), len(expected_variants))
    for expected_variant, maf_variant in zip(
            expected_variants, tcga_ov_variants):
        eq_(expected_variant, maf_variant)
        hugo_symbol = tcga_ov_variants.metadata[maf_variant]['Hugo_Symbol']
        assert any(gene.name == hugo_symbol for gene in maf_variant.genes), \
            "Expected gene name %s but got %s" % (
                hugo_symbol, maf_variant.genes)
def _get_effect(chrom, pos, dna_ref, dna_alt, transcript_id):
    """
    Return the top priority effect of a variant on one transcript.

    The variant is built from (chrom, pos, dna_ref, dna_alt) with the
    module-level `ensembl` object. If the top effect is an
    ExonicSpliceSite, its `alternate_effect` is returned instead.

    Raises AssertionError if `transcript_id` is not among the transcripts
    affected by the variant.
    """
    variant = Variant(chrom, pos, dna_ref, dna_alt, ensembl=ensembl)
    per_transcript = variant.effects().top_priority_effect_per_transcript_id()
    assert transcript_id in per_transcript, \
        "Expected transcript ID %s for variant %s not found in %s" % (
            transcript_id, variant, per_transcript)
    top_effect = per_transcript[transcript_id]

    if not isinstance(top_effect, ExonicSpliceSite):
        return top_effect
    # COSMIC seems to ignore exonic splice sites, so hand back the effect
    # the variant would otherwise have had.
    return top_effect.alternate_effect
def _get_effect(chrom, pos, dna_ref, dna_alt, transcript_id):
    """
    Look up the highest priority effect a variant has on `transcript_id`.

    Builds a Variant from the given locus and alleles (using the
    module-level `ensembl` object) and asserts that the transcript is among
    those affected. An ExonicSpliceSite effect is replaced by its
    `alternate_effect` before being returned.
    """
    variant = Variant(chrom, pos, dna_ref, dna_alt, ensembl=ensembl)
    all_effects = variant.effects()
    effect_by_transcript_id = all_effects.top_priority_effect_per_transcript_id()
    assert transcript_id in effect_by_transcript_id, \
        "Expected transcript ID %s for variant %s not found in %s" % (
            transcript_id, variant, effect_by_transcript_id)
    chosen = effect_by_transcript_id[transcript_id]

    # COSMIC seems to ignore exonic splice sites
    is_exonic_splice_site = isinstance(chosen, ExonicSpliceSite)
    return chosen.alternate_effect if is_exonic_splice_site else chosen
Exemple #21
0
def test_multiple_variant_forms():
    """
    Load VCF, MAF and VariantCollection together.

    One patient gets a VCF (from the cohort fixture), a MAF path listed
    twice and an in-memory VariantCollection; each source must contribute
    to the loaded variants.
    """
    vcf_dir, cohort = None, None
    try:
        vcf_dir, cohort = make_cohort([FILE_FORMAT_1])
        patient = cohort[0]
        # The MAF path is appended twice on purpose: listing the same file
        # twice must have no effect.
        for _ in range(2):
            patient.variants.append(data_path(MAF_FILE))
        in_memory_variant = Variant(
            start=1000000, ref="A", alt="T", contig=1, ensembl=75)
        patient.variants.append(VariantCollection([in_memory_variant]))

        cohort_variants = cohort.load_variants(patients=[patient])

        def count_variants_starting_at(position):
            # Number of this patient's loaded variants at `position`.
            patient_variants = cohort_variants[patient.id]
            return len(patient_variants.filter(lambda v: v.start == position))

        # The VariantCollection was included.
        eq_(count_variants_starting_at(1000000), 1)

        # The VCF was included.
        eq_(count_variants_starting_at(53513530), 1)

        # The MAF was included.
        eq_(count_variants_starting_at(1650797), 1)

        # A non-existent variant is not included.
        eq_(count_variants_starting_at(1650798), 0)
    finally:
        # Clean up whatever the fixture managed to create.
        have_vcf_dir = vcf_dir is not None and path.exists(vcf_dir)
        if have_vcf_dir:
            rmtree(vcf_dir)
        if cohort is not None:
            cohort.clear_caches()
def test_frameshift_near_start_of_BRCA1_001():
    # Insertion of a genomic "A" right after the second codon of the
    # coding sequence.
    #
    # Transcript: BRCA1-001 (ENST00000357654)
    # Manually annotated using Ensembl release 85
    #
    # Original mRNA coding sequence:
    #   ATG GAT TTA TCT GCT CTT CGC GTT GAA GAA GTA CAA
    #   -M- -D- -L- -S- -A- -L- -R- -V- -E- -E- -V- -Q-
    #
    # After the variant:
    #   ATG GAT TTT ATC TGC TCT TCG CGT TGA
    #   -M- -D- -F- -I- -C- -S- -S- -R-  *
    insertion_position = 43124096 - 6
    insertion = Variant(
        "17", insertion_position, ref="", alt="A", ensembl=ensembl_grch38)
    expected_properties = dict(
        transcript_id="ENST00000357654",
        effect_class=FrameShift,
        modifies_coding_sequence=True,
        modifies_protein_sequence=True,
        aa_alt="FICSSR")
    expect_effect(insertion, **expected_properties)
Exemple #23
0
def test_allele_count_dataframe():
    # Two ref reads, one alt read and no other reads should produce a
    # single row with matching counts.
    variant = Variant("test_contig", 50, "C", "G")
    ref_reads = [
        AlleleRead(prefix="AAA", allele="C", suffix="TTT", name="C1"),
        AlleleRead(prefix="AAC", allele="C", suffix="TTA", name="C2"),
    ]
    alt_reads = [
        AlleleRead(prefix="AAA", allele="G", suffix="TTT", name="G1"),
    ]
    read_evidence = ReadEvidence(
        trimmed_base1_start=50,
        trimmed_ref="C",
        trimmed_alt="G",
        ref_reads=ref_reads,
        alt_reads=alt_reads,
        other_reads=[])

    df = allele_counts_dataframe([(variant, read_evidence)])
    assert len(df) == 1, "Wrong number of rows in DataFrame: %s" % (df, )
    only_row = df.iloc[0]
    eq_(only_row.num_ref_reads, 2)
    eq_(only_row.num_alt_reads, 1)
    eq_(only_row.num_other_reads, 0)
Exemple #24
0
def test_locus_reads_substitution_longer():
    # C>GG substitution at the second nucleotide of the reference sequence
    # "ACCTTG". The alignment represents it as a C>G mismatch followed by
    # the insertion of one more G (CIGAR 2M1I4M).
    variant = Variant(
        "chromosome", 2, ref="C", alt="GG", normalize_contig_name=False)
    print(variant)

    pysam_read = make_read(seq="AGGCTTG", cigar="2M1I4M", mdtag="1C4")
    samfile = DummySamFile(reads=[pysam_read])
    read_generator = locus_read_generator(
        samfile=samfile,
        chromosome="chromosome",
        base1_position_before_variant=1,
        base1_position_after_variant=3)
    reads = list(read_generator)
    print(reads)
    assert len(reads) == 1, \
        "Expected to get back one read but instead got %d" % (
            len(reads),)

    # The inserted base has no reference position, hence the None.
    expected = LocusRead(
        name=pysam_read.qname,
        sequence=pysam_read.query_sequence,
        reference_positions=[0, 1, None, 2, 3, 4, 5],
        quality_scores=pysam_read.query_qualities,
        base0_read_position_before_variant=0,
        base0_read_position_after_variant=3)
    assert_equal_fields(reads[0], expected)
Exemple #25
0
def test_locus_reads_substitution_shorter():
    # CC>G substitution at the 2nd and 3rd nucleotides of the reference
    # sequence "ACCTTG". The alignment represents it as a C>G mismatch
    # followed by the deletion of a C (CIGAR 2M1D3M).
    variant = Variant(
        "chromosome", 2, ref="CC", alt="G", normalize_contig_name=False)
    print(variant)

    pysam_read = make_read(seq="AGTTG", cigar="2M1D3M", mdtag="1C^C4")
    samfile = DummySamFile(reads=[pysam_read])
    read_generator = locus_read_generator(
        samfile=samfile,
        chromosome="chromosome",
        base1_position_before_variant=1,
        base1_position_after_variant=4)
    reads = list(read_generator)
    assert len(reads) == 1, \
        "Expected to get back one read but instead got %d" % (
            len(reads),)
    print(reads)

    # Reference position 2 is skipped because that base is deleted.
    expected = LocusRead(
        name=pysam_read.qname,
        sequence=pysam_read.query_sequence,
        reference_positions=[0, 1, 3, 4, 5],
        quality_scores=pysam_read.query_qualities,
        base0_read_position_before_variant=0,
        base0_read_position_after_variant=2)
    assert_equal_fields(reads[0], expected)
Exemple #26
0
def test_serialization():
    # Every variant must survive a pickle round trip (field by field,
    # including the "original_*" fields) and a JSON round trip.
    def assert_same_fields(expected, actual):
        for field in ("contig", "ref", "alt", "start", "end",
                      "original_ref", "original_alt", "original_start"):
            eq_(getattr(expected, field), getattr(actual, field))

    variants = [
        Variant(1, start=10, ref="AA", alt="AAT", ensembl=ensembl_grch38),
        Variant(10, start=15, ref="A", alt="G"),
        Variant(20, start=150, ref="", alt="G"),
    ]
    for original in variants:
        # Calling effects() makes the variant's ensembl object open a SQL
        # connection, which renders that object non-serializable. This
        # verifies that the ensembl object is never serialized directly.
        original.effects()

        # Pickle round trip.
        pickled_bytes = pickle.dumps(original)
        unpickled = pickle.loads(pickled_bytes)
        eq_(original, unpickled)
        assert_same_fields(original, unpickled)

        # JSON round trip.
        json_text = original.to_json()
        eq_(original, Variant.from_json(json_text))
Exemple #27
0
def test_locus_reads_snv():
    """
    test_locus_reads_snv : Test that a read spanning the SNV at position 4
    (T>G) is returned with the expected positions, where the reference
    sequence is assumed to be "ACCTTG"
    """
    # reference sequence = "ACCTTG"
    variant = Variant(
        "chromosome", 4, ref="T", alt="G", normalize_contig_name=False)

    pysam_read = make_read(seq="ACCGTG", cigar="6M", mdtag="3G2")
    samfile = DummySamFile(reads=[pysam_read])

    # Ask for reads covering the bases on either side of the variant.
    read_generator = locus_read_generator(
        samfile=samfile,
        chromosome="chromosome",
        base1_position_before_variant=variant.start - 1,
        base1_position_after_variant=variant.start + 1)
    reads = list(read_generator)
    print(reads)
    assert len(reads) == 1, \
        "Expected to get back one read but instead got %d" % (
            len(reads),)

    expected = LocusRead(
        name=pysam_read.qname,
        sequence=pysam_read.query_sequence,
        reference_positions=[0, 1, 2, 3, 4, 5],
        quality_scores=pysam_read.query_qualities,
        base0_read_position_before_variant=2,
        base0_read_position_after_variant=4)
    assert_equal_fields(reads[0], expected)
def test_sequence_key_with_reading_frame_deletion_with_five_prime_utr():
    # Delete the second codon of TP53-001, with enough context to reach
    # into the 5' UTR. TP53 is on the negative strand, so the deleted codon
    # GAG appears as its reverse complement in the variant: 'CTC'>''
    tp53_deletion = Variant("17", 7676589, "CTC", "", ensembl_grch38)
    tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Transcript sequence around the second codon with 10 context nt:
    #   In [51]: t.sequence[193-10:193+13]
    #   Out[51]: 'CACTGCCATGGAGGAGCCGCAGT'
    # which breaks down as:
    #   last 7 nt of 5' UTR:   CACTGCC
    #   start codon:           ATG (translates to M)
    #   2nd codon:             GAG    <---- deleted by the variant
    #   3rd codon:             GAG
    #   4th codon:             CCG
    #   5th codon:             CAG
    #   first nt of 6th codon: T
    observed = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=tp53_deletion, transcript=tp53_001, context_size=10)

    expected_fields = {
        "strand": "-",
        "sequence_before_variant_locus": "CACTGCCATG",
        "sequence_at_variant_locus": "GAG",
        "sequence_after_variant_locus": "GAGCCGCAGT",
        "offset_to_first_complete_codon": 7,
        "contains_start_codon": True,
        "overlaps_start_codon": True,
        "contains_five_prime_utr": True,
        "amino_acids_before_variant": "M",
    }
    expected = ReferenceCodingSequenceKey(**expected_fields)
    eq_(observed, expected)
def test_sequence_key_with_reading_frame_insertion_context_5nt_overlaps_start(
):
    # Insert nucleotide "T" after the second codon of TP53-001, keeping
    # only enough context to see part of the start codon. The key therefore
    # should not "contain" the start codon but does "overlap" it. On the
    # genomic (reverse complement) strand the variant is CTC>CTCA.
    tp53_insertion = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Transcript sequence around the 2nd/3rd codon boundary with
    # 5 context nucleotides on each side:
    #   last two nt of start codon: TG
    #   2nd codon: GAG (translates to E)
    #   <---- insertion happens between these two codons
    #   3rd codon: GAG
    #   first two nt of 4th codon: CC
    observed = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=tp53_insertion, transcript=tp53_001, context_size=5)

    expected_fields = {
        "strand": "-",
        "sequence_before_variant_locus": "TGGAG",
        "sequence_at_variant_locus": "",
        "sequence_after_variant_locus": "GAGCC",
        "offset_to_first_complete_codon": 2,
        "contains_start_codon": False,
        "overlaps_start_codon": True,
        "contains_five_prime_utr": False,
        "amino_acids_before_variant": "E",
    }
    expected = ReferenceCodingSequenceKey(**expected_fields)
    eq_(observed, expected)
Exemple #30
0
def test_group_unique_sequences():
    # Reads supporting a chr12 G>C variant contain redundant sequences, so
    # grouping them must yield fewer groups than reads.
    samfile = load_bam("data/cancer-wgs-primary.chr12.bam")
    chromosome = "chr12"
    variant = Variant(
        contig=chromosome,
        start=65857041,
        ref="G",
        alt="C",
        ensembl=ensembl_grch38)

    variant_reads = reads_supporting_variant(
        samfile=samfile, chromosome=chromosome, variant=variant)
    n_reads = len(variant_reads)
    print("%d variant reads: %s" % (n_reads, variant_reads))

    groups = group_unique_sequences(
        variant_reads, max_prefix_size=30, max_suffix_size=30)
    n_groups = len(groups)
    print("%d unique sequences: %s" % (n_groups, groups))

    # Some reads are redundant, so the number of unique entries should be
    # smaller than the total number of read partitions.
    assert n_groups < n_reads
def test_sequence_key_with_reading_frame_insertion_context_3nt_no_start():
    # Insert nucleotide "T" after the second codon of TP53-001, keeping
    # only enough context to see the second codon itself (no nucleotides
    # of the start codon). On the genomic (reverse complement) strand the
    # variant is CTC>CTCA.
    tp53_insertion = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    tp53_001 = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Transcript sequence around the 2nd/3rd codon boundary with
    # 3 context nucleotides on each side:
    #   2nd codon: GAG (translates to E)
    #   <---- insertion happens between these two codons
    #   3rd codon: GAG
    observed = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=tp53_insertion, transcript=tp53_001, context_size=3)

    expected_fields = {
        "strand": "-",
        "sequence_before_variant_locus": "GAG",
        "sequence_at_variant_locus": "",
        "sequence_after_variant_locus": "GAG",
        "offset_to_first_complete_codon": 0,
        "contains_start_codon": False,
        "overlaps_start_codon": False,
        "contains_five_prime_utr": False,
        "amino_acids_before_variant": "E",
    }
    expected = ReferenceCodingSequenceKey(**expected_fields)
    eq_(observed, expected)
Exemple #32
0
def test_mhc_predictor_error():
    # An MHC predictor that fails on every call must lead to zero epitope
    # predictions rather than an exception escaping predict_epitopes.
    class FakeMHCPredictor:
        def predict_subsequences(self, x):
            raise ValueError('I throw an error in your general direction')

    genome = EnsemblRelease(species="mouse")
    wdr13_transcript = genome.transcripts_by_name("Wdr13-001")[0]

    fragment_fields = dict(
        variant=Variant('X', '8125624', 'C', 'A'),
        gene_name='Wdr13',
        amino_acids='KLQGHSAPVLDVIVNCDESLLASSD',
        mutant_amino_acid_start_offset=12,
        mutant_amino_acid_end_offset=13,
        n_overlapping_reads=71,
        n_alt_reads=25,
        n_ref_reads=46,
        n_alt_reads_supporting_protein_sequence=2,
        supporting_reference_transcripts=[wdr13_transcript])
    protein_fragment = MutantProteinFragment(**fragment_fields)

    # Every prediction throws; make sure vaxrank doesn't fall down.
    failing_predictor = FakeMHCPredictor()
    epitope_predictions = predict_epitopes(
        mhc_predictor=failing_predictor,
        protein_fragment=protein_fragment,
        genome=genome)

    eq_(0, len(epitope_predictions))
def test_sequence_key_with_reading_frame_insertion():
    """
    An insertion after the second codon of TP53-001, requested with 10
    nucleotides of context, should give a reference coding sequence key
    whose upstream context reaches into the 5' UTR.
    """
    # Insert nucleotide "T" after second codon of TP53-001. Since TP53 is
    # on the negative strand the variant is given as its reverse
    # complement, which turns it into 'CTC'>'CTCA'.
    insertion = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Ten nucleotides of transcript sequence on each side of the
    # boundary between the 2nd and 3rd codons, spelled out piece by piece.
    upstream_context = "".join([
        "TGCC",  # last 4 nt of 5' UTR
        "ATG",   # start codon (translates to M)
        "GAG",   # 2nd codon (translates to E)
    ])
    # <---- insertion variant occurs between these two stretches
    downstream_context = "".join([
        "GAG",   # 3rd codon
        "CCG",   # 4th codon
        "CAG",   # 5th codon
        "T",     # first nt of 6th codon
    ])

    observed_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=10)

    wanted_key = ReferenceCodingSequenceKey(
        strand="-",
        sequence_before_variant_locus=upstream_context,
        sequence_at_variant_locus="",
        sequence_after_variant_locus=downstream_context,
        offset_to_first_complete_codon=4,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=True,
        amino_acids_before_variant="ME")
    eq_(observed_key, wanted_key)
Exemple #34
0
def variants_from_csv(csv_file, sample_id=None, reference=None):
    """Build a varcode VariantCollection from a csv file of variants.

    Args:
        csv_file: csv file (anything ``pd.read_csv`` accepts) that has the
            columns CHROM, POS, REF and ALT. A ``sample_id`` column is
            optional.
        sample_id: if provided and the file has a ``sample_id`` column,
            keep only the rows for this id and drop rows that are
            duplicated on (POS, REF, ALT). Ignored when the column is
            absent.
        reference: ref genome used for variant calling. Currently unused:
            every variant is created with ``ensembl_grch38``.

    Returns:
        A ``varcode`` VariantCollection holding one Variant per remaining
        row, in row order.
    """

    from pyensembl import ensembl_grch38
    import varcode
    from varcode import Variant

    df = pd.read_csv(csv_file)
    if sample_id is not None and 'sample_id' in df.columns:
        df = df[df['sample_id'] == sample_id]
        df = df.drop_duplicates(['POS', 'REF', 'ALT'])

    variants = [
        Variant(contig=row.CHROM,
                start=row.POS,
                ref=row.REF,
                alt=row.ALT,
                ensembl=ensembl_grch38)
        for _, row in df.iterrows()
    ]
    return varcode.variant_collection.VariantCollection(variants)
def get_varcode_annotations(genotypes, vcf_id, ensembl_release_num):
    """Annotate every genotype row of one VCF with its top-priority
    Varcode effect.

    Contig, position, reference and alternates are selected from the
    genotypes table for the rows whose ``vcf_id`` matches, a Variant is
    built for each row, and its highest-priority effect is summarized.

    Args:
        genotypes: table object whose columns are reached through ``.c``
            (presumably a SQLAlchemy Table -- confirm against caller).
        vcf_id: value the ``vcf_id`` column must equal.
        ensembl_release_num: passed to ``EnsemblRelease`` to pick the
            annotation release.

    Returns:
        A list with one entry per selected row, each of the form
        [contig, position, reference, alternates, gene_name,
         transcript_id, notation, effect_type]
        where ``notation`` is "intragenic" for Intragenic effects and the
        effect's short description otherwise, and ``effect_type`` is the
        effect class name with spaces inserted between words.
    """
    results = select([
            genotypes.c.contig,
            genotypes.c.position,
            genotypes.c.reference,
            genotypes.c.alternates
        ]).where(genotypes.c.vcf_id == vcf_id).execute()

    ensembl_rel = EnsemblRelease(ensembl_release_num)

    varcode_annotations = []
    for contig, position, reference, alternates in results:
        # Non-ASCII characters are dropped from the alleles.
        # NOTE(review): under Python 3 `.encode` yields bytes, not str --
        # confirm that Variant accepts that.
        variant = Variant(contig=contig,
                          start=position,
                          ref=reference.encode('ascii', 'ignore'),
                          alt=alternates.encode('ascii', 'ignore'),
                          ensembl=ensembl_rel)

        # This will give us a single, yet relevant effect
        best_effect = variant.effects().top_priority_effect()
        gene_name = best_effect.gene_name
        transcript = best_effect.transcript_id
        if best_effect.__class__.__name__ == "Intragenic":
            notation = "intragenic"
        else:
            notation = best_effect.short_description
        effect_type = type(best_effect).__name__
        # Make it human readable: split CamelCase into separate words.
        # Raw strings keep `\g<1>` from being read as a string escape.
        effect_type = re.sub(r"([a-z])([A-Z])", r"\g<1> \g<2>", effect_type)
        varcode_annotations.append([contig,
                                    position,
                                    reference,
                                    alternates,
                                    gene_name,
                                    transcript,
                                    notation,
                                    effect_type])

    return varcode_annotations
def validate_transcript_mutation(
        ensembl_transcript_id,
        chrom,
        dna_position,
        dna_ref,
        dna_alt,
        aa_pos,
        aa_alt):
    """
    Assert that a DNA change, annotated with `ensembl_grch37`, is a
    substitution on the given transcript which puts amino acid `aa_alt`
    at protein position `aa_pos` (compared against the effect's mutation
    start offset plus one).

    Raises AssertionError if the transcript has no transcript-level
    effect, if the effect is not a Substitution, or if the position or
    amino acid disagree.
    """
    all_effects = Variant(
        chrom, dna_position, dna_ref, dna_alt, ensembl_grch37).effects()

    # index the transcript-level effects by transcript ID
    effects_by_transcript_id = {}
    for candidate in all_effects:
        if isinstance(candidate, TranscriptMutationEffect):
            effects_by_transcript_id[candidate.transcript.id] = candidate

    assert ensembl_transcript_id in effects_by_transcript_id, \
        "%s not found in %s" % (
            ensembl_transcript_id, effects_by_transcript_id)
    effect = effects_by_transcript_id[ensembl_transcript_id]

    # exonic splice site mutations carry with them an alternate effect
    # which is what we check against dbNSFP (since that database seemed
    # to ignore exonic splicing mutations)
    if isinstance(effect, ExonicSpliceSite):
        effect = effect.alternate_effect

    assert isinstance(effect, Substitution), \
        "Expected substitution (aa_pos=%d, aa_alt=%s) but got %s" % (
            aa_pos, aa_alt, effect)

    mutation_offset = effect.aa_mutation_start_offset
    observed_aa = effect.mutant_protein_sequence[mutation_offset]
    failure_details = (
        aa_alt, aa_pos, chrom, dna_position, dna_ref, dna_alt, effect)
    assert (mutation_offset + 1 == aa_pos and observed_aa == aa_alt), \
        "Mutant amino acid %s not found at %d for chr%s:%s %s>%s : %s" % \
        failure_details
Exemple #37
0
def generate_random_missense_variants(num_variants=10, max_search=100000, reference="GRCh37"):
    """
    Generate a random collection of missense variants by trying random variants repeatedly.

    Parameters
    ----------
    num_variants : int
        Number of missense variants to collect; the search stops as soon
        as this many have been found. Zero or less yields an empty
        collection.

    max_search : int
        Maximum number of random variants to try, so fewer than
        `num_variants` may be returned.

    reference : str
        Passed to Variant as its `ensembl` argument.

    Returns a VariantCollection of the random single-nucleotide variants
    (contigs 1-5, positions 1 to 1,000,000) that had at least one
    Substitution effect.
    """
    bases = ("A", "C", "T", "G")
    contigs = ["1", "2", "3", "4", "5"]
    variants = []
    for _ in range(max_search):
        # Checked before drawing anything so that a target of zero (or
        # less) never collects a variant and never overshoots.
        if len(variants) >= num_variants:
            break
        random_ref = choice(bases)
        # alt is drawn from the three bases that differ from ref
        random_alt = choice([base for base in bases if base != random_ref])
        random_contig = choice(contigs)
        random_variant = Variant(contig=random_contig, start=randint(1, 1000000),
                                 ref=random_ref, alt=random_alt, ensembl=reference)
        try:
            is_missense = any(
                isinstance(effect, Substitution)
                for effect in random_variant.effects())
        except Exception:
            # Best-effort search: a random locus that cannot be annotated
            # is skipped. Exception (not a bare except) so Ctrl-C and
            # SystemExit still stop the search.
            continue
        if is_missense:
            variants.append(random_variant)
    return VariantCollection(variants)
def make_inputs_for_tp53_201_variant(
        cdna_prefix="ATG",
        n_bad_nucleotides_at_start=0,
        mismatches=0,
        reference_context_size=3):
    """
    Build the inputs and the expected result for matching a TP53-201
    variant sequence against its reference context.

    Parameters
    ----------
    cdna_prefix : str
        Transcript nucleotides before the variant that we're pretending
        got detected from RNA-seq reads.

    n_bad_nucleotides_at_start : int
        Number of nucleotides we expect to get trimmed from the
        beginning of the variant sequence while matching to a reference context.

    mismatches : int
        Expected number of nucleotide mismatches in the result

    reference_context_size : int
        Number of nucleotides before the variant locus to try matching
        against a reference transcript.

    Returns
    -------
    Tuple of (VariantSequence, ReferenceContext,
    expected VariantSequenceInReadingFrame).
    """
    # TP53-201 is an isoform of TP53 which seems to lack untranslated
    # regions so the sequence is:
    # First exon: chr17 7,676,594 - 7,676,521
    # ATG|GAG|GAG|CCG|CAG|TCA|GAT...
    # -M-|-E-|-E-|-P-|-Q-|-S-|-D-
    #
    # The variant under test is chr17. 7,676,591 C>T,
    # which changes GAG (E) > AAG (K)
    tp53_variant = Variant("chr17", 7676591, "C", "T", "GRCh38")
    tp53_201 = tp53_variant.ensembl.transcripts_by_name("TP53-201")[0]

    # sanity check the annotation before building anything on top of it
    predicted_effect = tp53_variant.effect_on_transcript(tp53_201)
    eq_(predicted_effect.__class__.__name__, "Substitution")
    eq_(predicted_effect.aa_ref, "E")
    eq_(predicted_effect.aa_alt, "K")

    cdna_alt = "A"
    cdna_suffix = "AGGAGCCGCAGTCAGAT"

    # TP53 is on the negative strand, so the genomic DNA is the reverse
    # complement of the cDNA: prefix and suffix trade places.
    gdna_prefix = reverse_complement_dna(cdna_suffix)
    gdna_alt = reverse_complement_dna(cdna_alt)
    gdna_suffix = reverse_complement_dna(cdna_prefix)

    # The variant sequence is supported by two reads: one fully spanning
    # it and another missing the last nucleotide.
    read_specs = [
        ("full-overlap", gdna_suffix),
        ("partial-overlap", gdna_suffix[:-1]),
    ]
    supporting_reads = []
    for read_name, read_suffix in read_specs:
        read = AlleleRead(
            prefix=gdna_prefix,
            allele=gdna_alt,
            suffix=read_suffix,
            name=read_name)
        # Only prefix and allele are checked against the expected
        # TP53-201 sequence; the suffix depends on what's passed in as
        # cdna_prefix.
        eq_(read.prefix, "ATCTGACTGCGGCTCCT")
        eq_(read.allele, "T")
        supporting_reads.append(read)

    variant_sequence = VariantSequence(
        prefix=gdna_prefix,
        alt=gdna_alt,
        suffix=gdna_suffix,
        reads=supporting_reads)
    assert isinstance(variant_sequence, VariantSequence)

    # number of prefix nucleotides expected to remain after trimming
    prefix_length = len(cdna_prefix) - n_bad_nucleotides_at_start

    key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=tp53_variant,
        transcript=tp53_201,
        context_size=reference_context_size)
    assert isinstance(key, ReferenceCodingSequenceKey)

    reference_context = ReferenceContext.from_reference_coding_sequence_key(
        key=key,
        variant=tp53_variant,
        transcripts=[tp53_201])
    assert isinstance(reference_context, ReferenceContext)

    kept_prefix = cdna_prefix[-prefix_length:]
    reference_before_variant = "ATG"[-prefix_length:]
    expected = VariantSequenceInReadingFrame(
        cdna_sequence=kept_prefix + cdna_alt + cdna_suffix,
        offset_to_first_complete_codon=prefix_length % 3,
        variant_cdna_interval_start=prefix_length,
        variant_cdna_interval_end=prefix_length + 1,
        reference_cdna_sequence_before_variant=reference_before_variant,
        number_mismatches=mismatches)
    assert isinstance(expected, VariantSequenceInReadingFrame)

    return variant_sequence, reference_context, expected
def test_mm10_Klf6_frameshift():
    # Insertion of "G" (empty ref allele) at chr13:5864876, with "GRCm38"
    # given as the genome; per the test name this is presumably a
    # frameshift in Klf6, though the effect type is not asserted here.
    variant = Variant("chr13", 5864876, "", "G", "GRCm38")
    effects = variant.effects()
    # exactly one effect is expected for this variant
    eq_(len(effects), 1)
    # hand the single effect to the shared checker (defined elsewhere)
    validate_effect_values(effects[0])