Ejemplo n.º 1
0
def test_specific_variant_mouse_with_ensembl_genome():
    """
    A G>T variant at 11:101177240 on the mouse genome should produce two
    effects, exactly one of which is a Substitution changing residue 26
    (0-based index 25) from C to F.
    """
    # Exon #2 at http://useast.ensembl.org/Mus_musculus/Transcript/Exons?
    # db=core;g=ENSMUSG00000017167;r=11:101170523-101190724;t=ENSMUST00000103109
    mouse_variant = Variant(
        contig=11,
        start=101177240,
        ref="G",
        alt="T",
        ensembl=ensembl_mouse_genome)
    all_effects = mouse_variant.effects()
    eq_(len(all_effects), 2)

    # Keep only the amino acid substitutions among the predicted effects.
    substitutions = []
    for candidate in all_effects:
        if isinstance(candidate, Substitution):
            substitutions.append(candidate)
    eq_(len(substitutions), 1)
    substitution = substitutions[0]

    # The coding sequence through the sub:
    # ATGATGAGTCTCCGGCTCTTCAGCATCCTGCTCGCCACG
    # GTGGTCTCTGGAGCTTGGGGCTGGGGCTACTACGGTTGC
    # (The final G is the sub: the 77th nucleotide)
    # TGC (C) -> TTC (F)
    # 78 / 3 = 26
    # 0-base = 25
    mutated_residue_index = 25
    eq_(substitution.mutant_protein_sequence[mutated_residue_index], "F")
    eq_(substitution.original_protein_sequence[mutated_residue_index], "C")
Ejemplo n.º 2
0
def validate_transcript_mutation(ensembl_transcript_id, chrom, dna_position,
                                 dna_ref, dna_alt, aa_pos, aa_alt):
    """
    Assert that the given DNA change causes the expected amino acid
    substitution on one specific transcript.

    ensembl_transcript_id : ID of the transcript whose effect is checked
    chrom, dna_position, dna_ref, dna_alt : arguments used to build the Variant
    aa_pos : expected 1-based position of the substituted amino acid
    aa_alt : expected mutant amino acid at that position

    Raises AssertionError if the transcript has no transcript-level effect,
    if the effect is not a Substitution, or if the position / residue differ.
    """
    variant = Variant(chrom, dna_position, dna_ref, dna_alt, ensembl)

    # Index every transcript-level effect by the ID of its transcript.
    effects_by_transcript = {}
    for candidate in variant.effects():
        if isinstance(candidate, TranscriptMutationEffect):
            effects_by_transcript[candidate.transcript.id] = candidate

    assert ensembl_transcript_id in effects_by_transcript, \
        "%s not found in %s" % (ensembl_transcript_id, effects_by_transcript)
    effect = effects_by_transcript[ensembl_transcript_id]

    # exonic splice site mutations carry with them an alternate effect
    # which is what we check against dbNSFP (since that database seemed
    # to ignore exonic splicing mutations)
    if isinstance(effect, ExonicSpliceSite):
        effect = effect.alternate_effect

    assert isinstance(effect, Substitution), \
        "Expected substitution (aa_pos=%d, aa_alt=%s) but got %s" % (
            aa_pos, aa_alt, effect)

    # The effect's offset is 0-based while aa_pos is 1-based.
    offset = effect.aa_mutation_start_offset
    observed_aa = effect.mutant_protein_sequence[offset]
    position_matches = (offset + 1 == aa_pos)
    residue_matches = (observed_aa == aa_alt)
    assert position_matches and residue_matches, \
        "Mutant amino acid %s not found at %d for chr%s:%s %s>%s : %s" % (
            aa_alt,
            aa_pos,
            chrom,
            dna_position,
            dna_ref,
            dna_alt,
            effect)
Ejemplo n.º 3
0
def test_specific_variant_mouse_with_ensembl_genome():
    """
    Verify that G>T at mouse 11:101177240 gives two effects, one of them a
    Substitution that turns the cysteine at 0-based index 25 into
    phenylalanine.
    """
    # Exon #2 at http://useast.ensembl.org/Mus_musculus/Transcript/Exons?
    # db=core;g=ENSMUSG00000017167;r=11:101170523-101190724;t=ENSMUST00000103109
    snv = Variant(
        contig=11, start=101177240, ref="G", alt="T",
        ensembl=ensembl_mouse_genome)
    predicted = snv.effects()
    eq_(len(predicted), 2)

    only_substitutions = [e for e in predicted if isinstance(e, Substitution)]
    eq_(len(only_substitutions), 1)
    sub = only_substitutions[0]

    # The coding sequence through the sub:
    # ATGATGAGTCTCCGGCTCTTCAGCATCCTGCTCGCCACG
    # GTGGTCTCTGGAGCTTGGGGCTGGGGCTACTACGGTTGC
    # (The final G is the sub: the 77th nucleotide)
    # TGC (C) -> TTC (F)
    # 78 / 3 = 26
    # 0-base = 25
    aa_index = 25
    eq_(sub.mutant_protein_sequence[aa_index], "F")
    eq_(sub.original_protein_sequence[aa_index], "C")
Ejemplo n.º 4
0
def generate_random_missense_variants(num_variants=10,
                                      max_search=100000,
                                      reference="GRCh37"):
    """
    Generate a random collection of missense variants by trying random
    variants repeatedly.

    Each attempt draws a random single-base ref/alt pair (alt always differs
    from ref), a contig from "1".."5" and a start position in [1, 1000000],
    and keeps the variant if any of its predicted effects is a Substitution.

    num_variants : stop as soon as this many variants have been collected;
        a value <= 0 yields an empty collection
    max_search : maximum number of random variants to try
    reference : passed to Variant as its `ensembl` argument

    Returns a VariantCollection holding at most `num_variants` variants
    (fewer if `max_search` attempts were exhausted first).
    """
    bases = ("A", "C", "T", "G")
    contigs = ("1", "2", "3", "4", "5")
    variants = []
    for _ in range(max_search):
        # Checked before drawing so that num_variants <= 0 terminates at once
        # instead of searching all the way to max_search.
        if len(variants) >= num_variants:
            break
        random_ref = choice(bases)
        random_alt = choice([base for base in bases if base != random_ref])
        random_contig = choice(contigs)
        random_variant = Variant(contig=random_contig,
                                 start=randint(1, 1000000),
                                 ref=random_ref,
                                 alt=random_alt,
                                 ensembl=reference)
        try:
            is_missense = any(
                isinstance(effect, Substitution)
                for effect in random_variant.effects())
        except Exception:
            # Best effort: a random variant whose effects cannot be predicted
            # is skipped. KeyboardInterrupt / SystemExit still propagate.
            continue
        if is_missense:
            variants.append(random_variant)
    return VariantCollection(variants)
Ejemplo n.º 5
0
def test_STAT1_stop_gain_at_exon_boundary():
    """
    The G>A variant at 2:191872291 (GRCh37) must include an ExonicSpliceSite
    effect, yet its top priority effect must be a PrematureStop.
    """
    # top priority effect for this variant should be PrematureStop,
    # even though it's also ExonicSpliceSite
    variant = Variant("2", "191872291", "G", "A", "GRCh37")
    predicted_effects = variant.effects()
    print(predicted_effects)

    has_exonic_splice_site = any(
        effect.__class__ is ExonicSpliceSite for effect in predicted_effects)
    assert has_exonic_splice_site

    highest_priority = predicted_effects.top_priority_effect()
    print(highest_priority)
    assert highest_priority.__class__ is PrematureStop
Ejemplo n.º 6
0
def test_HRAS_G13V_in_cancer_driver_genes_and_variants():
    """
    HRAS p.G13V should be annotated with truthy driver-variant and
    driver-gene columns and falsy IFNG-response and class I MHC columns.
    """
    variant = Variant("11", 534285, "C", "A", "GRCh37")
    top_effect = variant.effects().top_priority_effect()
    eq_(top_effect.gene.name, "HRAS")
    eq_(top_effect.short_description, "p.G13V")

    annotations = GenePathwayCheck().make_variant_dict(variant)
    # Columns expected to be unset for this variant.
    for column in (_IFNG_RESPONSE_COLUMN_NAME, _CLASS_I_MHC_COLUMN_NAME):
        assert not annotations[column]
    # Columns expected to be set for this variant.
    for column in (_DRIVER_VARIANT_COLUMN_NAME, _DRIVER_GENE_COLUMN_NAME):
        assert annotations[column]
Ejemplo n.º 7
0
def test_HRAS_G13C_in_cancer_driver_genes():
    """
    HRAS p.G13C should be annotated with a truthy driver-gene column but
    falsy driver-variant, IFNG-response and class I MHC columns.
    """
    variant = Variant("11", 534286, "C", "A", "GRCh37")
    top_effect = variant.effects().top_priority_effect()
    eq_(top_effect.gene.name, "HRAS")
    eq_(top_effect.short_description, "p.G13C")

    annotations = GenePathwayCheck().make_variant_dict(variant)
    for column in (_IFNG_RESPONSE_COLUMN_NAME, _CLASS_I_MHC_COLUMN_NAME):
        assert not annotations[column]
    # even though it's a RAS G13 variant, it's not actually that common
    # and thus didn't make the threshold for our source dataset
    assert not annotations[_DRIVER_VARIANT_COLUMN_NAME]
    assert annotations[_DRIVER_GENE_COLUMN_NAME]
Ejemplo n.º 8
0
def _get_effect(chrom, pos, dna_ref, dna_alt, transcript_id):
    """
    Return the top priority effect of a variant on one transcript.

    Builds a Variant from (chrom, pos, dna_ref, dna_alt), looks up the top
    priority effect for `transcript_id`, and, when that effect is an
    ExonicSpliceSite, returns its alternate effect instead.

    Raises AssertionError if the transcript ID has no effect for the variant.
    """
    variant = Variant(chrom, pos, dna_ref, dna_alt, ensembl=ensembl)
    top_effect_by_transcript = (
        variant.effects().top_priority_effect_per_transcript_id())
    assert transcript_id in top_effect_by_transcript, \
        "Expected transcript ID %s for variant %s not found in %s" % (
            transcript_id, variant, top_effect_by_transcript)
    chosen = top_effect_by_transcript[transcript_id]

    # COSMIC seems to ignore exonic splice sites
    if not isinstance(chosen, ExonicSpliceSite):
        return chosen
    return chosen.alternate_effect
Ejemplo n.º 9
0
def _get_effect(chrom, pos, dna_ref, dna_alt, transcript_id):
    """
    Look up the top priority effect of the described variant on the
    transcript `transcript_id`.

    An ExonicSpliceSite effect is replaced by its `alternate_effect` before
    being returned. Raises AssertionError when the variant has no effect on
    the requested transcript.
    """
    variant = Variant(chrom, pos, dna_ref, dna_alt, ensembl=ensembl)
    per_transcript = variant.effects().top_priority_effect_per_transcript_id()
    assert transcript_id in per_transcript, \
        "Expected transcript ID %s for variant %s not found in %s" % (
            transcript_id, variant, per_transcript)
    selected_effect = per_transcript[transcript_id]

    # COSMIC seems to ignore exonic splice sites
    is_exonic_splice = isinstance(selected_effect, ExonicSpliceSite)
    return selected_effect.alternate_effect if is_exonic_splice else selected_effect
Ejemplo n.º 10
0
    def test_Varcode(self):
        """
        For each known variant, the class name of the top priority effect
        must equal the expected effect name.
        """
        # chr, start_pos, reference allele, alternate allele, worst mutation effect
        expected_by_variant = (
            (17, 7573996, 'A', 'G', 'Substitution'),
            (2, 198283615, 'C', 'G', 'IntronicSpliceSite'),
            (19, 47503648, 'G', 'A', 'PrematureStop'),
            (14, 69256615, 'CGGTGGCAGCGG', '', 'Deletion'),
            (5, 112175217, 'A', '', 'FrameShift'),
        )

        for contig, start, ref, alt, expected_name in expected_by_variant:
            variant = Variant(contig=contig,
                              start=start,
                              ref=ref,
                              alt=alt,
                              ensembl=ensembl_grch37)
            top_effect = variant.effects().top_priority_effect()
            self.assertEqual(top_effect.__class__.__name__, expected_name)
Ejemplo n.º 11
0
def get_varcode_annotations(genotypes, vcf_id, ensembl_release_num):
    """Annotate every genotype row of one VCF with its top priority effect.

    Selects contig, position, reference and alternates from the `genotypes`
    table for rows whose `vcf_id` matches, builds a Variant for each row
    against EnsemblRelease(ensembl_release_num), and takes the top priority
    effect reported for it.

    Return a list with one 8-element list per row:
    [contig, position, reference, alternates, gene_name, transcript_id,
     notation, effect_type]
    where `notation` is the effect's short description ("intragenic" for
    Intragenic effects) and `effect_type` is the effect class name with a
    space inserted at each lower-to-upper case boundary
    (e.g. "PrematureStop" -> "Premature Stop").
    """
    results = select([
            genotypes.c.contig,
            genotypes.c.position,
            genotypes.c.reference,
            genotypes.c.alternates
        ]).where(genotypes.c.vcf_id == vcf_id).execute()

    ensembl_rel = EnsemblRelease(ensembl_release_num)

    varcode_annotations = []
    for contig, position, reference, alternates in results:
        # ref/alt are ASCII-encoded (non-ASCII characters dropped) before
        # being handed to Variant; the returned row keeps the original values.
        variant = Variant(contig=contig,
                          start=position,
                          ref=reference.encode('ascii', 'ignore'),
                          alt=alternates.encode('ascii', 'ignore'),
                          ensembl=ensembl_rel)

        # This will give us a single, yet relevant effect
        best_effect = variant.effects().top_priority_effect()
        gene_name = best_effect.gene_name
        transcript = best_effect.transcript_id
        effect_class_name = type(best_effect).__name__
        if effect_class_name == "Intragenic":
            notation = "intragenic"
        else:
            notation = best_effect.short_description
        # Make it human readable. Raw strings keep "\g<1>" a regex group
        # reference rather than an invalid string escape sequence.
        effect_type = re.sub(
            r"([a-z])([A-Z])", r"\g<1> \g<2>", effect_class_name)
        varcode_annotations.append([contig,
                                    position,
                                    reference,
                                    alternates,
                                    gene_name,
                                    transcript,
                                    notation,
                                    effect_type])

    return varcode_annotations
Ejemplo n.º 12
0
def validate_transcript_mutation(
        ensembl_transcript_id,
        chrom,
        dna_position,
        dna_ref,
        dna_alt,
        aa_pos,
        aa_alt):
    """
    Check that a DNA variant yields the expected amino acid substitution on
    the transcript `ensembl_transcript_id` (GRCh37 annotation).

    `aa_pos` is the expected 1-based protein position and `aa_alt` the
    expected mutant residue. Raises AssertionError when the transcript has no
    transcript-level effect, when the effect is not a Substitution, or when
    the position or residue do not match.
    """
    variant = Variant(chrom, dna_position, dna_ref, dna_alt, ensembl_grch37)
    transcript_effects = [
        candidate
        for candidate in variant.effects()
        if isinstance(candidate, TranscriptMutationEffect)
    ]
    effect_lookup = {
        candidate.transcript.id: candidate
        for candidate in transcript_effects
    }
    assert ensembl_transcript_id in effect_lookup, \
        "%s not found in %s" % (ensembl_transcript_id, effect_lookup)

    matched = effect_lookup[ensembl_transcript_id]
    # exonic splice site mutations carry with them an alternate effect
    # which is what we check against dbNSFP (since that database seemed
    # to ignore exonic splicing mutations)
    effect = (
        matched.alternate_effect
        if isinstance(matched, ExonicSpliceSite)
        else matched)

    assert isinstance(effect, Substitution), \
        "Expected substitution (aa_pos=%d, aa_alt=%s) but got %s" % (
            aa_pos, aa_alt, effect)

    # aa_mutation_start_offset is 0-based; aa_pos is 1-based.
    zero_based_pos = effect.aa_mutation_start_offset
    mutant_residue = effect.mutant_protein_sequence[zero_based_pos]
    assert (zero_based_pos + 1, mutant_residue) == (aa_pos, aa_alt), \
        "Mutant amino acid %s not found at %d for chr%s:%s %s>%s : %s" % (
            aa_alt,
            aa_pos,
            chrom,
            dna_position,
            dna_ref,
            dna_alt,
            effect)
Ejemplo n.º 13
0
 def get_variant_classification(contig,
                                start,
                                ref,
                                alt,
                                genome=ensembl_grch38):
     """
     Classify a variant by allele lengths and by its top priority effect.

     :param contig: contig passed to Variant
     :param start: start position passed to Variant
     :param ref: reference allele
     :param alt: alternate allele
     :param genome: passed to Variant as its `ensembl` argument
     :return: tuple (variant_type, consequence, weight) where variant_type is
         'Deletion', 'Insertion' or 'Mismatch' depending on whether ref is
         longer than, shorter than, or the same length as alt; consequence is
         the class name of the top priority effect; and weight is its entry
         in CONSEQUENCE_WEIGHTING (0 when absent). If effect prediction
         raises an Exception, consequence is 'Unclassified' and weight is 0.
     """
     try:
         var = Variant(contig=contig,
                       start=start,
                       ref=ref,
                       alt=alt,
                       ensembl=genome)
         top_effect = var.effects().top_priority_effect()
         consequence = top_effect.__class__.__name__
         weight = CONSEQUENCE_WEIGHTING.get(consequence, 0)
     except Exception:
         consequence = 'Unclassified'
         weight = 0

     # Deliberately outside any `finally`: returning from `finally` would
     # silently discard KeyboardInterrupt / SystemExit raised above.
     if len(ref) > len(alt):
         variant_type = 'Deletion'
     elif len(ref) < len(alt):
         variant_type = 'Insertion'
     else:
         variant_type = 'Mismatch'
     return variant_type, consequence, weight
Ejemplo n.º 14
0
def generate_random_missense_variants(num_variants=10, max_search=100000, reference="GRCh37"):
    """
    Generate a random collection of missense variants by trying random variants repeatedly.

    Every attempt picks a random reference base, a different alternate base,
    a contig from "1".."5" and a start position in [1, 1000000]; the variant
    is kept when at least one of its predicted effects is a Substitution.

    num_variants : number of variants to collect before stopping; a value
        <= 0 produces an empty collection
    max_search : upper bound on the number of random variants tried
    reference : passed to Variant as its `ensembl` argument

    Returns a VariantCollection with at most `num_variants` variants.
    """
    nucleotides = ("A", "C", "T", "G")
    candidate_contigs = ("1", "2", "3", "4", "5")
    variants = []
    attempts = 0
    # `<` rather than `==` so a non-positive num_variants stops immediately
    # instead of running until max_search.
    while attempts < max_search and len(variants) < num_variants:
        attempts += 1
        random_ref = choice(nucleotides)
        random_alt = choice([nt for nt in nucleotides if nt != random_ref])
        random_contig = choice(candidate_contigs)
        random_variant = Variant(contig=random_contig, start=randint(1, 1000000),
                                 ref=random_ref, alt=random_alt, ensembl=reference)
        try:
            found_substitution = any(
                isinstance(effect, Substitution)
                for effect in random_variant.effects())
        except Exception:
            # Best effort: skip variants whose effects cannot be predicted,
            # but let KeyboardInterrupt / SystemExit through.
            continue
        if found_substitution:
            variants.append(random_variant)
    return VariantCollection(variants)
Ejemplo n.º 15
0
def test_mm10_Klf6_frameshift():
    """
    Inserting a G at chr13:5864876 on mm10 should leave exactly one effect
    once silent and noncoding effects are dropped, and that effect must pass
    validate_effect_values.
    """
    insertion = Variant("chr13", 5864876, "", "G", "mm10")
    coding_effects = insertion.effects().drop_silent_and_noncoding()
    eq_(len(coding_effects), 1)
    sole_effect = coding_effects[0]
    validate_effect_values(sole_effect)
Ejemplo n.º 16
0
def test_mm10_Klf6_frameshift():
    """
    Inserting a G at chr13:5864876 on GRCm38 should produce exactly one
    effect, and that effect must pass validate_effect_values.
    """
    insertion = Variant("chr13", 5864876, "", "G", "GRCm38")
    predicted_effects = insertion.effects()
    eq_(len(predicted_effects), 1)
    sole_effect = predicted_effects[0]
    validate_effect_values(sole_effect)
Ejemplo n.º 17
0
def create_epitope_varcode(chrm, start, ref, alt, db, transcript):
    """
    This function computes and returns the epitope for a given variant using
    the package Varcode (Ensembl).

    The effect of the variant on the requested transcript is looked up; when
    it is a frameshift, substitution, deletion or insertion, a window of up
    to 12 residues before and 12 residues after the mutation offset is cut
    from the original and the mutant protein sequences (for frameshifts the
    mutant window runs to the end of the mutant protein; for insertions the
    window is extended by the number of inserted codons). Any reason for not
    producing an epitope is appended to the error flags instead.

    :param chrm: the chromosome
    :param start: the start position
    :param ref: the original sequence
    :param alt: the mutated sequence
    :param db: the Ensembl database to use
    :param transcript: the transcript ID
    :return: an epitope (position, error flags, original sequence, mutated
        sequence). Position is -1 and both sequences are '-' when no effect
        or no valid protein mutation was found; the error flags string is
        always prefixed with "Flags:".
    """
    # Number of residues kept on each side of the mutated position.
    flank = 12

    # Retrieve variant info
    vinfo = Variant(contig=chrm, start=start, ref=ref, alt=alt, ensembl=db)
    # First effect on the requested transcript, or None when there is none
    # (indexing a possibly empty list here would raise IndexError and make
    # the "could not infer the effect" branch unreachable).
    effect = next(
        (candidate for candidate in vinfo.effects()
         if candidate.transcript_id == transcript),
        None)
    errors = "Flags:"
    wt_mer = '-'
    mut_mer = '-'
    pos = -1
    if effect is None:
        errors += ' could not infer the effect'
        return pos, errors, wt_mer, mut_mer

    # Retrieve effect type
    protein_mut = effect.short_description
    if protein_mut is None:
        errors += ' could not retrieve AA mutation'
    elif not protein_mut.startswith('p.'):
        errors += ' invalid mutation {}'.format(protein_mut)
    elif protein_mut.startswith('p.X'):
        errors += ' mutation occurs in stop codon'
    else:
        # Retrieve pos
        pos = effect.aa_mutation_start_offset
        if pos is None:
            errors += ' could not find the position for this mutation'
        elif pos == 0:
            errors += ' can not code for this mutated position'
        elif pos == 1:
            errors += ' mutation occurs in start codon'
        elif effect.mutant_protein_sequence is None \
                or effect.original_protein_sequence is None:
            errors += ' could not retrieve protein sequence'
        else:
            original_seq = effect.original_protein_sequence
            mutant_seq = effect.mutant_protein_sequence
            # Clamp at 0: a negative slice start would wrap around to the
            # end of the protein when the mutation is near the N-terminus.
            window_start = max(pos - flank, 0)
            window_end = pos + flank + 1
            # Type of effect
            effect_type = type(effect).__name__
            if 'Stop' in effect_type:
                errors += ' stop mutation'
            elif 'FrameShift' in effect_type:
                wt_mer = original_seq[window_start:window_end]
                mut_mer = mutant_seq[window_start:]
            elif 'Substitution' in effect_type \
                    or 'Deletion' in effect_type:
                wt_mer = original_seq[window_start:window_end]
                mut_mer = mutant_seq[window_start:window_end]
            elif 'Insertion' in effect_type:
                # Whole codons added by the insertion widen the window.
                size = abs(len(ref) - len(alt)) // 3
                wt_mer = original_seq[window_start:window_end + size]
                mut_mer = mutant_seq[window_start:window_end + size]
            else:
                errors += ' unknown exonic function {}'.format(effect_type)
    return pos, errors, wt_mer, mut_mer
Ejemplo n.º 18
0
def test_mm10_Klf6_frameshift():
    """
    A G insertion at chr13:5864876 (GRCm38) is expected to have a single
    predicted effect, which is then checked with validate_effect_values.
    """
    klf6_insertion = Variant("chr13", 5864876, "", "G", "GRCm38")
    effect_collection = klf6_insertion.effects()
    eq_(len(effect_collection), 1)
    validate_effect_values(effect_collection[0])