def test_specific_variant_mouse_with_ensembl_genome():
    """
    A G>T variant on mouse contig 11 should produce two effects, exactly one
    of which is a Substitution changing residue 26 from C to F.
    """
    # Exon #2 at http://useast.ensembl.org/Mus_musculus/Transcript/Exons?
    # db=core;g=ENSMUSG00000017167;r=11:101170523-101190724;t=ENSMUST00000103109
    mouse_variant = Variant(
        contig=11,
        start=101177240,
        ref="G",
        alt="T",
        ensembl=ensembl_mouse_genome)
    all_effects = mouse_variant.effects()
    eq_(len(all_effects), 2)

    # Keep only the amino acid substitutions among the predicted effects.
    substitutions = []
    for candidate in all_effects:
        if isinstance(candidate, Substitution):
            substitutions.append(candidate)
    eq_(len(substitutions), 1)
    only_substitution = substitutions[0]

    # The coding sequence through the sub:
    # ATGATGAGTCTCCGGCTCTTCAGCATCCTGCTCGCCACG
    # GTGGTCTCTGGAGCTTGGGGCTGGGGCTACTACGGTTGC
    # (The final G is the sub: the 77th nucleotide)
    # TGC (C) -> TTC (F)
    # 78 / 3 = 26
    # 0-base = 25
    residue_index = 25
    eq_(only_substitution.mutant_protein_sequence[residue_index], "F")
    eq_(only_substitution.original_protein_sequence[residue_index], "C")
def validate_transcript_mutation(ensembl_transcript_id, chrom, dna_position,
                                 dna_ref, dna_alt, aa_pos, aa_alt):
    """
    Assert that a DNA variant produces the expected amino acid substitution
    on the given transcript.

    `aa_pos` is compared against the effect's `aa_mutation_start_offset + 1`,
    and `aa_alt` against the residue at that offset in the mutant protein
    sequence. Raises AssertionError when the transcript has no transcript
    mutation effect, when the effect is not a Substitution, or when the
    position/residue do not match.
    """
    variant = Variant(chrom, dna_position, dna_ref, dna_alt, ensembl)

    # Index every transcript-level effect by the ID of its transcript.
    effects_by_transcript = {}
    for candidate in variant.effects():
        if isinstance(candidate, TranscriptMutationEffect):
            effects_by_transcript[candidate.transcript.id] = candidate

    assert ensembl_transcript_id in effects_by_transcript, \
        "%s not found in %s" % (ensembl_transcript_id, effects_by_transcript)
    chosen = effects_by_transcript[ensembl_transcript_id]

    if isinstance(chosen, ExonicSpliceSite):
        # exonic splice site mutations carry with them an alternate effect
        # which is what we check against dbNSFP (since that database seemed
        # to ignore exonic splicing mutations)
        chosen = chosen.alternate_effect

    assert isinstance(chosen, Substitution), \
        "Expected substitution (aa_pos=%d, aa_alt=%s) but got %s" % (
            aa_pos, aa_alt, chosen)

    observed_offset = chosen.aa_mutation_start_offset
    observed_residue = chosen.mutant_protein_sequence[observed_offset]
    assert (
        observed_offset + 1 == aa_pos and
        observed_residue == aa_alt), \
        "Mutant amino acid %s not found at %d for chr%s:%s %s>%s : %s" % (
            aa_alt,
            aa_pos,
            chrom,
            dna_position,
            dna_ref,
            dna_alt,
            chosen)
def generate_random_missense_variants(num_variants=10, max_search=100000,
                                      reference="GRCh37"):
    """
    Generate a random collection of missense variants by trying random
    single-nucleotide variants repeatedly.

    Each attempt draws a random reference base, a different alternate base,
    a contig from "1" through "5" and a start position between 1 and
    1000000. The variant is kept if any of its predicted effects is a
    Substitution.

    :param num_variants: number of variants to collect before stopping
    :param max_search: maximum number of random variants to try
    :param reference: value passed to Variant as its `ensembl` argument
    :return: a VariantCollection holding at most `num_variants` variants
        (fewer when `max_search` attempts are exhausted first)
    """
    all_bases = ("A", "C", "T", "G")
    contigs = ("1", "2", "3", "4", "5")
    variants = []
    for _ in range(max_search):
        # Checking with >= at the top also handles num_variants <= 0, which
        # an equality test after appending could overshoot.
        if len(variants) >= num_variants:
            break
        random_ref = choice(all_bases)
        random_alt = choice(
            [base for base in all_bases if base != random_ref])
        random_contig = choice(contigs)
        random_variant = Variant(
            contig=random_contig,
            start=randint(1, 1000000),
            ref=random_ref,
            alt=random_alt,
            ensembl=reference)
        try:
            is_missense = any(
                isinstance(effect, Substitution)
                for effect in random_variant.effects())
        except Exception:
            # Best-effort search: a variant whose effects cannot be predicted
            # is skipped. Catching Exception rather than using a bare except
            # keeps KeyboardInterrupt/SystemExit able to stop the loop.
            continue
        if is_missense:
            variants.append(random_variant)
    return VariantCollection(variants)
def test_STAT1_stop_gain_at_exon_boundary():
    """
    The STAT1 variant at 2:191872291 G>A has an ExonicSpliceSite effect, but
    its top priority effect must be a PrematureStop.
    """
    # top priority effect for this variant should be PrematureStop,
    # even though it's also ExonicSpliceSite
    variant = Variant("2", "191872291", "G", "A", "GRCh37")
    predicted = variant.effects()
    print(predicted)
    assert any(e.__class__ is ExonicSpliceSite for e in predicted)

    highest_priority = predicted.top_priority_effect()
    print(highest_priority)
    assert highest_priority.__class__ is PrematureStop
def test_HRAS_G13V_in_cancer_driver_genes_and_variants():
    """
    HRAS p.G13V should be flagged as both a driver variant and a driver
    gene, and not as an IFN-gamma response or class I MHC pathway hit.
    """
    HRAS_G13V = Variant("11", 534285, "C", "A", "GRCh37")
    top_effect = HRAS_G13V.effects().top_priority_effect()
    eq_(top_effect.gene.name, "HRAS")
    eq_(top_effect.short_description, "p.G13V")

    checker = GenePathwayCheck()
    variant_info = checker.make_variant_dict(HRAS_G13V)

    # (column, expected truthiness), checked in this order
    expected_flags = (
        (_IFNG_RESPONSE_COLUMN_NAME, False),
        (_CLASS_I_MHC_COLUMN_NAME, False),
        (_DRIVER_VARIANT_COLUMN_NAME, True),
        (_DRIVER_GENE_COLUMN_NAME, True),
    )
    for column_name, expected in expected_flags:
        assert bool(variant_info[column_name]) is expected
def test_HRAS_G13C_in_cancer_driver_genes():
    """
    HRAS p.G13C should be flagged as a driver gene only: not a driver
    variant, not an IFN-gamma response hit and not a class I MHC hit.
    """
    HRAS_G13C = Variant("11", 534286, "C", "A", "GRCh37")
    predicted_effects = HRAS_G13C.effects()
    top_effect = predicted_effects.top_priority_effect()
    eq_(top_effect.gene.name, "HRAS")
    eq_(top_effect.short_description, "p.G13C")

    pathway_checker = GenePathwayCheck()
    flags = pathway_checker.make_variant_dict(HRAS_G13C)

    assert not flags[_IFNG_RESPONSE_COLUMN_NAME]
    assert not flags[_CLASS_I_MHC_COLUMN_NAME]

    # even though it's a RAS G13 variant, it's not actually that common
    # and thus didn't make the threshold for our source dataset
    assert not flags[_DRIVER_VARIANT_COLUMN_NAME]

    assert flags[_DRIVER_GENE_COLUMN_NAME]
def _get_effect(chrom, pos, dna_ref, dna_alt, transcript_id):
    """
    Return the top priority effect of a variant on one transcript.

    When that effect is an ExonicSpliceSite, its `alternate_effect` is
    returned instead. Raises AssertionError if the variant has no effect on
    `transcript_id`.
    """
    variant = Variant(chrom, pos, dna_ref, dna_alt, ensembl=ensembl)
    by_transcript = variant.effects().top_priority_effect_per_transcript_id()
    assert transcript_id in by_transcript, \
        "Expected transcript ID %s for variant %s not found in %s" % (
            transcript_id, variant, by_transcript)
    chosen = by_transcript[transcript_id]

    # COSMIC seems to ignore exonic splice sites
    return (
        chosen.alternate_effect
        if isinstance(chosen, ExonicSpliceSite)
        else chosen)
def test_Varcode(self):
    """
    For a fixed set of GRCh37 variants, the class name of the top priority
    effect must equal the expected worst mutation effect.
    """
    # chr, start_pos, reference allele, alternate allele, worst mutation effect
    expected_effects = (
        (17, 7573996, 'A', 'G', 'Substitution'),
        (2, 198283615, 'C', 'G', 'IntronicSpliceSite'),
        (19, 47503648, 'G', 'A', 'PrematureStop'),
        (14, 69256615, 'CGGTGGCAGCGG', '', 'Deletion'),
        (5, 112175217, 'A', '', 'FrameShift'),
    )
    for contig, start_pos, ref_allele, alt_allele, worst_effect in expected_effects:
        variant = Variant(
            contig=contig,
            start=start_pos,
            ref=ref_allele,
            alt=alt_allele,
            ensembl=ensembl_grch37)
        top_effect = variant.effects().top_priority_effect()
        self.assertEqual(top_effect.__class__.__name__, worst_effect)
def get_varcode_annotations(genotypes, vcf_id, ensembl_release_num):
    """Get contig, position, ref and alt data from the genotypes table for
    one VCF, and get the best effect of each variant from the Varcode library.

    `genotypes` is queried through its `.c` columns (contig, position,
    reference, alternates, vcf_id); only rows whose vcf_id equals `vcf_id`
    are annotated. `ensembl_release_num` is passed to EnsemblRelease.

    Return a list with one entry per row, of the form:
    [[contig, position, reference, alternates, gene_name, transcript_id,
      notation, effect_type], ...]
    where `notation` is "intragenic" for Intragenic effects and the effect's
    short description otherwise, and `effect_type` is the effect class name
    with a space inserted at each lower-to-upper case boundary.
    """
    results = select([
        genotypes.c.contig,
        genotypes.c.position,
        genotypes.c.reference,
        genotypes.c.alternates
    ]).where(genotypes.c.vcf_id == vcf_id).execute()

    ensembl_rel = EnsemblRelease(ensembl_release_num)
    varcode_annotations = []

    for contig, position, reference, alternates in results:
        # NOTE(review): .encode() yields bytes on Python 3; presumably this
        # was written for Python 2 unicode values -- confirm what Variant
        # expects before changing.
        variant = Variant(contig=contig,
                          start=position,
                          ref=reference.encode('ascii', 'ignore'),
                          alt=alternates.encode('ascii', 'ignore'),
                          ensembl=ensembl_rel)

        # This will give us a single, yet relevant effect
        best_effect = variant.effects().top_priority_effect()
        gene_name = best_effect.gene_name
        transcript = best_effect.transcript_id

        effect_type = type(best_effect).__name__
        if effect_type == "Intragenic":
            notation = "intragenic"
        else:
            notation = best_effect.short_description

        # Make it human readable. Raw strings are required here: "\g" is
        # not a valid string escape and triggers a warning in a plain
        # string literal.
        effect_type = re.sub(r"([a-z])([A-Z])", r"\g<1> \g<2>", effect_type)

        varcode_annotations.append([contig, position, reference, alternates,
                                    gene_name, transcript, notation,
                                    effect_type])

    return varcode_annotations
def validate_transcript_mutation(
        ensembl_transcript_id,
        chrom,
        dna_position,
        dna_ref,
        dna_alt,
        aa_pos,
        aa_alt):
    """
    Check that a GRCh37 DNA variant yields the expected amino acid
    substitution on one transcript.

    The effect's `aa_mutation_start_offset + 1` must equal `aa_pos` and the
    mutant protein residue at that offset must equal `aa_alt`. Any mismatch,
    a missing transcript, or a non-Substitution effect raises AssertionError.
    """
    variant = Variant(chrom, dna_position, dna_ref, dna_alt, ensembl_grch37)
    predicted = variant.effects()

    # Map transcript ID -> transcript-level effect.
    transcript_id_to_effect = dict(
        (e.transcript.id, e)
        for e in predicted
        if isinstance(e, TranscriptMutationEffect))
    assert ensembl_transcript_id in transcript_id_to_effect, \
        "%s not found in %s" % (ensembl_transcript_id, transcript_id_to_effect)

    result = transcript_id_to_effect[ensembl_transcript_id]
    # exonic splice site mutations carry with them an alternate effect
    # which is what we check against dbNSFP (since that database seemed
    # to ignore exonic splicing mutations)
    if isinstance(result, ExonicSpliceSite):
        result = result.alternate_effect

    assert isinstance(result, Substitution), \
        "Expected substitution (aa_pos=%d, aa_alt=%s) but got %s" % (
            aa_pos, aa_alt, result)

    offset = result.aa_mutation_start_offset
    mutant_residue = result.mutant_protein_sequence[offset]
    assert (offset + 1 == aa_pos and mutant_residue == aa_alt), \
        "Mutant amino acid %s not found at %d for chr%s:%s %s>%s : %s" % (
            aa_alt,
            aa_pos,
            chrom,
            dna_position,
            dna_ref,
            dna_alt,
            result)
def get_variant_classification(contig, start, ref, alt, genome=ensembl_grch38):
    """
    Classify a variant by allele lengths and by its top priority effect.

    :param contig: contig passed to Variant
    :param start: start position passed to Variant
    :param ref: reference allele; its length is compared with `alt`
    :param alt: alternate allele
    :param genome: value passed to Variant as its `ensembl` argument
    :return: a tuple (variant_type, consequence, weight) where variant_type
        is 'Deletion', 'Insertion' or 'Mismatch' depending on whether `ref`
        is longer than, shorter than, or as long as `alt`; consequence is
        the class name of the top priority effect ('Unclassified' if effect
        prediction raised); weight is looked up in CONSEQUENCE_WEIGHTING
        (0 when missing or unclassified)
    """
    try:
        var = Variant(contig=contig, start=start, ref=ref, alt=alt,
                      ensembl=genome)
        top_effect = var.effects().top_priority_effect()
        consequence = top_effect.__class__.__name__
        weight = CONSEQUENCE_WEIGHTING.get(consequence, 0)
    except Exception:
        # Best-effort: any prediction failure yields an unclassified result.
        consequence = 'Unclassified'
        weight = 0

    # The returns live outside the try statement on purpose: a `return`
    # inside `finally` would discard exceptions not handled above (e.g.
    # KeyboardInterrupt) and then fail on the unbound consequence/weight.
    if len(ref) > len(alt):
        variant_type = 'Deletion'
    elif len(ref) < len(alt):
        variant_type = 'Insertion'
    else:
        variant_type = 'Mismatch'
    return variant_type, consequence, weight
def test_mm10_Klf6_frameshift():
    """
    A single-base insertion at chr13:5864876 on mm10 should leave exactly
    one effect once silent and noncoding effects are dropped, and that
    effect must pass validate_effect_values.
    """
    klf6_insertion = Variant("chr13", 5864876, "", "G", "mm10")
    all_effects = klf6_insertion.effects()
    coding_effects = all_effects.drop_silent_and_noncoding()
    eq_(len(coding_effects), 1)
    remaining_effect = coding_effects[0]
    validate_effect_values(remaining_effect)
def test_mm10_Klf6_frameshift():
    """
    A single-base insertion at chr13:5864876 on GRCm38 should produce
    exactly one effect, and that effect must pass validate_effect_values.
    """
    klf6_insertion = Variant("chr13", 5864876, "", "G", "GRCm38")
    predicted = klf6_insertion.effects()
    eq_(len(predicted), 1)
    sole_effect = predicted[0]
    validate_effect_values(sole_effect)
def create_epitope_varcode(chrm, start, ref, alt, db, transcript):
    """
    Compute the wild-type and mutated peptide windows (epitope) around a
    variant on a given transcript, using the package Varcode (Ensembl).

    The window covers 12 residues before the mutated position and 12 after
    it. For frameshifts the mutated peptide runs to the end of the mutant
    protein; for insertions both windows are extended by the number of
    inserted codons.

    :param chrm: the chromosome
    :param start: the start position
    :param ref: the original sequence
    :param alt: the mutated sequence
    :param db: the Ensembl database to use
    :param transcript: the transcript ID
    :return: a tuple (position, error flags, original sequence, mutated
        sequence). The flags string is "Flags:" followed by one message when
        the epitope could not be computed; the sequences are '-' in that
        case, and the position is -1 when it was never retrieved.
    """
    # Retrieve variant info
    vinfo = Variant(contig=chrm, start=start, ref=ref, alt=alt, ensembl=db)
    # next(..., None) rather than [...][0]: a variant with no effect on the
    # requested transcript is reported through the flags, not an IndexError.
    effect = next(
        (candidate for candidate in vinfo.effects()
         if candidate.transcript_id == transcript),
        None)

    errors = "Flags:"
    wt_mer = '-'
    mut_mer = '-'
    pos = -1

    if effect is None:
        return pos, errors + ' could not infer the effect', wt_mer, mut_mer

    # Retrieve effect type
    protein_mut = effect.short_description
    if protein_mut is None:
        return pos, errors + ' could not retrieve AA mutation', wt_mer, mut_mer
    if not protein_mut.startswith('p.'):
        errors += ' invalid mutation {}'.format(protein_mut)
        return pos, errors, wt_mer, mut_mer
    if protein_mut.startswith('p.X'):
        return pos, errors + ' mutation occurs in stop codon', wt_mer, mut_mer

    # Retrieve pos
    # NOTE(review): pos is used below as a 0-based index into the protein
    # sequences, so the messages for 0 and 1 may be off by one -- confirm
    # before changing.
    pos = effect.aa_mutation_start_offset
    if pos is None:
        errors += ' could not find the position for this mutation'
        return pos, errors, wt_mer, mut_mer
    if pos == 0:
        errors += ' can not code for this mutated position'
        return pos, errors, wt_mer, mut_mer
    if pos == 1:
        return pos, errors + ' mutation occurs in start codon', wt_mer, mut_mer

    if effect.mutant_protein_sequence is None or \
            effect.original_protein_sequence is None:
        errors += ' could not retrieve protein sequence'
        return pos, errors, wt_mer, mut_mer

    # Clamp the window start at 0: a negative slice start would count from
    # the end of the protein and give an empty or wrong peptide whenever the
    # mutation lies within the first 12 residues.
    window_start = max(0, pos - 12)
    window_end = pos + 13

    # Type of effect
    effect_type = type(effect).__name__
    if 'Stop' in effect_type:
        errors += ' stop mutation'
    elif 'FrameShift' in effect_type:
        wt_mer = effect.original_protein_sequence[window_start:window_end]
        mut_mer = effect.mutant_protein_sequence[window_start:]
    elif 'Substitution' in effect_type or 'Deletion' in effect_type:
        wt_mer = effect.original_protein_sequence[window_start:window_end]
        mut_mer = effect.mutant_protein_sequence[window_start:window_end]
    elif 'Insertion' in effect_type:
        # Number of whole codons added by the insertion
        size = abs(len(ref) - len(alt)) // 3
        wt_mer = effect.original_protein_sequence[
            window_start:window_end + size]
        mut_mer = effect.mutant_protein_sequence[
            window_start:window_end + size]
    else:
        errors += ' unknown exonic function {}'.format(effect_type)

    return pos, errors, wt_mer, mut_mer