def test_reference_coding_sequence_key_around_TP53_201_variant():
    """
    Build a ReferenceCodingSequenceKey around a substitution in the second
    codon of TP53-201 and compare it against hand-derived values.
    """
    # TP53-201 is a TP53 isoform which seems to have no untranslated
    # regions, so its first exon (chr17 7,676,594 - 7,676,521) begins
    # directly with the coding sequence:
    #   ATG|GAG|GAG|CCG|CAG|TCA|GAT...
    #   -M-|-E-|-E-|-P-|-Q-|-S-|-D-
    # The variant under test is chr17:7,676,591 C>T, which changes the
    # second codon from GAG (E) to AAG (K).
    tp53_variant = Variant("chr17", 7676591, "C", "T", "GRCh38")
    tp53_201 = tp53_variant.ensembl.transcripts_by_name("TP53-201")[0]

    # make sure the annotated protein effect is the one described above
    predicted_effect = tp53_variant.effect_on_transcript(tp53_201)
    eq_(predicted_effect.__class__.__name__, "Substitution")
    eq_(predicted_effect.aa_ref, "E")
    eq_(predicted_effect.aa_alt, "K")

    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="ATG",
        sequence_at_variant_locus="G",
        sequence_after_variant_locus="AGG",
        offset_to_first_complete_codon=0,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=False,
        amino_acids_before_variant="M",
    )
    expected_key = ReferenceCodingSequenceKey(**expected_fields)
    actual_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=tp53_variant,
        transcript=tp53_201,
        context_size=3)
    eq_(expected_key, actual_key)
def validate_transcript_mutation(
        ensembl_transcript_id,
        chrom,
        dna_position,
        dna_ref,
        dna_alt,
        aa_pos,
        aa_alt):
    """
    Assert that a DNA variant produces the expected amino acid substitution
    on one particular transcript.

    The variant is annotated with the module-level `ensembl` object. The
    effect found for `ensembl_transcript_id` must be a Substitution whose
    mutation start offset plus one equals `aa_pos` and whose mutant protein
    sequence has `aa_alt` at that offset.
    """
    variant = Variant(chrom, dna_position, dna_ref, dna_alt, ensembl)

    # index the transcript-level effects by the ID of their transcript
    effects_by_transcript_id = {}
    for candidate in variant.effects():
        if isinstance(candidate, TranscriptMutationEffect):
            effects_by_transcript_id[candidate.transcript.id] = candidate

    assert ensembl_transcript_id in effects_by_transcript_id, \
        "%s not found in %s" % (
            ensembl_transcript_id, effects_by_transcript_id)

    effect = effects_by_transcript_id[ensembl_transcript_id]
    if isinstance(effect, ExonicSpliceSite):
        # an exonic splice site mutation carries an alternate effect, and
        # that is what gets compared against dbNSFP (which seemed to
        # ignore exonic splicing mutations)
        effect = effect.alternate_effect

    assert isinstance(effect, Substitution), \
        "Expected substitution (aa_pos=%d, aa_alt=%s) but got %s" % (
            aa_pos, aa_alt, effect)

    offset = effect.aa_mutation_start_offset
    mutant_residue = effect.mutant_protein_sequence[offset]
    position_and_residue_match = (
        offset + 1 == aa_pos and mutant_residue == aa_alt)
    assert position_and_residue_match, \
        "Mutant amino acid %s not found at %d for chr%s:%s %s>%s : %s" % (
            aa_alt,
            aa_pos,
            chrom,
            dna_position,
            dna_ref,
            dna_alt,
            effect)
def test_specific_variant_mouse_with_ensembl_genome():
    """
    Annotate a mouse G>T variant and check that its single Substitution
    effect changes residue 26 (0-based index 25) from C to F.
    """
    # Exon #2 at http://useast.ensembl.org/Mus_musculus/Transcript/Exons?
    # db=core;g=ENSMUSG00000017167;r=11:101170523-101190724;t=ENSMUST00000103109
    mouse_variant = Variant(
        contig=11,
        start=101177240,
        ref="G",
        alt="T",
        ensembl=ensembl_mouse_genome)
    all_effects = mouse_variant.effects()
    eq_(len(all_effects), 2)

    substitutions = []
    for candidate in all_effects:
        if isinstance(candidate, Substitution):
            substitutions.append(candidate)
    eq_(len(substitutions), 1)
    substitution = substitutions[0]

    # Coding sequence up to and including the mutated codon:
    #   ATGATGAGTCTCCGGCTCTTCAGCATCCTGCTCGCCACG
    #   GTGGTCTCTGGAGCTTGGGGCTGGGGCTACTACGGTTGC
    # The last G (nucleotide 77) is the substituted base, so the final
    # codon changes TGC (C) -> TTC (F). That codon ends at nucleotide 78,
    # 78 / 3 = 26, which is index 25 when counting from zero.
    mutated_codon_index = 25
    eq_(substitution.mutant_protein_sequence[mutated_codon_index], "F")
    eq_(substitution.original_protein_sequence[mutated_codon_index], "C")
def generate_random_missense_variants(
        num_variants=10,
        max_search=100000,
        reference="GRCh37"):
    """
    Generate a random collection of missense variants by trying random
    variants repeatedly.

    Parameters
    ----------
    num_variants : int
        Number of missense variants to collect before stopping.

    max_search : int
        Maximum number of random variants to try.

    reference : str or genome object
        Passed through as the `ensembl` argument of every Variant.

    Returns a VariantCollection holding at most `num_variants` variants;
    it holds fewer when `max_search` attempts were not enough.
    """
    nucleotides = ("A", "C", "T", "G")
    contigs = ("1", "2", "3", "4", "5")
    variants = []
    for _ in range(max_search):
        # checked before drawing so that num_variants=0 yields an empty
        # collection instead of searching until max_search is exhausted
        if len(variants) >= num_variants:
            break
        random_ref = choice(nucleotides)
        random_alt = choice(
            [base for base in nucleotides if base != random_ref])
        random_contig = choice(contigs)
        random_variant = Variant(
            contig=random_contig,
            start=randint(1, 1000000),
            ref=random_ref,
            alt=random_alt,
            ensembl=reference)
        try:
            is_missense = any(
                isinstance(effect, Substitution)
                for effect in random_variant.effects())
        except Exception:
            # Best effort: a random locus for which effect prediction fails
            # is simply skipped. Only Exception is caught so that
            # KeyboardInterrupt / SystemExit can still stop a long search.
            continue
        if is_missense:
            variants.append(random_variant)
    return VariantCollection(variants)
def test_reference_coding_sequence_key_around_TP53_201_variant():
    # TP53-201 is an isoform of TP53 which appears to lack untranslated
    # regions, so the transcript sequence is just the coding sequence.
    # First exon: chr17 7,676,594 - 7,676,521
    #   ATG|GAG|GAG|CCG|CAG|TCA|GAT...
    #   -M-|-E-|-E-|-P-|-Q-|-S-|-D-
    # We assume the variant chr17:7,676,591 C>T, which changes
    # GAG (E) > AAG (K).
    snv = Variant("chr17", 7676591, "C", "T", "GRCh38")

    # look up TP53-201 through the variant's own genome annotation
    isoform = snv.ensembl.transcripts_by_name("TP53-201")[0]

    isoform_effect = snv.effect_on_transcript(isoform)
    eq_(isoform_effect.__class__.__name__, "Substitution")
    eq_(isoform_effect.aa_ref, "E")
    eq_(isoform_effect.aa_alt, "K")

    # key derived from the variant and transcript with 3nt of context
    observed = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=snv,
        transcript=isoform,
        context_size=3)

    # key written out by hand from the sequence shown above
    wanted = ReferenceCodingSequenceKey(
        strand="-",
        sequence_before_variant_locus="ATG",
        sequence_at_variant_locus="G",
        sequence_after_variant_locus="AGG",
        offset_to_first_complete_codon=0,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=False,
        amino_acids_before_variant="M")
    eq_(wanted, observed)
def test_serialization():
    """
    Round-trip Variant objects through pickle and JSON and check that the
    reconstituted objects equal the originals.
    """
    compared_fields = ("contig", "ref", "alt", "start", "end")
    test_variants = [
        Variant(1, start=10, ref="AA", alt="AAT", ensembl=ensembl77),
        Variant(10, start=15, ref="A", alt="G"),
        Variant(20, start=150, ref="", alt="G"),
    ]
    for variant in test_variants:
        # Computing effects makes the variant's ensembl object open a SQL
        # connection, which makes that ensembl object non-serializable.
        # Doing it first checks that serialization never tries to
        # serialize the ensembl object directly.
        variant.effects()

        # Pickle round trip.
        unpickled = pickle.loads(pickle.dumps(variant))
        assert variant == unpickled
        for field in compared_fields:
            assert getattr(variant, field) == getattr(unpickled, field)

        # JSON round trip.
        from_json = Variant.from_json(variant.to_json())
        assert variant == from_json
def test_serialization():
    """
    Round-trip a VariantCollection, including per-variant metadata,
    through pickle and JSON.
    """
    collection = VariantCollection([
        Variant(1, start=10, ref="AA", alt="AAT", ensembl=77),
        Variant(10, start=15, ref="A", alt="G"),
        Variant(20, start=150, ref="", alt="G"),
    ])
    collection.metadata[collection[0]] = {"a": "b"}
    collection.metadata[collection[2]] = {"bar": 2}

    # Computing effects makes the variants' ensembl objects open a SQL
    # connection, which makes those objects non-serializable. Doing it
    # first checks that serialization never tries to serialize the
    # ensembl objects directly.
    collection.effects()

    def check_matches_original(restored):
        # the whole collection, its first variant and that variant's
        # metadata all have to survive the round trip
        eq_(collection, restored)
        eq_(restored[0], collection[0])
        eq_(
            restored.metadata[collection[0]],
            collection.metadata[collection[0]])

    # Pickle round trip.
    check_matches_original(pickle.loads(pickle.dumps(collection)))

    # JSON round trip.
    check_matches_original(VariantCollection.from_json(collection.to_json()))
def test_serialization():
    """
    Round-trip Variant objects through pickle and JSON, checking both the
    normalized and the original (as given) fields after unpickling.
    """
    compared_fields = (
        "contig",
        "ref",
        "alt",
        "start",
        "end",
        "original_ref",
        "original_alt",
        "original_start",
    )
    test_variants = [
        Variant(1, start=10, ref="AA", alt="AAT", genome=ensembl_grch38),
        Variant(10, start=15, ref="A", alt="G"),
        Variant(20, start=150, ref="", alt="G"),
    ]
    for variant in test_variants:
        # Computing effects makes the variant's ensembl object open a SQL
        # connection, which makes that ensembl object non-serializable.
        # Doing it first checks that serialization never tries to
        # serialize the ensembl object directly.
        variant.effects()

        # Pickle round trip.
        unpickled = pickle.loads(pickle.dumps(variant))
        eq_(variant, unpickled)
        for field in compared_fields:
            eq_(getattr(variant, field), getattr(unpickled, field))

        # JSON round trip.
        from_json = Variant.from_json(variant.to_json())
        eq_(variant, from_json)
def test_drop_duplicates():
    """
    A VariantCollection built from repeated and equal variants should
    only keep the distinct ones.
    """
    release = EnsemblRelease(78)
    first = Variant("1", 3000, "A", "G", ensembl=release)
    # separate object with the same fields as `first`
    first_copy = Variant("1", 3000, "A", "G", ensembl=release)
    second = Variant("2", 10, "G", "T", ensembl=release)
    with_repeats = [first, first, first_copy, second]
    deduplicated = VariantCollection(variants=with_repeats)
    assert len(deduplicated) == 2
def test_contig_name_normalization():
    """
    Check how the contig attribute depends on normalize_contig_names.
    """
    # integer contig: turned into a string only when normalizing
    normalized_int = Variant(1, 1, "A", "G", normalize_contig_names=True)
    eq_(normalized_int.contig, "1")
    raw_int = Variant(1, 1, "A", "G", normalize_contig_names=False)
    eq_(raw_int.contig, 1)

    # lowercase "chrm": the final letter is uppercased only when
    # normalizing (UCSC name conversion is switched off in both cases)
    normalized_mito = Variant(
        "chrm",
        1,
        "A",
        "G",
        normalize_contig_names=True,
        convert_ucsc_contig_names=False)
    eq_(normalized_mito.contig, "chrM")
    raw_mito = Variant(
        "chrm",
        1,
        "A",
        "G",
        normalize_contig_names=False,
        convert_ucsc_contig_names=False)
    eq_(raw_mito.contig, "chrm")
def test_multiple_alleles_per_line():
    """
    Loading multiallelic.vcf should give one Variant per alternate allele.
    """
    variants = load_vcf(data_path("multiallelic.vcf"))
    assert len(variants) == 2, "Expected 2 variants but got %s" % variants

    # same locus and reference base, one variant for each alt allele
    expected = {
        Variant(1, 1431105, "A", alt_allele, genome="GRCh37")
        for alt_allele in ("C", "G")
    }
    eq_(set(list(variants)), expected)
def test_STAT1_stop_gain_at_exon_boundary():
    """
    The variant's effects include an ExonicSpliceSite, but the top
    priority effect should still be PrematureStop.
    """
    variant = Variant("2", "191872291", "G", "A", "GRCh37")
    predicted_effects = variant.effects()
    print(predicted_effects)
    assert any(
        predicted.__class__ is ExonicSpliceSite
        for predicted in predicted_effects)

    highest_priority = predicted_effects.top_priority_effect()
    print(highest_priority)
    assert highest_priority.__class__ is PrematureStop
def test_silent_stop_codons():
    """
    Yield one expect_effect check per transcript; each C>T variant is
    expected to be Silent on its transcript.
    """
    # (transcript ID, contig, position) of each C>T variant
    loci = [
        ("ENST00000290524", 1, 151314663),
        ("ENST00000368725", 1, 153409535),
        ("ENST00000353479", 10, 105791994),
    ]
    silent_stop_codon_variants = {
        transcript_id: Variant(
            contig, start=position, ref="C", alt="T", genome=ensembl_grch37)
        for transcript_id, contig, position in loci
    }
    for transcript_id, variant in silent_stop_codon_variants.items():
        yield (expect_effect, variant, transcript_id, Silent)
def test_HRAS_G13V_in_cancer_driver_genes_and_variants():
    """
    HRAS p.G13V should be flagged both as a driver variant and as lying in
    a driver gene, but not in the IFN-gamma response or class I MHC columns.
    """
    variant = Variant("11", 534285, "C", "A", "GRCh37")

    # confirm the variant really is HRAS p.G13V before checking pathways
    top_effect = variant.effects().top_priority_effect()
    eq_(top_effect.gene.name, "HRAS")
    eq_(top_effect.short_description, "p.G13V")

    checker = GenePathwayCheck()
    flags = checker.make_variant_dict(variant)
    assert not flags[_IFNG_RESPONSE_COLUMN_NAME]
    assert not flags[_CLASS_I_MHC_COLUMN_NAME]
    assert flags[_DRIVER_VARIANT_COLUMN_NAME]
    assert flags[_DRIVER_GENE_COLUMN_NAME]
def test_snv_transition_transversion():
    """
    Check the is_snv / is_transition / is_transversion properties.
    """
    # ref and alt are identical, so this is not an SNV
    unchanged = Variant(1, start=100, ref="C", alt="C")
    assert not unchanged.is_snv

    # C>T is flagged as a transition
    c_to_t = Variant(1, start=100, ref="C", alt="T")
    assert c_to_t.is_snv
    assert c_to_t.is_transition
    assert not c_to_t.is_transversion

    # C>A is flagged as a transversion
    c_to_a = Variant(1, start=100, ref="C", alt="A")
    assert c_to_a.is_snv
    assert not c_to_a.is_transition
    assert c_to_a.is_transversion
def test_HRAS_G13C_in_cancer_driver_genes():
    """
    HRAS p.G13C lies in a driver gene but is not itself flagged as a
    driver variant.
    """
    variant = Variant("11", 534286, "C", "A", "GRCh37")

    # confirm the variant really is HRAS p.G13C before checking pathways
    top_effect = variant.effects().top_priority_effect()
    eq_(top_effect.gene.name, "HRAS")
    eq_(top_effect.short_description, "p.G13C")

    checker = GenePathwayCheck()
    flags = checker.make_variant_dict(variant)
    assert not flags[_IFNG_RESPONSE_COLUMN_NAME]
    assert not flags[_CLASS_I_MHC_COLUMN_NAME]
    # Although this is a RAS G13 variant, it is not actually that common
    # and so did not make the threshold for our source dataset.
    assert not flags[_DRIVER_VARIANT_COLUMN_NAME]
    assert flags[_DRIVER_GENE_COLUMN_NAME]
def test_maf():
    """
    The variants in `tcga_ov_variants` should match a hand-written list,
    and each one should overlap the gene named in its MAF metadata.
    """
    # (contig, position, ref, alt) for each expected variant, in file order
    expected_loci = [
        (1, 1650797, "A", "G"),
        (1, 23836447, "C", "A"),
        (1, 231401797, "A", "C"),
        (11, 124617502, "C", "G"),
    ]
    expected_variants = [
        Variant(contig, position, ref, alt, ensembl)
        for contig, position, ref, alt in expected_loci
    ]
    eq_(len(tcga_ov_variants), len(expected_variants))

    for wanted, loaded in zip(expected_variants, tcga_ov_variants):
        eq_(wanted, loaded)
        hugo_symbol = tcga_ov_variants.metadata[loaded]['Hugo_Symbol']
        assert any(gene.name == hugo_symbol for gene in loaded.genes), \
            "Expected gene name %s but got %s" % (hugo_symbol, loaded.genes)
def _get_effect(chrom, pos, dna_ref, dna_alt, transcript_id):
    """
    Return the top priority effect of a variant on one transcript.

    The variant is annotated with the module-level `ensembl` object. If
    the effect is an ExonicSpliceSite then its alternate effect is
    returned instead.
    """
    variant = Variant(chrom, pos, dna_ref, dna_alt, ensembl=ensembl)
    effects_by_transcript_id = (
        variant.effects().top_priority_effect_per_transcript_id())
    assert transcript_id in effects_by_transcript_id, \
        "Expected transcript ID %s for variant %s not found in %s" % (
            transcript_id, variant, effects_by_transcript_id)

    transcript_effect = effects_by_transcript_id[transcript_id]
    if not isinstance(transcript_effect, ExonicSpliceSite):
        return transcript_effect
    # COSMIC seems to ignore exonic splice sites, so fall back to the
    # effect the variant would have had otherwise
    return transcript_effect.alternate_effect
def test_multiple_variant_forms():
    """
    Load VCF, MAF and VariantCollection together.
    """
    vcf_dir = None
    cohort = None
    try:
        vcf_dir, cohort = make_cohort([FILE_FORMAT_1])
        patient = cohort[0]

        # The MAF path is added twice on purpose: listing the same file a
        # second time should have no effect.
        for _ in range(2):
            patient.variants.append(data_path(MAF_FILE))

        extra_variant = Variant(
            start=1000000, ref="A", alt="T", contig=1, ensembl=75)
        patient.variants.append(VariantCollection([extra_variant]))

        cohort_variants = cohort.load_variants(patients=[patient])

        def count_variants_starting_at(position):
            matching = cohort_variants[patient.id].filter(
                lambda v: v.start == position)
            return len(matching)

        # (start position, expected number of loaded variants)
        expected_counts = [
            (1000000, 1),   # from the VariantCollection
            (53513530, 1),  # from the VCF
            (1650797, 1),   # from the MAF
            (1650798, 0),   # a non-existent variant
        ]
        for position, expected_count in expected_counts:
            eq_(count_variants_starting_at(position), expected_count)
    finally:
        if vcf_dir is not None and path.exists(vcf_dir):
            rmtree(vcf_dir)
        if cohort is not None:
            cohort.clear_caches()
def test_frameshift_near_start_of_BRCA1_001():
    #
    # Insertion of genomic "A" near the start of the coding sequence.
    #
    # Transcript: BRCA1-001 (ENST00000357654)
    # Manually annotated using Ensembl release 85
    #
    # Original mRNA coding sequence:
    #   ATG GAT TTA TCT GCT CTT CGC GTT GAA GAA GTA CAA
    #   -M- -D- -L- -S- -A- -L- -R- -V- -E- -E- -V- -Q-
    #
    # After variant:
    #   ATG GAT TTT ATC TGC TCT TCG CGT TGA
    #   -M- -D- -F- -I- -C- -S- -S- -R-  *
    insertion_position = 43124096 - 6
    insertion = Variant(
        "17",
        insertion_position,
        ref="",
        alt="A",
        ensembl=ensembl_grch38)
    expect_effect(
        insertion,
        transcript_id="ENST00000357654",
        effect_class=FrameShift,
        modifies_coding_sequence=True,
        modifies_protein_sequence=True,
        aa_alt="FICSSR")
def test_allele_count_dataframe():
    """
    A single (variant, read evidence) pair should become one DataFrame
    row holding the number of ref, alt and other reads.
    """
    variant = Variant("test_contig", 50, "C", "G")

    # two reads supporting the reference allele, one supporting the alt
    reads_with_ref = [
        AlleleRead(prefix="AAA", allele="C", suffix="TTT", name="C1"),
        AlleleRead(prefix="AAC", allele="C", suffix="TTA", name="C2"),
    ]
    reads_with_alt = [
        AlleleRead(prefix="AAA", allele="G", suffix="TTT", name="G1")
    ]
    read_evidence = ReadEvidence(
        trimmed_base1_start=50,
        trimmed_ref="C",
        trimmed_alt="G",
        ref_reads=reads_with_ref,
        alt_reads=reads_with_alt,
        other_reads=[])

    df = allele_counts_dataframe([(variant, read_evidence)])
    assert len(df) == 1, "Wrong number of rows in DataFrame: %s" % (df, )
    only_row = df.iloc[0]
    eq_(only_row.num_ref_reads, 2)
    eq_(only_row.num_alt_reads, 1)
    eq_(only_row.num_other_reads, 0)
def test_locus_reads_substitution_longer():
    # C>GG substitution at the second nucleotide of the reference sequence
    # "ACCTTG". The alignment represents it as a C>G mismatch followed by
    # the insertion of one more G.
    variant = Variant(
        "chromosome", 2, ref="C", alt="GG", normalize_contig_name=False)
    print(variant)

    aligned_read = make_read(seq="AGGCTTG", cigar="2M1I4M", mdtag="1C4")
    samfile = DummySamFile(reads=[aligned_read])
    locus_reads = list(
        locus_read_generator(
            samfile=samfile,
            chromosome="chromosome",
            base1_position_before_variant=1,
            base1_position_after_variant=3))
    print(locus_reads)
    assert len(locus_reads) == 1, \
        "Expected to get back one read but instead got %d" % (
            len(locus_reads),)

    # the inserted base has no reference position, hence the None
    expected = LocusRead(
        name=aligned_read.qname,
        sequence=aligned_read.query_sequence,
        reference_positions=[0, 1, None, 2, 3, 4, 5],
        quality_scores=aligned_read.query_qualities,
        base0_read_position_before_variant=0,
        base0_read_position_after_variant=3)
    assert_equal_fields(locus_reads[0], expected)
def test_locus_reads_substitution_shorter():
    # CC>G substitution at the 2nd and 3rd nucleotides of the reference
    # sequence "ACCTTG". The alignment represents it as a C>G mismatch
    # followed by the deletion of a C.
    variant = Variant(
        "chromosome", 2, ref="CC", alt="G", normalize_contig_name=False)
    print(variant)

    aligned_read = make_read(seq="AGTTG", cigar="2M1D3M", mdtag="1C^C4")
    samfile = DummySamFile(reads=[aligned_read])
    locus_reads = list(
        locus_read_generator(
            samfile=samfile,
            chromosome="chromosome",
            base1_position_before_variant=1,
            base1_position_after_variant=4))
    assert len(locus_reads) == 1, \
        "Expected to get back one read but instead got %d" % (
            len(locus_reads),)
    print(locus_reads)

    # reference position 2 is the deleted base, so no read base maps to it
    expected = LocusRead(
        name=aligned_read.qname,
        sequence=aligned_read.query_sequence,
        reference_positions=[0, 1, 3, 4, 5],
        quality_scores=aligned_read.query_qualities,
        base0_read_position_before_variant=0,
        base0_read_position_after_variant=2)
    assert_equal_fields(locus_reads[0], expected)
def test_serialization():
    """
    Variants must survive a pickle round trip (field by field) and a JSON
    round trip.
    """
    insertion = Variant(
        1, start=10, ref="AA", alt="AAT", ensembl=ensembl_grch38)
    snv = Variant(10, start=15, ref="A", alt="G")
    empty_ref_insertion = Variant(20, start=150, ref="", alt="G")

    for before in (insertion, snv, empty_ref_insertion):
        # Asking for effects makes the variant's ensembl object open a SQL
        # connection, after which the ensembl object can no longer be
        # serialized. Calling it here checks that we never try to
        # serialize the ensembl object directly.
        before.effects()

        # Test pickling.
        pickled_bytes = pickle.dumps(before)
        after_pickle = pickle.loads(pickled_bytes)
        eq_(before, after_pickle)
        eq_(before.contig, after_pickle.contig)
        eq_(before.ref, after_pickle.ref)
        eq_(before.alt, after_pickle.alt)
        eq_(before.start, after_pickle.start)
        eq_(before.end, after_pickle.end)
        eq_(before.original_ref, after_pickle.original_ref)
        eq_(before.original_alt, after_pickle.original_alt)
        eq_(before.original_start, after_pickle.original_start)

        # Test json.
        json_text = before.to_json()
        after_json = Variant.from_json(json_text)
        eq_(before, after_json)
def test_locus_reads_snv():
    """
    Check the LocusRead produced for a read carrying the SNV T>G at
    position 4 of a contig whose sequence is assumed to be "ACCTTG".
    """
    # reference sequence of the contig: "ACCTTG"
    variant = Variant(
        "chromosome", 4, ref="T", alt="G", normalize_contig_name=False)

    aligned_read = make_read(seq="ACCGTG", cigar="6M", mdtag="3G2")
    samfile = DummySamFile(reads=[aligned_read])
    position_before = variant.start - 1
    position_after = variant.start + 1
    locus_reads = list(
        locus_read_generator(
            samfile=samfile,
            chromosome="chromosome",
            base1_position_before_variant=position_before,
            base1_position_after_variant=position_after))
    print(locus_reads)
    assert len(locus_reads) == 1, \
        "Expected to get back one read but instead got %d" % (
            len(locus_reads),)

    expected = LocusRead(
        name=aligned_read.qname,
        sequence=aligned_read.query_sequence,
        reference_positions=[0, 1, 2, 3, 4, 5],
        quality_scores=aligned_read.query_qualities,
        base0_read_position_before_variant=2,
        base0_read_position_after_variant=4)
    assert_equal_fields(locus_reads[0], expected)
def test_sequence_key_with_reading_frame_deletion_with_five_prime_utr():
    # Delete the second codon of TP53-001 while keeping enough context to
    # include nucleotides from the 5' UTR. Since TP53 is on the negative
    # strand we have to take the reverse complement of the variant, which
    # turns it into 'CTC'>''.
    deletion = Variant("17", 7676589, "CTC", "", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Sequence of TP53 around the second codon with 10 context nucleotides:
    #   In [51]: t.sequence[193-10:193+13]
    #   Out[51]: 'CACTGCCATGGAGGAGCCGCAGT'
    # which splits into the following parts:
    #   last 7 nt of 5' UTR: CACTGCC
    #   start codon: ATG (translates to M)
    #   2nd codon: GAG    <---- variant occurs here
    #   3rd codon: GAG
    #   4th codon: CCG
    #   5th codon: CAG
    #   first nt of 6th codon: T
    observed_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=deletion,
        transcript=transcript,
        context_size=10)
    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="CACTGCCATG",
        sequence_at_variant_locus="GAG",
        sequence_after_variant_locus="GAGCCGCAGT",
        offset_to_first_complete_codon=7,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=True,
        amino_acids_before_variant="M",
    )
    expected_key = ReferenceCodingSequenceKey(**expected_fields)
    eq_(observed_key, expected_key)
def test_sequence_key_with_reading_frame_insertion_context_5nt_overlaps_start():
    # Insert nucleotide "T" after the second codon of TP53-001, keeping
    # only enough context to see part of the start codon. The result
    # therefore should not "contain" the start codon but does "overlap"
    # it. In the reverse complement this variant becomes CTC>CTCA.
    insertion = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Sequence of TP53 around the boundary of the 2nd/3rd codons
    # with 5 context nucleotides on each side:
    #   last two nt of start codon: TG
    #   2nd codon: GAG (translates to E)
    #   <---- insertion variant occurs between these two codons
    #   3rd codon: GAG
    #   first two nt of 4th codon: CC
    observed_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=5)
    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="TGGAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAGCC",
        offset_to_first_complete_codon=2,
        contains_start_codon=False,
        overlaps_start_codon=True,
        contains_five_prime_utr=False,
        amino_acids_before_variant="E",
    )
    expected_key = ReferenceCodingSequenceKey(**expected_fields)
    eq_(observed_key, expected_key)
def test_group_unique_sequences():
    """
    Grouping the reads which support a variant by their unique sequences
    should give fewer groups than there are reads.
    """
    samfile = load_bam("data/cancer-wgs-primary.chr12.bam")
    chromosome = "chr12"
    variant = Variant(
        contig=chromosome,
        start=65857041,
        ref="G",
        alt="C",
        ensembl=ensembl_grch38)

    supporting_reads = reads_supporting_variant(
        samfile=samfile,
        chromosome=chromosome,
        variant=variant)
    print("%d variant reads: %s" % (
        len(supporting_reads), supporting_reads))

    sequence_groups = group_unique_sequences(
        supporting_reads,
        max_prefix_size=30,
        max_suffix_size=30)
    print("%d unique sequences: %s" % (
        len(sequence_groups), sequence_groups))

    # Some of the reads are redundant, so the number of unique entries
    # is expected to be smaller than the total number of reads.
    assert len(supporting_reads) > len(sequence_groups)
def test_sequence_key_with_reading_frame_insertion_context_3nt_no_start():
    # Insert nucleotide "T" after the second codon of TP53-001, keeping
    # only enough context to see the second codon (and no nucleotides
    # from the start codon). In the reverse complement this variant
    # becomes CTC>CTCA.
    insertion = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Sequence of TP53 around the boundary of the 2nd/3rd codons
    # with 3 context nucleotides on each side:
    #   2nd codon: GAG (translates to E)
    #   <---- insertion variant occurs between these two codons
    #   3rd codon: GAG
    observed_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=3)
    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="GAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAG",
        offset_to_first_complete_codon=0,
        contains_start_codon=False,
        overlaps_start_codon=False,
        contains_five_prime_utr=False,
        amino_acids_before_variant="E",
    )
    expected_key = ReferenceCodingSequenceKey(**expected_fields)
    eq_(observed_key, expected_key)
def test_mhc_predictor_error():
    """
    predict_epitopes should return no predictions, rather than fail, when
    the MHC predictor raises on every call.
    """
    class FakeMHCPredictor:
        # raises for every prediction request
        def predict_subsequences(self, x):
            raise ValueError('I throw an error in your general direction')

    genome = EnsemblRelease(species="mouse")
    wdr13_transcript = genome.transcripts_by_name("Wdr13-001")[0]
    wdr13_variant = Variant('X', '8125624', 'C', 'A')
    protein_fragment = MutantProteinFragment(
        variant=wdr13_variant,
        gene_name='Wdr13',
        amino_acids='KLQGHSAPVLDVIVNCDESLLASSD',
        mutant_amino_acid_start_offset=12,
        mutant_amino_acid_end_offset=13,
        n_overlapping_reads=71,
        n_alt_reads=25,
        n_ref_reads=46,
        n_alt_reads_supporting_protein_sequence=2,
        supporting_reference_transcripts=[wdr13_transcript])

    # every prediction attempt raises; make sure vaxrank doesn't fall down
    failing_predictor = FakeMHCPredictor()
    epitope_predictions = predict_epitopes(
        mhc_predictor=failing_predictor,
        protein_fragment=protein_fragment,
        genome=genome)
    eq_(0, len(epitope_predictions))
def test_sequence_key_with_reading_frame_insertion():
    # Insert nucleotide "T" after the second codon of TP53-001; the
    # surrounding context includes nucleotides from the 5' UTR. Since TP53
    # is on the negative strand we have to take the reverse complement of
    # the variant, which turns it into 'CTC'>'CTCA'.
    insertion = Variant("17", 7676586, "CTC", "CTCA", ensembl_grch38)
    transcript = ensembl_grch38.transcripts_by_name("TP53-001")[0]

    # Sequence of TP53 around the boundary of the 2nd/3rd codons
    # with 10 context nucleotides:
    #   last 4 nt of 5' UTR: TGCC
    #   start codon: ATG (translates to M)
    #   2nd codon: GAG (translates to E)
    #   <---- insertion variant occurs between these two codons
    #   3rd codon: GAG
    #   4th codon: CCG
    #   5th codon: CAG
    #   first nt of 6th codon: T
    observed_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=insertion,
        transcript=transcript,
        context_size=10)
    expected_fields = dict(
        strand="-",
        sequence_before_variant_locus="TGCCATGGAG",
        sequence_at_variant_locus="",
        sequence_after_variant_locus="GAGCCGCAGT",
        offset_to_first_complete_codon=4,
        contains_start_codon=True,
        overlaps_start_codon=True,
        contains_five_prime_utr=True,
        amino_acids_before_variant="ME",
    )
    expected_key = ReferenceCodingSequenceKey(**expected_fields)
    eq_(observed_key, expected_key)
def variants_from_csv(csv_file, sample_id=None, reference=None):
    """Load variants from a csv file into a varcode VariantCollection.

    Args:
        csv_file: csv file (passed to pandas.read_csv) which must have
            the columns CHROM, POS, REF and ALT; a sample_id column is
            optional and only used for filtering
        sample_id: if provided and the file has a sample_id column,
            select variants only for this id
        reference: ref genome used for variant calling; currently unused,
            every variant is created with pyensembl's ensembl_grch38

    Returns:
        VariantCollection with one Variant per distinct
        (CHROM, POS, REF, ALT) row.
    """
    from pyensembl import ensembl_grch38
    import varcode
    from varcode import Variant

    df = pd.read_csv(csv_file)
    if sample_id is not None and 'sample_id' in df.columns:
        df = df[df.sample_id == sample_id]
    # CHROM has to be part of the key: two variants at the same position
    # with the same alleles on different chromosomes are not duplicates.
    df = df.drop_duplicates(['CHROM', 'POS', 'REF', 'ALT'])
    variants = [
        Variant(
            contig=row.CHROM,
            start=row.POS,
            ref=row.REF,
            alt=row.ALT,
            ensembl=ensembl_grch38)
        for _, row in df.iterrows()
    ]
    return varcode.variant_collection.VariantCollection(variants)
def get_varcode_annotations(genotypes, vcf_id, ensembl_release_num):
    """Get contig, position, ref and alt data from the genotypes table,
    and get the best effect for each row from the Varcode library.

    Args:
        genotypes: table object whose `c` columns include contig,
            position, reference, alternates and vcf_id
        vcf_id: only rows with this vcf_id are annotated
        ensembl_release_num: passed to EnsemblRelease to choose the
            annotation release

    Returns a list with one entry per selected row, of the form:
      [[contig, position, reference, alternates, gene_name,
        transcript_id, notation, effect_type], ...]
    where notation is "intragenic" for Intragenic effects and the
    effect's short description otherwise, and effect_type is the effect's
    class name with spaces inserted between words.
    """
    results = select([
        genotypes.c.contig,
        genotypes.c.position,
        genotypes.c.reference,
        genotypes.c.alternates
    ]).where(genotypes.c.vcf_id == vcf_id).execute()

    ensembl_rel = EnsemblRelease(ensembl_release_num)
    # Matches a lowercase letter directly followed by an uppercase one;
    # compiled once instead of on every row.
    camel_case_boundary = re.compile(r"([a-z])([A-Z])")

    varcode_annotations = []
    for contig, position, reference, alternates in results:
        # NOTE(review): on Python 3 `.encode` yields bytes rather than
        # str -- confirm that Variant accepts that for ref/alt.
        variant = Variant(
            contig=contig,
            start=position,
            ref=reference.encode('ascii', 'ignore'),
            alt=alternates.encode('ascii', 'ignore'),
            ensembl=ensembl_rel)

        # This will give us a single, yet relevant effect
        best_effect = variant.effects().top_priority_effect()
        gene_name = best_effect.gene_name
        transcript = best_effect.transcript_id
        effect_class_name = type(best_effect).__name__
        if effect_class_name == "Intragenic":
            notation = "intragenic"
        else:
            notation = best_effect.short_description

        # Make it human readable, e.g. "PrematureStop" -> "Premature Stop"
        effect_type = camel_case_boundary.sub(
            r"\g<1> \g<2>", effect_class_name)
        varcode_annotations.append([
            contig,
            position,
            reference,
            alternates,
            gene_name,
            transcript,
            notation,
            effect_type,
        ])
    return varcode_annotations
def validate_transcript_mutation(
        ensembl_transcript_id,
        chrom,
        dna_position,
        dna_ref,
        dna_alt,
        aa_pos,
        aa_alt):
    """
    Check that a DNA variant, annotated against `ensembl_grch37`, causes
    the expected amino acid substitution on the given transcript.

    Fails with an AssertionError when the transcript has no effect, when
    the effect is not a Substitution, or when the substituted position
    (`aa_pos`, one more than the effect's start offset) or residue
    (`aa_alt`) differ from what the effect reports.
    """
    variant = Variant(chrom, dna_position, dna_ref, dna_alt, ensembl_grch37)
    transcript_effects = dict(
        (predicted.transcript.id, predicted)
        for predicted in variant.effects()
        if isinstance(predicted, TranscriptMutationEffect))
    assert ensembl_transcript_id in transcript_effects, \
        "%s not found in %s" % (ensembl_transcript_id, transcript_effects)

    effect = transcript_effects[ensembl_transcript_id]
    # Exonic splice site mutations carry an alternate effect with them;
    # that is the one checked against dbNSFP, since that database seemed
    # to ignore exonic splicing mutations.
    if isinstance(effect, ExonicSpliceSite):
        effect = effect.alternate_effect

    assert isinstance(effect, Substitution), \
        "Expected substitution (aa_pos=%d, aa_alt=%s) but got %s" % (
            aa_pos, aa_alt, effect)

    start_offset = effect.aa_mutation_start_offset
    residue_at_offset = effect.mutant_protein_sequence[start_offset]
    same_position = (start_offset + 1 == aa_pos)
    assert same_position and residue_at_offset == aa_alt, \
        "Mutant amino acid %s not found at %d for chr%s:%s %s>%s : %s" % (
            aa_alt,
            aa_pos,
            chrom,
            dna_position,
            dna_ref,
            dna_alt,
            effect)
def make_inputs_for_tp53_201_variant(
        cdna_prefix="ATG",
        n_bad_nucleotides_at_start=0,
        mismatches=0,
        reference_context_size=3):
    """
    Build the inputs and the expected result for matching a variant
    sequence around a TP53-201 substitution to its reference context.

    Parameters
    ----------
    cdna_prefix : str
        Transcript nucleotides before the variant that we're pretending
        got detected from RNA-seq reads.

    n_bad_nucleotides_at_start : int
        Number of nucleotides we expect to get trimmed from the
        beginning of the variant sequence while matching to a reference
        context.

    mismatches : int
        Expected number of nucleotide mismatches in the result

    reference_context_size : int
        Number of nucleotides before the variant locus to try matching
        against a reference transcript.

    Returns the tuple (variant_sequence, reference_context, expected),
    where `expected` is the VariantSequenceInReadingFrame the caller
    should get back.
    """
    # TP53-201 is an isoform of TP53 which seems to lack untranslated
    # regions so the sequence is:
    # First exon: chr17 7,676,594 - 7,676,521
    # ATG|GAG|GAG|CCG|CAG|TCA|GAT...
    # -M-|-E-|-E-|-P-|-Q-|-S-|-D-
    # we're assuming a variant
    # chr17. 7,676,591 C>T which changes GAG (E) > AAG (K)
    variant = Variant("chr17", 7676591, "C", "T", "GRCh38")

    # TP53-201
    transcript = variant.ensembl.transcripts_by_name("TP53-201")[0]

    effect = variant.effect_on_transcript(transcript)
    eq_(effect.__class__.__name__, "Substitution")
    eq_(effect.aa_ref, "E")
    eq_(effect.aa_alt, "K")

    cdna_alt = "A"
    cdna_suffix = "AGGAGCCGCAGTCAGAT"

    # genomic DNA is the reverse complement of the cDNA
    # for TP53-201 since it's on the negative strand
    gdna_prefix = reverse_complement_dna(cdna_suffix)
    gdna_alt = reverse_complement_dna(cdna_alt)
    gdna_suffix = reverse_complement_dna(cdna_prefix)

    # variant sequence supported by two reads
    # one fully spanning the variant sequence
    # and another missing the last nucleotide
    fully_overlapping_read = AlleleRead(
        prefix=gdna_prefix,
        allele=gdna_alt,
        suffix=gdna_suffix,
        name="full-overlap")

    # testing the prefix and allele to make sure they have the expected
    # TP53-201 sequence but the suffix might change depending on what's
    # passed in as cdna_prefix
    eq_(fully_overlapping_read.prefix, "ATCTGACTGCGGCTCCT")
    eq_(fully_overlapping_read.allele, "T")

    partially_overlapping_read = AlleleRead(
        prefix=gdna_prefix,
        allele=gdna_alt,
        suffix=gdna_suffix[:-1],
        name="partial-overlap")
    eq_(partially_overlapping_read.prefix, "ATCTGACTGCGGCTCCT")
    eq_(partially_overlapping_read.allele, "T")

    variant_sequence = VariantSequence(
        prefix=gdna_prefix,
        alt=gdna_alt,
        suffix=gdna_suffix,
        reads=[fully_overlapping_read, partially_overlapping_read])
    assert isinstance(variant_sequence, VariantSequence)

    prefix_length = len(cdna_prefix) - n_bad_nucleotides_at_start
    # s[-0:] is s[0:], i.e. the whole string, so the case where every
    # prefix nucleotide gets trimmed has to be handled explicitly.
    if prefix_length == 0:
        kept_cdna_prefix = ""
        kept_reference_prefix = ""
    else:
        kept_cdna_prefix = cdna_prefix[-prefix_length:]
        kept_reference_prefix = "ATG"[-prefix_length:]

    reference_coding_sequence_key = \
        ReferenceCodingSequenceKey.from_variant_and_transcript(
            variant=variant,
            transcript=transcript,
            context_size=reference_context_size)
    assert isinstance(
        reference_coding_sequence_key, ReferenceCodingSequenceKey)

    reference_context = ReferenceContext.from_reference_coding_sequence_key(
        key=reference_coding_sequence_key,
        variant=variant,
        transcripts=[transcript])
    assert isinstance(reference_context, ReferenceContext)

    expected = VariantSequenceInReadingFrame(
        cdna_sequence=kept_cdna_prefix + cdna_alt + cdna_suffix,
        offset_to_first_complete_codon=prefix_length % 3,
        variant_cdna_interval_start=prefix_length,
        variant_cdna_interval_end=prefix_length + 1,
        reference_cdna_sequence_before_variant=kept_reference_prefix,
        number_mismatches=mismatches)
    assert isinstance(expected, VariantSequenceInReadingFrame)

    return variant_sequence, reference_context, expected
def test_mm10_Klf6_frameshift():
    """
    A single-base insertion on chr13 of GRCm38 should yield exactly one
    effect, which must pass validate_effect_values.
    """
    insertion = Variant("chr13", 5864876, "", "G", "GRCm38")
    predicted_effects = insertion.effects()
    eq_(len(predicted_effects), 1)
    only_effect = predicted_effects[0]
    validate_effect_values(only_effect)