def initializeMutFromAttributes(chrom, startPos, endPos, ref, alt, build):
    """Create a MutationData and attach its preceding-bases annotation.

    The variant type reported by ``MutUtils.determineVariantType`` selects
    the handling:

    * ``"snp"``: the preceding-bases annotation is created with an empty
      value.
    * ``"del"``: ref allele, start and end are replaced by the values
      returned from ``MutUtils.retrievePrecedingBasesForDeletions`` and the
      alt allele becomes ``"-"``.
    * ``"ins"``: alt allele, start and end are replaced by the values
      returned from ``MutUtils.retrievePrecedingBasesForInsertions`` and the
      ref allele becomes ``"-"``.
    * any other type: the mutation is returned without an annotation.

    Every updated field is written both as an attribute and through item
    assignment on the mutation.

    :return: the (possibly updated) MutationData instance
    """
    mut = MutationData(chrom, startPos, endPos, ref, alt, build)
    varType = MutUtils.determineVariantType(mut)

    if varType == "snp":
        bases = ""
        field_updates = ()
    elif varType == "del":
        bases, new_ref, new_start, new_end = \
            MutUtils.retrievePrecedingBasesForDeletions(mut)
        field_updates = (("ref_allele", new_ref), ("alt_allele", "-"),
                         ("start", new_start), ("end", new_end))
    elif varType == "ins":
        bases, new_alt, new_start, new_end = \
            MutUtils.retrievePrecedingBasesForInsertions(mut)
        field_updates = (("ref_allele", "-"), ("alt_allele", new_alt),
                         ("start", new_start), ("end", new_end))
    else:
        return mut

    # Keep the attribute view and the item view of each field in step.
    for field_name, field_value in field_updates:
        setattr(mut, field_name, field_value)
        mut[field_name] = field_value

    mut.createAnnotation(annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME,
                         annotationValue=bases)
    return mut
def test_denovo(self):
    """GAF de novo test: three insertions starting at chr22:22221735."""
    gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)

    # (end position, inserted alt allele, expected variant classification)
    cases = (
        (22221737, 'CAT', 'De_novo_Start_OutOfFrame'),
        (22221740, 'AACATAA', 'De_novo_Start_OutOfFrame'),
        (22221739, 'ACATAA', 'De_novo_Start_InFrame'),
    )
    for end_pos, inserted, expected_vc in cases:
        m = MutationData()
        m.start = str(22221735)
        m.end = str(end_pos)
        m.chr = "22"
        m.ref_allele = ''
        m.alt_allele = inserted
        m = gafDatasource.annotate_mutation(m)
        self.assertTrue(m['variant_classification'] == expected_vc)
def test_effect_tx_mode(self):
    """Annotate the same chr2 G>T SNP in best-effect and canonical tx modes.

    Best-effect mode is expected to report a missense mutation, canonical
    mode an intron; both must report the gene PNKD.
    """
    gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)

    def annotate_snp():
        # Build a fresh chr2:219137340 G>T mutation and annotate it.
        snp = MutationData()
        snp.chr = '2'
        snp.start = '219137340'
        snp.end = '219137340'
        snp.ref_allele = 'G'
        snp.alt_allele = 'T'
        return gafDatasource.annotate_mutation(snp)

    # Canonical mutation was Intron
    gafDatasource.set_tx_mode(TranscriptProvider.TX_MODE_BEST_EFFECT)
    m = annotate_snp()
    self.assertTrue(m['gene'] == "PNKD")
    self.assertTrue(m['variant_classification'] == "Missense_Mutation")

    gafDatasource.set_tx_mode(TranscriptProvider.TX_MODE_CANONICAL)
    m = annotate_snp()
    self.assertTrue(m['gene'] == "PNKD")
    self.assertTrue(m['variant_classification'] == "Intron",
                    "Canonical no longer is Intron. This test is no longer valid. This failure can come up when changing the GAF datasource.")
def testRealWorld(self):
    """Run two chr1 G>T SNPs through the transcript and COSMIC datasources.

    Only the first mutation is asserted on (zero overlapping COSMIC
    mutations); the second is annotated without any assertion.
    """
    gafDS = TestUtils.createTranscriptProviderDatasource(self.config)
    cosmicDS = TestUtils.createCosmicDatasource(self.config)

    def annotate_at(position):
        # Build a chr1 G>T SNP at `position`, annotate with GAF then COSMIC.
        snp = MutationData()
        snp.chr = '1'
        snp.start = position
        snp.end = position
        snp.ref_allele = "G"
        snp.alt_allele = "T"
        snp = gafDS.annotate_mutation(snp)
        return cosmicDS.annotate_mutation(snp)

    # These values are not taken from a real world scenario, but are cooked
    # for this test.
    m = annotate_at('12941796')
    self.assertTrue(m['COSMIC_n_overlapping_mutations'] == '0')

    # 1 150483621 150483621
    m = annotate_at('150483621')
def _simple_annotate(self, is_skip_no_alts):
    """Annotate two identical mutations with no datasources and count the output.

    The first mutation additionally carries ``alt_allele_seen = "False"``;
    the second has no such annotation.

    :param is_skip_no_alts: forwarded to ``RunSpecification.initialize``
    :return: number of mutations yielded by ``Annotator.annotate_mutations``
    """
    runSpec = RunSpecification()
    runSpec.initialize(None, None, datasources=[], is_skip_no_alts=is_skip_no_alts)

    # Initialize the annotator with the runspec
    annotator = Annotator()
    annotator.initialize(runSpec)

    def build_mutation():
        mut = MutationData()
        mut.chr = "1"
        mut.start = "12941796"
        mut.end = "12941796"
        mut.alt_allele = "G"
        mut.ref_allele = "T"
        return mut

    flagged = build_mutation()
    flagged.createAnnotation("alt_allele_seen", "False")
    plain = build_mutation()

    annotated = annotator.annotate_mutations([flagged, plain])
    return sum(1 for _ in annotated)
def test_denovo(self):
    """GAF de novo test: classify three insertions on chr22."""
    gafDatasource = TestUtils.createTranscriptProviderDatasource(
        self.config)
    insertion_start = 22221735

    def classify(end_pos, inserted_bases):
        # Annotate one insertion and hand back its variant classification.
        mutation = MutationData()
        mutation.start = str(insertion_start)
        mutation.end = str(end_pos)
        mutation.chr = "22"
        mutation.ref_allele = ''
        mutation.alt_allele = inserted_bases
        mutation = gafDatasource.annotate_mutation(mutation)
        return mutation['variant_classification']

    self.assertTrue(
        classify(22221737, 'CAT') == 'De_novo_Start_OutOfFrame')
    self.assertTrue(
        classify(22221740, 'AACATAA') == 'De_novo_Start_OutOfFrame')
    self.assertTrue(
        classify(22221739, 'ACATAA') == 'De_novo_Start_InFrame')
def testMulticoreAnnotate(self):
    """Test a (too) simple annotating exercise from GAF on 2 cores.

    The datasource proxy is first dumped to ``out/testGAFPickle.pkl`` to
    check that it can be pickled, then two chr3 A>C SNPs are annotated
    through a two-process pool.  Both results must carry a transcript id and
    the gene PIK3CA, and must keep their distinct start positions.
    """
    gafDatasource = TestUtils.createGafDatasourceProxy(self.config)

    # Test pickling.  The context manager closes the handle even when the
    # dump raises; previously the file object was never closed explicitly.
    with open('out/testGAFPickle.pkl', 'w') as pickle_fp:
        dump(gafDatasource, pickle_fp)

    m1 = MutationData()
    m1.chr = '3'
    m1.start = '178866811'
    m1.end = '178866811'
    m1.ref_allele = "A"
    m1.alt_allele = "C"
    m1.build = "hg19"

    m2 = MutationData()
    m2.chr = '3'
    m2.start = '178866812'
    m2.end = '178866812'
    m2.ref_allele = "A"
    m2.alt_allele = "C"
    m2.build = "hg19"

    p = LoggingPool(processes=2)
    result = p.map(annotate_mutation_global,
                   [(gafDatasource, m1), (gafDatasource, m2)])
    p.close()
    p.join()

    for r in result:
        self.assertTrue("transcript_id" in r.keys())
        self.assertTrue("gene" in r.keys())
        self.assertTrue(r["gene"] == "PIK3CA")
    self.assertTrue(result[0].start != result[1].start)
def testMulticoreAnnotate(self):
    """Test a (too) simple annotating exercise from GAF on 2 cores.

    Dumps the datasource proxy to ``out/testGAFPickle.pkl`` as a pickling
    check, then annotates two adjacent chr3 A>C SNPs through a two-process
    pool and verifies gene, transcript id and distinct start positions.
    """
    gafDatasource = TestUtils.createGafDatasourceProxy(self.config)

    # Test pickling.  Use a context manager so the output handle is always
    # closed instead of being left for garbage collection.
    with open('out/testGAFPickle.pkl', 'w') as pickle_fp:
        dump(gafDatasource, pickle_fp)

    def hg19_snp(position):
        # chr3 A>C SNP on build hg19 at the given position.
        snp = MutationData()
        snp.chr = '3'
        snp.start = position
        snp.end = position
        snp.ref_allele = "A"
        snp.alt_allele = "C"
        snp.build = "hg19"
        return snp

    m1 = hg19_snp('178866811')
    m2 = hg19_snp('178866812')

    p = LoggingPool(processes=2)
    result = p.map(annotate_mutation_global,
                   [(gafDatasource, m1), (gafDatasource, m2)])
    p.close()
    p.join()

    for r in result:
        self.assertTrue("transcript_id" in r.keys())
        self.assertTrue("gene" in r.keys())
        self.assertTrue(r["gene"] == "PIK3CA")
    self.assertTrue(result[0].start != result[1].start)
def test_effect_tx_mode(self):
    """Check that the tx mode changes the classification of one chr2 SNP.

    The G>T SNP at chr2:219137340 must be a missense mutation in best-effect
    mode and an intron in canonical mode, with gene PNKD in both.
    """
    gafDatasource = TestUtils.createTranscriptProviderDatasource(
        self.config)
    snp_fields = (
        ("chr", '2'),
        ("start", '219137340'),
        ("end", '219137340'),
        ("ref_allele", 'G'),
        ("alt_allele", 'T'),
    )

    def fresh_annotated_snp():
        mutation = MutationData()
        for attr_name, attr_value in snp_fields:
            setattr(mutation, attr_name, attr_value)
        return gafDatasource.annotate_mutation(mutation)

    gafDatasource.set_tx_mode(TranscriptProvider.TX_MODE_BEST_EFFECT)

    # Canonical mutation was Intron
    best_effect = fresh_annotated_snp()
    self.assertTrue(best_effect['gene'] == "PNKD")
    self.assertTrue(
        best_effect['variant_classification'] == "Missense_Mutation")

    gafDatasource.set_tx_mode(TranscriptProvider.TX_MODE_CANONICAL)
    canonical = fresh_annotated_snp()
    self.assertTrue(canonical['gene'] == "PNKD")
    self.assertTrue(
        canonical['variant_classification'] == "Intron",
        "Canonical no longer is Intron. This test is no longer valid. This failure can come up when changing the GAF datasource."
    )
def testRetrievePrecedingBaseFromAnnotationForInsertions(self):
    """Round-trip insertions through the preceding-bases helpers.

    Each mutation is first rewritten with the values from
    ``MutUtils.retrievePrecedingBasesForInsertions`` and then passed to
    ``MutUtils.retrievePrecedingBaseFromAnnotationForInsertions``, which must
    hand back the original start, ref allele and alt allele.
    """
    chrom = "1"
    start = 1234567
    end = 1234567  # incorrect, but doesn't matter for the purposes of testing
    ref_allele = "GTC"
    build = "19"

    for alt_allele in ("GTCT", "GTCTT"):
        mut = MutationData(chrom, start, end, ref_allele, alt_allele, build)
        preceding_bases, stripped_alt, shifted_start, shifted_end = \
            MutUtils.retrievePrecedingBasesForInsertions(mut)
        mut.ref_allele = "-"
        mut.alt_allele = stripped_alt
        mut.start = shifted_start
        mut.end = shifted_end
        mut.createAnnotation(
            annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME,
            annotationValue=preceding_bases)

        restored_ref, restored_alt, restored_start = \
            MutUtils.retrievePrecedingBaseFromAnnotationForInsertions(mut)
        self.assertTrue(
            restored_start == start,
            "Mut start should be %s but was %s." % (start, restored_start))
        self.assertTrue(
            restored_ref == ref_allele,
            "Ref allele should be %s but was %s." % (ref_allele, restored_ref))
        self.assertTrue(
            restored_alt == alt_allele,
            "Alt allele should be %s but was %s." % (alt_allele, restored_alt))
def initializeMutFromAttributes(chr, start, end, ref_allele, alt_allele, build):
    """Create a MutationData and attach its preceding-bases annotation.

    ``chr``, ``start``, ``end`` and ``build`` are converted with ``str``
    before the mutation is constructed.  The variant type is inferred from
    the mutation's ref and alt alleles:

    * xNP types (as judged by ``TranscriptProviderUtils.is_xnp``) get an
      empty preceding-bases annotation.
    * deletions take ref allele, start and end from
      ``MutUtils.retrievePrecedingBasesForDeletions``; alt becomes ``"-"``.
    * insertions take alt allele, start and end from
      ``MutUtils.retrievePrecedingBasesForInsertions``; ref becomes ``"-"``.

    Every updated field is written both as an attribute and through item
    assignment on the mutation.

    :return: the (possibly updated) MutationData instance
    """
    mut = MutationData(str(chr), str(start), str(end), ref_allele, alt_allele, str(build))
    varType = TranscriptProviderUtils.infer_variant_type(mut.ref_allele, mut.alt_allele)

    if TranscriptProviderUtils.is_xnp(varType):  # Snps and other xNPs
        mut.createAnnotation(annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME,
                             annotationValue="")

    # (preceding bases, ref, alt, start, end) for indels; None otherwise.
    indel_values = None
    if varType == VariantClassification.VT_DEL:  # deletion
        bases, new_ref, new_start, new_end = \
            MutUtils.retrievePrecedingBasesForDeletions(mut)
        indel_values = (bases, new_ref, "-", new_start, new_end)
    elif varType == VariantClassification.VT_INS:  # insertion
        bases, new_alt, new_start, new_end = \
            MutUtils.retrievePrecedingBasesForInsertions(mut)
        indel_values = (bases, "-", new_alt, new_start, new_end)

    if indel_values is not None:
        bases = indel_values[0]
        field_names = ("ref_allele", "alt_allele", "start", "end")
        # Keep the attribute view and the item view of each field in step.
        for field_name, field_value in zip(field_names, indel_values[1:]):
            setattr(mut, field_name, field_value)
            mut[field_name] = field_value
        mut.createAnnotation(annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME,
                             annotationValue=bases)

    return mut
def test_continuous_exons_in_segments(self):
    """Test that all exons are accounted when annotating adjacent segments that skip an exon.

    The start of the first segment must resolve to ``(10, '+')`` and the end
    of the second segment to ``(8, '-')`` on the transcript chosen for the
    first segment's start.
    """
    # Reference rows (presumably the segment records this test was modelled on):
    # SPECC1L 10+	22	24734447	SPECC1L	10+	41783674	TEF	1-	1215.0	-0.04975556624325125	hg19	CESC.TCGA.BI.A0VR.Tumor.SM.1RACM
    # SPECC1L 8-	22	16282318	POTEH	2-	24730543	SPECC1L	8-	433.0	-0.00781166374668759	hg19	CESC.TCGA.BI.A0VR.Tumor.SM.1RACM
    # SPECC1L-ADORA2A	22	24734447	SPECC1L	10+	41783674	TEF	1-	1215.0	-0.04975556624325125	hg19	CESC.TCGA.BI.A0VR.Tumor.SM.1RACM
    seg1 = MutationData()
    seg1.chr = "22"
    seg1.start = "24734447"  # Just passed the exon 9 (0-based)
    seg1.end = "41783674"

    seg2 = MutationData()
    seg2.chr = "22"
    seg2.start = "16282318"
    seg2.end = "24730543"  # Just passed the exon 8 (0-based)

    segs = [seg1, seg2]

    # 'ENST00000314328.9' for GENCODE v19
    chosen_tx, transcript_ds = self._get_chosen_tx_and_transcript_ds(seg1.chr, seg1.start)

    start_hit = transcript_ds._determine_exons_affected_by_start(seg1.start, chosen_tx)
    self.assertTrue(start_hit == (10, '+'))

    end_hit = transcript_ds._determine_exons_affected_by_end(seg2.end, chosen_tx)
    self.assertTrue(end_hit == (8, '-'))
def testRetrievePrecedingBasesForInsertions(self):
    """Check the values from ``MutUtils.retrievePrecedingBasesForInsertions``.

    Two insertions sharing the ref allele ``GTC`` are rewritten with the
    helper's output.  Afterwards each mutation must carry the
    ``_preceding_bases`` annotation, start 1234569, end 1234570, ref allele
    ``-`` and only the expected alt allele.

    The failure messages are now derived from the expected values; they
    previously quoted positions (1234570 / 1234571) that differed from the
    values actually asserted.
    """
    chrom = "1"
    start = 1234567
    end = 1234567  # incorrect, but doesn't matter for the purposes of testing
    ref_allele = "GTC"
    build = "19"
    expected_start = 1234569
    expected_end = 1234570

    # (alt allele passed in, alt allele expected after the rewrite)
    for alt_allele, expected_alt in (("GTCT", "T"), ("GTCTT", "TT")):
        mut = MutationData(chrom, start, end, ref_allele, alt_allele, build)
        preceding_bases, updated_alt_allele, updated_start, updated_end = \
            MutUtils.retrievePrecedingBasesForInsertions(mut)
        mut.ref_allele = "-"
        mut.alt_allele = updated_alt_allele
        mut.start = updated_start
        mut.end = updated_end
        mut.createAnnotation(
            annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME,
            annotationValue=preceding_bases)

        self.assertTrue("_preceding_bases" in mut,
                        "_preceding_bases is missing in the mutation data.")
        self.assertTrue(
            mut.start == expected_start,
            "Mut start should be %s but was %s." % (expected_start, mut.start))
        self.assertTrue(
            mut.end == expected_end,
            "Mut end should be %s but was %s." % (expected_end, mut.end))
        self.assertTrue(mut.ref_allele == "-",
                        "Ref allele should be - but was %s." % mut.ref_allele)
        self.assertTrue(
            mut.alt_allele == expected_alt,
            "Alt allele should be %s but was %s." % (expected_alt, mut.alt_allele))
def testAnnotateListOfMutations(self):
    """Test that we can initialize an Annotator, without an input or output and then feed mutations, one at a time... using a runspec

    A single chr1 mutation is annotated with the datasources found under the
    configured ``dbDir``; the first annotated result must have a ``gene``.
    """
    # Locate the datasource directory and create a runspec
    dbDir = self.config.get("DEFAULT", "dbDir")
    ds = DatasourceFactory.createDatasources(dbDir)
    runSpec = RunSpecification()
    runSpec.initialize(None, None, datasources=ds)

    # Initialize the annotator with the runspec
    annotator = Annotator()
    annotator.initialize(runSpec)

    m = MutationData()
    m.chr = "1"
    m.start = "12941796"
    m.end = "12941796"
    m.alt_allele = "G"
    m.ref_allele = "T"

    annotated = annotator.annotate_mutations([m])
    # Builtin next() replaces the Python-2-only ``.next()`` method call and
    # works with the iterator protocol of both Python 2.6+ and Python 3.
    first = next(annotated)
    self.assertTrue(first.get("gene", None) is not None)
def testFlank(self):
    """Test that we can see a Flank mutation.

    Annotates every position of a 26-base window starting at chr1:11042200
    and prints each variant classification with its position.

    NOTE(review): nothing is asserted here, so the test only fails when
    annotation raises -- confirm which classification is expected and
    assert on it.
    """
    # NOTE(review): the original comment quoted chr1:28,233,780-28,233,805
    # (junction at 28,233,793 & 94), which does not match the window used
    # below -- presumably copied from another test; confirm.
    refs = "TGGGCTCGGGCTCTCTGAAAAGAAAA"
    alts = "TGGGCTCAGGCTCTCTGAAAAGAAAA"
    gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)
    startWindow = 11042200

    for offset, (ref_base, alt_base) in enumerate(zip(refs, alts)):
        position = startWindow + offset
        m = MutationData()
        m.start = str(position)
        m.end = str(position)
        m.chr = "1"
        m.ref_allele = ref_base
        m.alt_allele = alt_base
        m = gafDatasource.annotate_mutation(m)
        vc = m['variant_classification']
        # Single-argument print() behaves the same as the print statement on
        # Python 2 and is also valid on Python 3.
        print(vc + " " + m.start)
def testMixedAnnotation(self):
    """Test that the COSMIC datasource can retrieve entries by both gp and gpp.

    Uses the small tabix fixtures under
    ``testdata/small_cosmic_with_gp_and_gpp/`` and expects three overlapping
    mutations, all with primary site lung.
    """
    tabixDir = "testdata/small_cosmic_with_gp_and_gpp/"
    genomic_index = tabixDir + "small_cosmic_trimmed_for_sorting.txt.tbi.gz"
    protein_index = tabixDir + "small_cosmic_trimmed_for_sorting.txt.tbi.byAA.sorted.tsv.gz"
    cosmicDS = Cosmic(src_file=genomic_index, title="Cosmic", version="test",
                      gpp_tabix_file=protein_index)

    # These values are not taken from a real world scenario, but are cooked
    # for this test.
    # Line 9 should get picked up by genomic coords.
    # Lines 7,8 should get picked up by the protein position.
    m = MutationData()
    for annotation_name, annotation_value in (
            ("gene", "A2M"),
            ("transcript_protein_position_start", "1300"),
            ("transcript_protein_position_end", "1400")):
        m.createAnnotation(annotation_name, annotation_value)
    m.chr = '12'
    m.start = '9227220'
    m.end = '9227230'

    m = cosmicDS.annotate_mutation(m)

    self.assertTrue(m['COSMIC_n_overlapping_mutations'] == '3')

    found_at = m['COSMIC_overlapping_mutation_AAs'].find('1229')
    self.assertTrue(found_at != -1,
                    "Could not find the entry specified by genomic coords.")

    self.assertTrue(
        m['COSMIC_overlapping_primary_sites'] == "lung(3)",
        "Did not have the correct primary sites annotation (lung(3)): " +
        m['COSMIC_overlapping_primary_sites'])
def test_validation_correction_valid(self):
    """Test that the validation allele fields are determined automatically when not specified by the user for a valid mutation.

    A chr3 G>A mutation with validation status "Valid" and blank validation
    alleles is rendered to a TCGA MAF file, which is then read back and
    checked row by row.
    """
    m = MutationData()
    m.chr = "3"
    m.start = "178948145"
    m.end = "178948145"
    m.alt_allele = "A"
    m.ref_allele = "G"
    m['validation_status'] = "Valid"
    for blank_column in ('Match_Norm_Validation_Allele1',
                         'Match_Norm_Validation_Allele2',
                         'Tumor_Validation_Allele1',
                         'Tumor_Validation_Allele2'):
        m[blank_column] = ""
    m['Mutation_Status'] = "Somatic"

    output_filename = os.path.join("out", "test_validation_correction2.maf.tsv")
    config_path = os.path.join("configs", "tcgaMAF2.4_output.config")
    outputRenderer = TcgaMafOutputRenderer(output_filename, configFile=config_path)
    outputRenderer.renderMutations(iter([m]))

    # (left column, right column, failure message) pairs that must be equal.
    column_pairs = (
        ('Match_Norm_Validation_Allele1', 'Match_Norm_Validation_Allele2',
         "Matched norm alleles did not match."),
        ('Tumor_Validation_Allele1', 'Reference_Allele',
         "Tumor validation allele 1 did not match reference for a valid validation result."),
        ('Tumor_Validation_Allele2', 'Tumor_Seq_Allele2',
         "Tumor validation allele 2 did not match Tumor_Seq_Allele2 for a valid validation result."),
        ('Match_Norm_Validation_Allele1', 'Tumor_Validation_Allele1',
         "Tumor allele 1 did not match normal alleles for a valid validation result."),
    )

    tsv_reader = GenericTsvReader(output_filename)
    for line_dict in tsv_reader:
        for left_column, right_column, failure_message in column_pairs:
            self.assertTrue(line_dict[left_column] == line_dict[right_column],
                            failure_message)

        self.assertTrue(
            line_dict['Match_Norm_Validation_Allele1'] == line_dict['Reference_Allele'],
            "Norm validation alleles did not match reference (norm, reference): (%s, %s)"
            % (line_dict['Match_Norm_Validation_Allele1'], line_dict['Reference_Allele']))
        self.assertTrue(
            "G" == line_dict['Reference_Allele'],
            "Reference allele should have been G, but was " + line_dict['Reference_Allele'])
        self.assertTrue(
            "A" == line_dict['Tumor_Seq_Allele2'],
            "Alt allele should have been A, but was " + line_dict['Tumor_Seq_Allele2'])
def testdbNSFPNoRefAltAnnotationWithExactMatch(self):
    """Annotate chr1:35140 from the small dbNSFP fixture and compare three annotations.

    No ref or alt allele is set on the mutation.
    """
    self.logger.info("Initializing dbNSFP")
    tabixIndexedTsvDirName = os.path.join(
        "testdata", "dbNSFP_chr1_chr3_100vars_exact_no_ref_alt_ds", "hg19")
    config_path = os.path.join(
        tabixIndexedTsvDirName, "dbNSFP_chr1_chr3_100vars_exact_no_ref_alt_ds.config")
    tabixIndexedTsvDatasource = DatasourceFactory.createDatasource(
        config_path, tabixIndexedTsvDirName)

    m1 = MutationData()
    m1.chr = "1"
    m1.start = "35140"
    m1.end = "35140"

    m1_annotated = tabixIndexedTsvDatasource.annotate_mutation(m1)

    # (annotation name, expected value)
    expected_annotations = (
        ("dbNSFP_codonpos", "1|1|1"),
        ("dbNSFP_refcodon", "TAA|TAA|TAA"),
        ("dbNSFP_cds_strand", "-|-|-"),
    )
    for annotation_name, expected_value in expected_annotations:
        actual = m1_annotated.getAnnotation(annotation_name)
        expected = Annotation(value=expected_value, datasourceName="dbNSFP",
                              dataType="String", description="",
                              tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
                              number=None)
        self.assertTrue(actual.isEqual(expected), "Annotations do not match.")
def testSilentMutationGoingToSpliceSite(self):
    """Test that a silent mutation within 10 bp of a splice junction should become a splice site

    Annotates every base of a 26-base chr1 window and expects exactly 4
    Splice_Site and 11 Silent classifications (compared case-insensitively).
    """
    # chr1:28,233,780-28,233,805; junction is at chr1:28,233,793 & 94
    refs = "TGGGCTCGGGCTCTCTGAAAAGAAAA"
    alts = "TGGGCTCAGGCTCGCTGAAAAGAAAA"
    vcs = []
    gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)
    startWindow = 28233780

    for offset, (ref_base, alt_base) in enumerate(zip(refs, alts)):
        position = startWindow + offset
        m = MutationData()
        m.start = str(position)
        m.end = str(position)
        m.chr = "1"
        m.ref_allele = ref_base
        m.alt_allele = alt_base
        m = gafDatasource.annotate_mutation(m)

        distanceFromSpliceSite = abs(28233793 - int(m.start))
        vc = m['variant_classification']
        vcs.append(vc)
        print(vc + " " + m.start + " " + str(distanceFromSpliceSite))

    # Tally once the whole window has been annotated.
    numSpliceSites = sum(1 for seen in vcs if seen.lower() == "splice_site")
    numSilent = sum(1 for seen in vcs if seen.lower() == "silent")

    self.assertTrue(numSpliceSites == 4,
                    "Should have seen 4 splice site mutations, but saw: " + str(numSpliceSites))
    self.assertTrue(numSilent == 11,
                    "Should have seen 11 Silent mutations, but saw: " + str(numSilent))
def testESPCoverageAnnotationWithSNPAvgMatch(self):
    """Annotate chrX:100075334 from the small ESP coverage fixture and compare three annotations."""
    self.logger.info("Initializing ESP6500SI-V2 Coverage")
    tabixIndexedTsvDirName = os.path.join(
        "testdata", "small_esp_coverage_avg_ds", "hg19")
    config_path = os.path.join(
        tabixIndexedTsvDirName, "small_esp_coverage_avg_ds.config")
    tabixIndexedTsvDatasource = DatasourceFactory.createDatasource(
        config_path, tabixIndexedTsvDirName)

    m1 = MutationData()
    m1.chr = "X"
    m1.start = "100075334"
    m1.end = "100075334"

    m1_annotated = tabixIndexedTsvDatasource.annotate_mutation(m1)

    # (annotation name, expected value, expected data type)
    expected_annotations = (
        ("ESP_AvgAAsampleReadDepth", "75.0", "Float"),
        ("ESP_TotalAAsamplesCovered", "692.0", "Float"),
        ("ESP_Chromosome", "X", "String"),
    )
    for annotation_name, expected_value, expected_type in expected_annotations:
        actual = m1_annotated.getAnnotation(annotation_name)
        expected = Annotation(value=expected_value, datasourceName="ESP",
                              dataType=expected_type, description="",
                              tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
                              number=None)
        self.assertTrue(actual.isEqual(expected), "Annotations do not match.")
def testSpliceSiteWithinNBases(self):
    """Test that a silent mutation is changed to splice site w/in 10 bases of a splice site

    Annotates a 21-base chr21 window: no position may be Silent, entries
    8-11 must be Splice_Site, and entries 0-7 and 12-19 must not be.
    """
    # chr21:10,998,326-10,998,346
    # 10,998,336 is a splice site.  (Junction between 10998335 and 336)
    # AGTTCTCCTT C TGGAAAAAAG
    refs = 'AGTTCTCCTTCTGGAAAAAAG'
    alts = 'TCAGACTGAAAATACCCCCCT'
    window_start = 10998326
    gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)
    vcs = []

    for offset, (ref_base, alt_base) in enumerate(zip(refs, alts)):
        position = window_start + offset
        m = MutationData()
        m.start = str(position)
        m.end = str(position)
        m.chr = "21"
        m.ref_allele = ref_base
        m.alt_allele = alt_base
        m = gafDatasource.annotate_mutation(m)

        distanceFromSpliceSite = abs(10998336 - int(m.start))
        vc = m['variant_classification']
        self.assertTrue(vc != 'Silent',
                        'Silent mutation found when it should be a splice site.')
        vcs.append(vc)
        print(vc + " " + m.start)

    near_junction = vcs[8:12]
    upstream = vcs[0:8]
    # NOTE(review): this slice stops at index 19, so the 21st entry
    # (index 20) is never checked -- confirm whether that is intended.
    downstream = vcs[12:20]

    self.assertTrue(all(seen == "Splice_Site" for seen in near_junction),
                    "Not all vcs within 2 bases were splice site: " + str(near_junction))
    self.assertTrue(all(seen != "Splice_Site" for seen in upstream),
                    "No splice sites should be seen: " + str(upstream))
    self.assertTrue(all(seen != "Splice_Site" for seen in downstream),
                    "No splice sites should be seen: " + str(downstream))
def testFlank(self):
    """Test that we can see a Flank mutation.

    Walks a 26-base window that begins at chr1:11042200, annotating one
    single-base mutation per position and printing its variant
    classification next to the position.

    NOTE(review): the test contains no assertion, so it can only fail when
    annotation raises -- confirm the expected classification and assert it.
    """
    # NOTE(review): the original comment cited chr1:28,233,780-28,233,805
    # with a junction at 28,233,793 & 94; that range does not match the
    # window below and was presumably copied from another test -- confirm.
    refs = "TGGGCTCGGGCTCTCTGAAAAGAAAA"
    alts = "TGGGCTCAGGCTCTCTGAAAAGAAAA"
    gafDatasource = TestUtils.createTranscriptProviderDatasource(
        self.config)
    startWindow = 11042200

    for index, ref_base in enumerate(refs):
        position = str(startWindow + index)
        m = MutationData()
        m.start = position
        m.end = position
        m.chr = "1"
        m.ref_allele = ref_base
        m.alt_allele = alts[index]
        m = gafDatasource.annotate_mutation(m)
        # Single-argument print() matches the print statement on Python 2
        # and stays valid on Python 3.
        print(m['variant_classification'] + " " + m.start)
def testESPCoverageAnnotationWithMissingAnnotationValuesIndelAvgMatch(
        self):
    """Annotate the chrX:100075350-100075356 range from the small ESP coverage fixture.

    The ``ESP_AvgSampleReadDepth`` annotation must equal the Float value
    "91.25".
    """
    self.logger.info("Initializing ESP6500SI-V2 Coverage")
    tabixIndexedTsvDirName = os.path.join(
        "testdata", "small_esp_coverage_avg_ds", "hg19")
    config_path = os.path.join(tabixIndexedTsvDirName,
                               "small_esp_coverage_avg_ds.config")
    tabixIndexedTsvDatasource = DatasourceFactory.createDatasource(
        config_path, tabixIndexedTsvDirName)

    m1 = MutationData()
    m1.chr = "X"
    m1.start = "100075350"
    m1.end = "100075356"

    m1_annotated = tabixIndexedTsvDatasource.annotate_mutation(m1)
    actual = m1_annotated.getAnnotation("ESP_AvgSampleReadDepth")
    expected = Annotation(
        value="91.25",
        datasourceName="ESP",
        dataType="Float",
        description="",
        tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
        number=None)
    self.assertTrue(actual.isEqual(expected), "Annotations do not match.")
def testRetrievePrecedingBaseFromAnnotationForInsertions(self):
    """Round-trip two insertions through the preceding-bases helpers.

    After a mutation has been rewritten with the output of
    ``MutUtils.retrievePrecedingBasesForInsertions``,
    ``MutUtils.retrievePrecedingBaseFromAnnotationForInsertions`` must return
    the original ref allele, alt allele and start.
    """
    chrom = "1"
    start = 1234567
    end = 1234567  # incorrect, but doesn't matter for the purposes of testing
    ref_allele = "GTC"
    build = "19"

    def check_round_trip(alt_allele):
        mut = MutationData(chrom, start, end, ref_allele, alt_allele, build)
        bases, trimmed_alt, moved_start, moved_end = \
            MutUtils.retrievePrecedingBasesForInsertions(mut)
        mut.ref_allele = "-"
        mut.alt_allele = trimmed_alt
        mut.start = moved_start
        mut.end = moved_end
        mut.createAnnotation(annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME,
                             annotationValue=bases)

        got_ref, got_alt, got_start = \
            MutUtils.retrievePrecedingBaseFromAnnotationForInsertions(mut)
        self.assertTrue(got_start == start,
                        "Mut start should be %s but was %s." % (start, got_start))
        self.assertTrue(got_ref == ref_allele,
                        "Ref allele should be %s but was %s." % (ref_allele, got_ref))
        self.assertTrue(got_alt == alt_allele,
                        "Alt allele should be %s but was %s." % (alt_allele, got_alt))

    check_round_trip("GTCT")
    check_round_trip("GTCTT")
def test_validation_correction(self):
    """Test that the validation allele fields are determined automatically when not specified by the user for invalid mutation.

    A chr3 G>A mutation with validation status "Invalid" and blank
    validation alleles is rendered to a TCGA MAF file, which is then read
    back and checked row by row.
    """
    m = MutationData()
    m.chr = "3"
    m.start = "178948145"
    m.end = "178948145"
    m.alt_allele = "A"
    m.ref_allele = "G"
    m['validation_status'] = "Invalid"
    for blank_column in ('Match_Norm_Validation_Allele1',
                         'Match_Norm_Validation_Allele2',
                         'Tumor_Validation_Allele1',
                         'Tumor_Validation_Allele2'):
        m[blank_column] = ""
    m['Mutation_Status'] = "Somatic"

    output_filename = os.path.join("out",
                                   "test_validation_correction1.maf.tsv")
    config_path = os.path.join("configs", "tcgaMAF2.4_output.config")
    outputRenderer = TcgaMafOutputRenderer(output_filename,
                                           configFile=config_path)
    outputRenderer.renderMutations(iter([m]))

    # (left column, right column, failure message) pairs that must be equal.
    column_pairs = (
        ('Match_Norm_Validation_Allele1', 'Match_Norm_Validation_Allele2',
         "Matched norm alleles did not match."),
        ('Tumor_Validation_Allele1', 'Tumor_Validation_Allele2',
         "Tumor alleles did not match for an invalid validation result."),
        ('Match_Norm_Validation_Allele1', 'Tumor_Validation_Allele2',
         "Tumor alleles did not match normal alleles for an invalid validation result."),
    )

    tsv_reader = GenericTsvReader(output_filename)
    for line_dict in tsv_reader:
        for left_column, right_column, failure_message in column_pairs:
            self.assertTrue(
                line_dict[left_column] == line_dict[right_column],
                failure_message)

        self.assertTrue(
            line_dict['Match_Norm_Validation_Allele1'] ==
            line_dict['Reference_Allele'],
            "Norm validation alleles did not match reference (norm, reference): (%s, %s)"
            % (line_dict['Match_Norm_Validation_Allele1'],
               line_dict['Reference_Allele']))
        self.assertTrue(
            "G" == line_dict['Reference_Allele'],
            "Reference allele should have been G, but was " +
            line_dict['Reference_Allele'])
        self.assertTrue(
            "None" == line_dict['Mutation_Status'],
            "Mutation Status must be None when Validation Status is Invalid: "
            + line_dict['Mutation_Status'])
def test_start_codon(self):
    """Test a start codon hit in a GAF datasource.

    The A>T SNP at chr22:22221729 must be classified as
    ``VariantClassification.MISSENSE``.
    """
    gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)
    position = str(22221729)

    m = MutationData()
    m.start = position
    m.end = position
    m.chr = "22"
    m.ref_allele = 'A'
    m.alt_allele = 'T'

    annotated = gafDatasource.annotate_mutation(m)
    classification = annotated['variant_classification']
    self.assertTrue(classification == VariantClassification.MISSENSE)
def testAKT1(self):
    """Test that this version of the GAF produces the up to date gene for a position given from a website user.

    The G>A SNP at chr14:105246407 must be annotated with gene AKT1.
    """
    position = '105246407'
    m = MutationData()
    m.chr = '14'
    m.start = position
    m.end = position
    m.ref_allele = 'G'
    m.alt_allele = 'A'

    gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)
    m = gafDatasource.annotate_mutation(m)

    failure_message = ("Incorrect gene found: " + m['gene'] +
                       " If updating GAF, this may not be an error, but should be confirmed manually.")
    self.assertTrue(m['gene'] == "AKT1", failure_message)
def testRetrievePrecedingBasesForInsertions(self):
    """Check the allele and coordinate updates computed for insertions.

    Two insertions following the reference bases GTC are tested (alt alleles
    GTCT and GTCTT).  For each one, the values returned by
    MutUtils.retrievePrecedingBasesForInsertions are applied to the mutation
    and the resulting start, end, ref_allele and alt_allele are checked, along
    with the presence of the "_preceding_bases" annotation.
    """
    chrom = "1"
    start = 1234567
    end = 1234567  # incorrect, but doesn't matter for the purposes of testing
    ref_allele = "GTC"
    build = "19"

    # (input alt allele, alt allele expected once the shared leading bases are removed)
    cases = (("GTCT", "T"), ("GTCTT", "TT"))
    for alt_allele, expected_alt in cases:
        mut = MutationData(chrom, start, end, ref_allele, alt_allele, build)
        preceding_bases, updated_alt_allele, updated_start, updated_end = \
            MutUtils.retrievePrecedingBasesForInsertions(mut)
        mut.ref_allele = "-"
        mut.alt_allele = updated_alt_allele
        mut.start = updated_start
        mut.end = updated_end
        mut.createAnnotation(annotationName=MutUtils.PRECEDING_BASES_ANNOTATION_NAME,
                             annotationValue=preceding_bases)

        self.assertTrue("_preceding_bases" in mut,
                        "_preceding_bases is missing in the mutation data.")
        # The failure messages now state the same values that are asserted.
        self.assertTrue(mut.start == 1234569,
                        "Mut start should be 1234569 but was %s." % mut.start)
        self.assertTrue(mut.end == 1234570,
                        "Mut end should be 1234570 but was %s." % mut.end)
        self.assertTrue(mut.ref_allele == "-",
                        "Ref allele should be - but was %s." % mut.ref_allele)
        self.assertTrue(mut.alt_allele == expected_alt,
                        "Alt allele should be %s but was %s." % (expected_alt, mut.alt_allele))
def testdbNSFPAnnotationWithMissingExactMatch(self):  # SNPs only
    """Annotate a SNP (chr1:35138 T>C) with the small dbNSFP test datasource.

    Three dbNSFP annotations are expected to come back with an empty value
    and the data types listed below.
    """
    self.logger.info("Initializing dbNSFP")
    ds_dir = os.path.join("testdata", "dbNSFP_chr1_6vars_exact_ds", "hg19")
    ds_config = os.path.join(ds_dir, "dbNSFP_chr1_6vars_exact_ds.config")
    dbnsfp_ds = DatasourceFactory.createDatasource(ds_config, ds_dir)

    snp = MutationData()
    snp.chr = "1"
    snp.start = "35138"
    snp.end = "35138"
    snp.ref_allele = "T"
    snp.alt_allele = "C"
    annotated = dbnsfp_ds.annotate_mutation(snp)

    # (annotation name, expected data type) -- checked in this order.
    expected_types = (
        ("dbNSFP_codonpos", "Integer"),
        ("dbNSFP_refcodon", "String"),
        ("dbNSFP_cds_strand", "Float"),
    )
    for annotation_name, data_type in expected_types:
        actual = annotated.getAnnotation(annotation_name)
        expected = Annotation(value="", datasourceName="dbNSFP", dataType=data_type,
                              description="",
                              tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
                              number=None)
        self.assertTrue(actual.isEqual(expected), "Annotations do not match.")
def testBasicAnnotation(self):
    """Annotate a chr18 SNP from the small COSMIC tsv and expect a truthy disease-count annotation."""
    cosmic_tsv = 'testdata/small_cosmic_2/cosmic_v65_chr18.tsv'
    ds = GenericGenomicMutationDatasource(cosmic_tsv)

    snp = MutationData()
    snp.chr = '18'
    snp.start = '48604683'
    snp.end = '48604683'
    snp.ref_allele = 'G'
    snp.alt_allele = 'A'
    snp.createAnnotation('strand', '+')

    annotated = ds.annotate_mutation(snp)
    disease_counts = annotated['_cosmic_muts_disease_counts']
    self.assertTrue(disease_counts, 'Unable to annotate mutation correctly')
def testMC1R(self):
    """Check the gene this version of the GAF reports for chr16:89985913 G>A.

    The test currently asserts TUBB3; at some point MC1R is expected instead.
    """
    snp = MutationData()
    snp.chr = '16'
    snp.start = '89985913'
    snp.end = '89985913'
    snp.ref_allele = 'G'
    snp.alt_allele = 'A'

    transcript_ds = TestUtils.createTranscriptProviderDatasource(self.config)
    annotated = transcript_ds.annotate_mutation(snp)

    # At some point, we would expect this to be MC1R, not TUBB3
    is_expected_gene = annotated['gene'] == "TUBB3"
    failure_msg = ("Incorrect gene found: " + annotated['gene'] +
                   " If updating GAF, this may not be an error, but should be confirmed manually.")
    self.assertTrue(is_expected_gene, failure_msg)
def test_simple_annotate(self):
    """Annotate one SNP from a datasource built from the small dbNSFP tsv and check one score."""
    source_tsv = "testdata/small_tsv_leveldb/dbNSFP2.4_variant.chr1_cut5000.tsv"
    output_dir = os.path.abspath("out/test_simple_annotate_snp_only_leveldb")
    index_columns = ["chr", "pos(1-coor)", "pos(1-coor)", "ref", "alt"]
    ds = self._create_test_ds(source_tsv, output_dir, index_columns)

    # 1 35138 T A
    snp = MutationData()
    snp.chr = "1"
    snp.start = "35138"
    snp.end = "35138"
    snp.ref_allele = "T"
    snp.alt_allele = "A"

    annotated = ds.annotate_mutation(snp)
    rankscore = annotated['phyloP100way_vertebrate_rankscore']
    self.assertTrue(rankscore == "0.19875")
def testESPCoverageAnnotationWithMissingIndelOverlapMatch(self):
    """Annotate the interval X:100075300-100075336 with the small ESP coverage datasource.

    No alleles are set on the mutation.  Three ESP annotations are expected
    to come back as pipe-delimited String values.
    """
    self.logger.info("Initializing ESP6500SI-V2 Coverage")
    ds_dir = os.path.join("testdata", "small_esp_coverage_overlap_ds", "hg19")
    ds_config = os.path.join(ds_dir, "small_esp_coverage_overlap_ds.config")
    esp_ds = DatasourceFactory.createDatasource(ds_config, ds_dir)

    region = MutationData()
    region.chr = "X"
    region.start = "100075300"
    region.end = "100075336"
    annotated = esp_ds.annotate_mutation(region)

    # (annotation name, expected value) -- checked in this order.
    expected_values = (
        ("ESP_AvgAAsampleReadDepth", "75.0|81.0|81.0"),
        ("ESP_TotalAAsamplesCovered", "692|692|692"),
        ("ESP_Chromosome", "X|X|X"),
    )
    for annotation_name, value in expected_values:
        actual = annotated.getAnnotation(annotation_name)
        expected = Annotation(value=value, datasourceName="ESP", dataType="String",
                              description="",
                              tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
                              number=None)
        self.assertTrue(actual.isEqual(expected), "Annotations do not match.")
def test_small_positive_strand_transcript_change(self):
    """Check the rendered transcript change for one SNP on each strand."""
    ds = TestUtils._create_test_gencode_ds("out/small_positive_strand_")

    # Now for a negative strand
    minus_snp = MutationData()
    minus_snp.chr = "22"
    minus_snp.start = "22221730"
    minus_snp.end = "22221730"
    minus_snp.ref_allele = "T"
    minus_snp.alt_allele = "G"
    minus_annotated = ds.annotate_mutation(minus_snp)
    self.assertTrue(minus_annotated['transcript_change'] == "c.1A>C",
                    "Incorrect transcript change: " + minus_annotated['transcript_change'])

    # positive strand
    plus_snp = MutationData()
    plus_snp.chr = "3"
    plus_snp.start = "178916614"
    plus_snp.end = "178916614"
    plus_snp.ref_allele = "G"
    plus_snp.alt_allele = "T"
    plus_annotated = ds.annotate_mutation(plus_snp)
    self.assertTrue(plus_annotated['transcript_change'] == "c.1G>T",
                    "Incorrect transcript change: " + plus_annotated['transcript_change'])
def createMutations(self):
    """Yield one mutation per line of the maflite file.

    No inputs.  Returns a generator of mutations built from the specified
    maflite file.  Every column becomes an 'INPUT' annotation on the
    mutation; a column whose name is an alias in the reverse-alias
    dictionary is stored under the real field name instead.
    """
    alias_to_real = self._reverseAlternativeDict
    all_columns = self._tsvReader.getFieldNames()

    for line in self._tsvReader:
        # We only need to assign fields that are mutation attributes and have
        # a different name in the maflite file.
        mut = MutationData(build=self._build)

        for col in all_columns:
            # Three scenarios:
            #   1) col is name of mutation data field -- simple createAnnotation
            #   2) col name is an alias for a mutation data field -- do lookup then createAnnotation
            #   3) col name is not an alias for a mutation data field -- simple createAnnotation
            if col in alias_to_real:
                # Scenario 2
                key = alias_to_real[col]
                self.logger.debug(key + " found from " + col)
            else:
                # Scenario 1 and 3
                key = col

            # Make sure to convert chromosome values.
            val = line[col]
            if key == "chr":
                val = MutUtils.convertChromosomeStringToMutationDataFormat(val)
            mut.createAnnotation(key, val, 'INPUT')

        # Remove any surrounding whitespace if present.
        mut.ref_allele, mut.alt_allele = mut.ref_allele.strip(), mut.alt_allele.strip()

        # If the alt allele == ref_allele, check that this is not a case where
        # there is an alt_allele2 that is different.
        if mut.alt_allele == mut.ref_allele:
            mut.alt_allele = self._find_alt_allele_in_other_field(line, mut.ref_allele)

        # FIXME: Support more than one alias in the reverse dictionary.  Then this can be removed.
        # Compare by value: identity checks ("is") against a string literal
        # depend on interpreter string interning and are not reliable.
        if mut.start != "" and mut.end == "":
            mut.end = mut.start
        if mut.end != "" and mut.start == "":
            mut.start = mut.end

        yield mut
def test_small_positive_strand_transcript_change(self):
    """Check the rendered transcript change for one SNP on each strand (gencode v19 test datasource)."""
    ds = TestUtils._create_test_gencode_v19_ds("out/small_positive_strand_")

    # Now for a negative strand
    minus_snp = MutationData()
    minus_snp.chr = "22"
    minus_snp.start = "22221730"
    minus_snp.end = "22221730"
    minus_snp.ref_allele = "T"
    minus_snp.alt_allele = "G"
    minus_annotated = ds.annotate_mutation(minus_snp)
    self.assertTrue(minus_annotated['transcript_change'] == "c.1A>C",
                    "Incorrect transcript change: " + minus_annotated['transcript_change'])

    # positive strand
    plus_snp = MutationData()
    plus_snp.chr = "3"
    plus_snp.start = "178916614"
    plus_snp.end = "178916614"
    plus_snp.ref_allele = "G"
    plus_snp.alt_allele = "T"
    plus_annotated = ds.annotate_mutation(plus_snp)
    self.assertTrue(plus_annotated['transcript_change'] == "c.1G>T",
                    "Incorrect transcript change: " + plus_annotated['transcript_change'])
def testEmptyAnswer(self):
    ''' The Reference Datasource should give a blank ref_context when the chromosome is not found.

    Note: A log entry should also be written, but this is not tested.
    '''
    self.logger.info("Please ignore the next logging warning: testdata/reference_ds/chrTHIS_DOES_NOT_EXIST.txt not found. Please add it.")
    ds = ReferenceDatasource('testdata/reference_ds')

    unknown_chrom_mut = MutationData()
    unknown_chrom_mut.chr = "THIS_DOES_NOT_EXIST"
    unknown_chrom_mut.start = "11"
    unknown_chrom_mut.end = "11"

    expected_context = ""
    annotated = ds.annotate_mutation(unknown_chrom_mut)

    failure_msg = ("ref_context was not populated properly -- should be blank: "
                   + str(annotated['ref_context']))
    self.assertTrue(annotated['ref_context'] == expected_context, failure_msg)
def testBasicCosmicInit(self):
    """Create a datasource from a sample datasource directory and annotate one position.

    The directory conforms to the standard datasource structure, including
    placement of the config file.
    """
    config_path = 'testdata/small_cosmic/small_cosmic.config'
    ds = DatasourceFactory.createDatasource(config_path, "testdata/small_cosmic")

    position = MutationData()
    position.chr = 19
    position.start = 58858921
    position.end = 58858921

    annotated = ds.annotate_mutation(position)

    failure_msg = ("Did not properly annotate mutation: "
                   + annotated['COSMIC_overlapping_mutation_AAs'])
    self.assertTrue(annotated['COSMIC_overlapping_mutation_AAs'] == 'p.P426P(1)',
                    failure_msg)
def test_start_codon(self):
    """Test a start codon hit in a GAF datasource (expects a missense classification)."""
    gaf_ds = TestUtils.createTranscriptProviderDatasource(self.config)

    start_codon_snp = MutationData()
    start_codon_snp.start = str(22221729)
    start_codon_snp.end = str(22221729)
    start_codon_snp.chr = "22"
    start_codon_snp.ref_allele = 'A'
    start_codon_snp.alt_allele = 'T'

    result = gaf_ds.annotate_mutation(start_codon_snp)
    is_missense = result['variant_classification'] == VariantClassification.MISSENSE
    self.assertTrue(is_missense)
def test_simple_annotate_with_nonhuman(self):
    """Test a very simple annotation with a nonhuman genome (saccer)."""
    saccer_ds = self._create_ensembl_ds_from_saccer()

    snp = MutationData()
    snp.chr = "I"
    snp.start = "500"
    snp.end = "500"
    snp.ref_allele = "C"
    snp.alt_allele = "A"

    annotated = saccer_ds.annotate_mutation(snp)

    # Both the transcript and the gene are expected to be YAL069W.
    for field in ('annotation_transcript', 'gene'):
        self.assertTrue(annotated[field] == "YAL069W")
def testFlank2(self):
    """Test a second real-world flank scenario."""
    transcript_ds = TestUtils.createTranscriptProviderDatasource(self.config)

    # 1 228646357 nearest Gene=HIST3H2A C>T
    snp = MutationData()
    snp.start = str(228646357)
    snp.end = str(228646357)
    snp.chr = "1"
    snp.ref_allele = 'C'
    snp.alt_allele = 'T'

    annotated = transcript_ds.annotate_mutation(snp)

    gene_msg = ("Wrong gene (GT: HIST3H2A): " + annotated['gene'] +
                " -- if updating GAF, this test may fail as this gene may not be appropriate.")
    self.assertTrue(annotated['gene'] == "HIST3H2A", gene_msg)

    classification_msg = ("Should be 5'Flank, but was " + annotated['variant_classification'] +
                          " -- if updating GAF, this test may fail as this test is data specific. Also, this may fail if padding parameters are changed.")
    self.assertTrue(annotated['variant_classification'] == "5'Flank", classification_msg)
def testBasicAnnotation(self):
    """Annotate a chr18 SNP from the small COSMIC tsv and expect a truthy disease-count annotation."""
    datasource = GenericGenomicMutationDatasource(
        'testdata/small_cosmic_2/cosmic_v65_chr18.tsv')

    query = MutationData()
    query.chr = '18'
    query.start = '48604683'
    query.end = '48604683'
    query.ref_allele = 'G'
    query.alt_allele = 'A'
    query.createAnnotation('strand', '+')

    result = datasource.annotate_mutation(query)
    self.assertTrue(result['_cosmic_muts_disease_counts'],
                    'Unable to annotate mutation correctly')
def testMicroRNA(self):
    """Test proper annotation of miRNA."""
    # uc021qwk.1 chr12:31379258-31379277:- hsa-miR-3194-3p|? chr12:31379258-31379277:- Confidence=100
    transcript_ds = TestUtils.createTranscriptProviderDatasource(self.config)

    snp = MutationData()
    snp.start = 31379268
    snp.end = 31379268
    snp.chr = "12"
    snp.alt_allele = 'G'
    # This is accurate
    snp.ref_allele = 'A'

    annotated = transcript_ds.annotate_mutation(snp)

    # The gene name is compared case-insensitively.
    failure_msg = ("Wrong gene (GT: hsa-mir-3194-3p): " + annotated['gene'] +
                   " -- if updating GAF, this test may fail as this result may not be appropriate.")
    self.assertTrue(annotated['gene'].lower() == "hsa-mir-3194-3p", failure_msg)
def testSimpleGLAnnotate(self):
    ''' Test a simple annotation case on the GL000211.1 contig.

    Make sure that the ref_context and gc_content annotations are correct.
    '''
    ds = ReferenceDatasource('testdata/reference_ds', windowSizeGCContent=5)
    m = MutationData()
    m.chr = "GL000211.1"
    m.start = "11"
    m.end = "11"

    groundTruth = "gaattctttttcaagtaagtc"

    guess = ds.annotate_mutation(m)

    # The failure messages are built eagerly, so they read from the annotated
    # result (guess) -- the same object the assertions inspect -- not from the
    # input mutation.
    self.assertTrue(guess['ref_context'] == groundTruth,
                    "ref_context was not populated properly: " + str(guess['ref_context']))

    # gc_content is rounded to 3 decimal places
    self.assertTrue(fabs(float(guess['gc_content']) - (float(3)/float(11))) < .001,
                    "gc_content was not populated properly: " + str(guess['gc_content']))
def testSimpleRendering(self):
    """Render one mutation to a BED file and check the first output line.

    A mutation at 1:1000000-1000000 is expected to be written with
    chromosome "chr1", start "999999" and end "1000000".
    """
    m = MutationData()
    m.chr = '1'
    m.start = 1000000
    m.end = 1000000
    outputFilename = "out/simpleBEDTest.bed"
    outputRenderer = SimpleBedOutputRenderer(outputFilename)
    outputRenderer.renderMutations([m], Metadata())

    # Use a context manager so the file is closed even when an assertion
    # fails; open() replaces the Python 2-only file() builtin.
    with open(outputFilename, 'r') as fp:
        mOut = fp.readline().strip().split(' ')

    self.assertTrue(mOut[0] == "chr1")
    self.assertTrue(mOut[1] == "999999")
    self.assertTrue(mOut[2] == "1000000")
def test_hgvs_annotations_simple_SNP(self):
    """Test that HGVS annotations appear (incl. protein change) in a mutation.

    If they do, we believe that the Transcript objects are populated
    properly.  Uses the gencode v19 test datasource.
    """
    ds = TestUtils._create_test_gencode_v19_ds("out/test_hgvs_annotations_SNP_")

    # Now for a negative strand
    snp = MutationData()
    snp.chr = "22"
    snp.start = "22221730"
    snp.end = "22221730"
    snp.ref_allele = "T"
    snp.alt_allele = "G"
    snp.build = "hg19"

    annotated = ds.annotate_mutation(snp)

    # (annotation name, expected value) -- checked in this order.
    expected_hgvs = (
        ('HGVS_genomic_change', 'chr22.hg19:g.22221730T>G'),
        ('HGVS_coding_DNA_change', 'ENST00000215832.6:c.1A>C'),
        ('HGVS_protein_change', 'ENSP00000215832:p.Met1Leu'),
    )
    for annotation_name, expected_value in expected_hgvs:
        self.assertEqual(annotated.get(annotation_name, None), expected_value)
def testSimpleRendering(self):
    """Render one mutation to a BED file and check the first output line.

    A mutation at 1:1000000-1000000 is expected to be written with
    chromosome "chr1", start "999999" and end "1000000".
    """
    m = MutationData()
    m.chr = '1'
    m.start = 1000000
    m.end = 1000000
    outputFilename = "out/simpleBEDTest.bed"
    outputRenderer = SimpleBedOutputRenderer(outputFilename)
    outputRenderer.renderMutations([m], Metadata())

    # Use a context manager so the file is closed even when an assertion
    # fails; open() replaces the Python 2-only file() builtin.
    with open(outputFilename, 'r') as fp:
        mOut = fp.readline().strip().split(' ')

    self.assertTrue(mOut[0] == "chr1")
    self.assertTrue(mOut[1] == "999999")
    self.assertTrue(mOut[2] == "1000000")
def test_hgvs_annotations_simple_SNP(self):
    """Test that HGVS annotations appear (incl. protein change) in a mutation.

    If they do, we believe that the Transcript objects are populated properly.
    """
    ds = TestUtils._create_test_gencode_ds("out/test_hgvs_annotations_")

    # Now for a negative strand
    snp = MutationData()
    snp.chr = "22"
    snp.start = "22221730"
    snp.end = "22221730"
    snp.ref_allele = "T"
    snp.alt_allele = "G"
    snp.build = "hg19"

    annotated = ds.annotate_mutation(snp)

    # (annotation name, expected value) -- checked in this order.
    expected_hgvs = (
        ('HGVS_genomic_change', 'chr22.hg19:g.22221730T>G'),
        ('HGVS_coding_DNA_change', 'ENST00000215832.6:c.1A>C'),
        ('HGVS_protein_change', 'ENSP00000215832:p.Met1Leu'),
    )
    for annotation_name, expected_value in expected_hgvs:
        self.assertEqual(annotated.get(annotation_name, None), expected_value)
def testAKT1(self):
    """Check that this version of the GAF reports AKT1 for a user-supplied position.

    The position (chr14:105246407 G>A) was given by a website user.
    """
    query = MutationData()
    query.chr = '14'
    query.start = '105246407'
    query.end = '105246407'
    query.ref_allele = 'G'
    query.alt_allele = 'A'

    gaf_ds = TestUtils.createTranscriptProviderDatasource(self.config)
    result = gaf_ds.annotate_mutation(query)

    self.assertTrue(
        result['gene'] == "AKT1",
        "Incorrect gene found: " + result['gene'] +
        " If updating GAF, this may not be an error, but should be confirmed manually.")
def testBasicAnnotate(self):
    '''Test that the COSMIC datasource can be initialized with two index files (gp and gpp) and a simple annotation performed'''
    tabix_dir = "testdata/small_cosmic_with_gp_and_gpp/"
    gp_file = tabix_dir + "small_cosmic_trimmed_for_sorting.txt.tbi.gz"
    gpp_file = tabix_dir + "small_cosmic_trimmed_for_sorting.txt.tbi.byAA.sorted.tsv.gz"
    cosmic_ds = Cosmic(src_file=gp_file, title="Cosmic", version="test",
                       gpp_tabix_file=gpp_file)

    # These values are not taken from a real world scenario, but are cooked for this test.
    query = MutationData()
    query.createAnnotation("gene", "EGFR")
    query.createAnnotation("transcript_protein_position_start", "747")
    query.createAnnotation("transcript_protein_position_end", "747")
    query.chr = '7'
    query.start = '55259560'
    query.end = '55259560'

    annotated = cosmic_ds.annotate_mutation(query)
    overlap_count = annotated['COSMIC_n_overlapping_mutations']
    self.assertTrue(overlap_count == '2')
def test_no_mapping_file(self):
    """Test that an EnsemblDatasource can be created from scratch without a protein mapping.

    When no protein mapping is specified there is only limited HGVS support:
    the HGVS annotations should still appear, but the protein change is
    reported against an unknown protein sequence id.
    """
    ds = TestUtils._create_test_gencode_ds("out/test_hgvs_annotations_no_mapping_",
                                           protein_id_mapping_file=None)

    # Now for a negative strand
    snp = MutationData()
    snp.chr = "22"
    snp.start = "22221730"
    snp.end = "22221730"
    snp.ref_allele = "T"
    snp.alt_allele = "G"
    snp.build = "hg19"

    annotated = ds.annotate_mutation(snp)

    # (annotation name, expected value) -- checked in this order.
    expected_hgvs = (
        ('HGVS_genomic_change', 'chr22.hg19:g.22221730T>G'),
        ('HGVS_coding_DNA_change', 'ENST00000215832.6:c.1A>C'),
        ('HGVS_protein_change', 'unknown_prot_seq_id:p.Met1Leu'),
    )
    for annotation_name, expected_value in expected_hgvs:
        self.assertEqual(annotated.get(annotation_name, None), expected_value)
def testMC1R(self):
    """Check the gene this version of the GAF reports for chr16:89985913 G>A.

    The test currently asserts TUBB3; at some point MC1R is expected instead.
    """
    query = MutationData()
    query.chr = '16'
    query.start = '89985913'
    query.end = '89985913'
    query.ref_allele = 'G'
    query.alt_allele = 'A'

    gaf_ds = TestUtils.createTranscriptProviderDatasource(self.config)
    result = gaf_ds.annotate_mutation(query)

    # At some point, we would expect this to be MC1R, not TUBB3
    self.assertTrue(
        result['gene'] == "TUBB3",
        "Incorrect gene found: " + result['gene'] +
        " If updating GAF, this may not be an error, but should be confirmed manually.")
def testSimpleAnnotate(self):
    ''' Perform a simple test of one of the aligned chromosomes (chr22).

    Makes sure that we get a reasonable ref_context and gc_content.
    '''
    ref_ds = ReferenceDatasource('testdata/reference_ds', windowSizeGCContent=5)

    position = MutationData()
    position.chr = "22"
    position.start = "11"
    position.end = "11"

    expected_context = "CCCAAGCTAAACCCAGGCCAC"
    annotated = ref_ds.annotate_mutation(position)

    context_msg = "ref_context was not populated properly: " + str(annotated['ref_context'])
    self.assertTrue(annotated['ref_context'] == expected_context, context_msg)

    # gc_content is rounded to 3 decimal places
    gc_error = fabs(float(annotated['gc_content']) - (float(6)/float(11)))
    gc_msg = "gc_content was not populated properly: " + str(annotated['gc_content'])
    self.assertTrue(gc_error < .001, gc_msg)
def test_no_mapping_file(self):
    """Test that an EnsemblDatasource can be created from scratch without a protein mapping.

    When no protein mapping is specified there is only limited HGVS support:
    the HGVS annotations should still appear, but the protein change is
    reported against an unknown protein sequence id.  Uses the gencode v19
    test datasource.
    """
    ds = TestUtils._create_test_gencode_v19_ds("out/test_hgvs_annotations_no_mapping_file_",
                                               protein_id_mapping_file=None)

    # Now for a negative strand
    snp = MutationData()
    snp.chr = "22"
    snp.start = "22221730"
    snp.end = "22221730"
    snp.ref_allele = "T"
    snp.alt_allele = "G"
    snp.build = "hg19"

    annotated = ds.annotate_mutation(snp)

    # (annotation name, expected value) -- checked in this order.
    expected_hgvs = (
        ('HGVS_genomic_change', 'chr22.hg19:g.22221730T>G'),
        ('HGVS_coding_DNA_change', 'ENST00000215832.6:c.1A>C'),
        ('HGVS_protein_change', 'unknown_prot_seq_id:p.Met1Leu'),
    )
    for annotation_name, expected_value in expected_hgvs:
        self.assertEqual(annotated.get(annotation_name, None), expected_value)
def test_protein_position_off_by_one(self, chrom, start, end, ref, alt, gt_prot_change):
    """Check the protein change of a variant annotated with custom canonical transcripts.

    The custom canonical transcript list is read from
    testdata/tx_exact_uniprot_matches.txt, one entry per line, keeping only
    the text before the last "." of each line (presumably dropping the
    transcript version suffix -- confirm against the data file).

    :param chrom: chromosome of the variant
    :param start: start position of the variant
    :param end: end position of the variant
    :param ref: reference allele
    :param alt: alternate allele
    :param gt_prot_change: expected value of the protein_change annotation
    """
    config = TestUtils.createUnitTestConfig()
    transcript_ds = TestUtils.createTranscriptProviderDatasource(config)

    # Use a context manager so the file is closed even if reading fails;
    # open() replaces the Python 2-only file() builtin.
    with open("testdata/tx_exact_uniprot_matches.txt", 'r') as cc_txs_fp:
        cc_txs = [tx.rsplit(".", 1)[0] for tx in cc_txs_fp]
    cc_txs.append("ENST00000338368")  # Add a transcript that is not exactly the same, but close
    transcript_ds.set_custom_canonical_txs(cc_txs)

    m = MutationData()
    m.chr = chrom
    m.start = start
    m.end = end
    m.ref_allele = ref
    m.alt_allele = alt

    m2 = transcript_ds.annotate_mutation(m)

    self.assertEqual(m2['protein_change'], gt_prot_change)