def testCreationAndAnnotation(self):
        """Create a gene-protein-position datasource and run a simple annotation.

        A tiny maflite input is annotated with the transcript datasource and
        the simple UniProt natural-variation datasource.  The rendered TSV
        must exist and contain exactly one row carrying the expected
        natural-variation values.
        """
        outputFilename = 'out/genericGeneProteinPositionTest.out.tsv'
        colName = "UniProt_NatVar_natural_variations"
        # The pipe-separated values are compared as sorted lists, so their
        # order in the output does not matter.
        expectedVariations = sorted("R -> RR (in EDMD2).|R -> Q (in EDMD2).".split("|"))

        gafDS = TestUtils.createTranscriptProviderDatasource(self.config)
        gppDS = DatasourceFactory.createDatasource(
            "testdata/simple_uniprot_natvar/simple_uniprot_natvar.config",
            "testdata/simple_uniprot_natvar/")

        annotator = Annotator()
        annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/tiny_maflite_natvar.maf.tsv'))
        annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
        annotator.addDatasource(gafDS)
        annotator.addDatasource(gppDS)
        testFilename = annotator.annotate()

        # Make sure that some values were populated
        self.assertTrue(os.path.exists(testFilename),
                        "Output file was not created: " + str(testFilename))
        tsvReader = GenericTsvReader(testFilename)

        ctr = 0
        for lineDict in tsvReader:
            # assertEqual reports both lists on failure, unlike assertTrue(a == b).
            self.assertEqual(sorted(lineDict[colName].split("|")), expectedVariations,
                             "Annotation value did not match: " + lineDict[colName])
            ctr += 1

        self.assertEqual(ctr, 1, "Number of mutations incorrect (1): " + str(ctr))
    def testRealWorld(self):
        """Test that the full COSMIC datasource can retrieve entries by both gp and gpp.

        Two chr1 SNPs are annotated first with the transcript datasource and
        then with the COSMIC datasource.
        """
        gafDS = TestUtils.createTranscriptProviderDatasource(self.config)
        cosmicDS = TestUtils.createCosmicDatasource(self.config)

        # These values are not taken from a real world scenario, but are cooked for this test.
        m = MutationData()
        m.chr = '1'
        m.start = '12941796'
        m.end = '12941796'
        m.ref_allele = "G"
        m.alt_allele = "T"
        m = gafDS.annotate_mutation(m)
        m = cosmicDS.annotate_mutation(m)

        # assertEqual shows the actual count on failure.
        self.assertEqual(m['COSMIC_n_overlapping_mutations'], '0')

        # chr1:150483621-150483621
        m = MutationData()
        m.chr = '1'
        m.start = '150483621'
        m.end = '150483621'
        m.ref_allele = "G"
        m.alt_allele = "T"
        m = gafDS.annotate_mutation(m)
        m = cosmicDS.annotate_mutation(m)
        # NOTE(review): nothing is asserted for this second mutation in the
        # code visible here -- it only verifies that annotation does not raise.
Beispiel #3
0
    def testSpliceSiteWithinNBases(self):
        """Test that a silent mutation is changed to splice site w/in 10 bases of a splice site.

        One SNP is annotated per position of the 21-base window and no
        position may be classified as Silent.  The four positions around the
        junction (indices 8-11) must be Splice_Site; the checked positions on
        either side must not be.
        """
        # chr21:10,998,326-10,998,346
        # 10,998,336 is a splice site.  (Junction between 10998335 and 336)
        # AGTTCTCCTT C TGGAAAAAAG
        refs = 'AGTTCTCCTTCTGGAAAAAAG'
        alts = 'TCAGACTGAAAATACCCCCCT'
        windowStart = 10998326
        gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)
        vcs = []
        for s in range(windowStart, 10998347):
            m = MutationDataFactory.default_create()
            m.start = str(s)
            m.end = str(s)
            m.chr = "21"
            m.ref_allele = refs[s - windowStart]
            m.alt_allele = alts[s - windowStart]

            m = gafDatasource.annotate_mutation(m)

            vc = m['variant_classification']
            self.assertNotEqual(vc, 'Silent', 'Silent mutation found when it should be a splice site.')

            vcs.append(vc)
            # Single-argument print() behaves identically on Python 2 and 3.
            print(vc + "  " + m.start)

        self.assertTrue(all(tmp == "Splice_Site" for tmp in vcs[8:12]), "Not all vcs within 2 bases were splice site: " + str(vcs[8:12]))
        self.assertTrue(all(tmp != "Splice_Site" for tmp in vcs[0:8]), "No splice sites should be seen: " + str(vcs[0:8]))
        # NOTE(review): vcs has 21 entries, so the slice [12:20] leaves the
        # final position (index 20) unchecked -- confirm whether intentional.
        self.assertTrue(all(tmp != "Splice_Site" for tmp in vcs[12:20]), "No splice sites should be seen: " + str(vcs[12:20]))
Beispiel #4
0
    def testSilentMutationGoingToSpliceSite(self):
        """Test that a silent mutation within 10 bp of a splice junction should become a splice site.

        One SNP is annotated per position of the 26-base window; the test then
        checks the total number of Splice_Site (4) and Silent (11)
        classifications, compared case-insensitively.
        """
        # chr1:28,233,780-28,233,805 Junction is at chr1:28,233,793 & 94
        refs = "TGGGCTCGGGCTCTCTGAAAAGAAAA"
        alts = "TGGGCTCAGGCTCGCTGAAAAGAAAA"
        gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)
        numSpliceSites = 0
        numSilent = 0
        startWindow = 28233780
        for s in range(startWindow, 28233806):
            m = MutationDataFactory.default_create()
            m.start = str(s)
            m.end = str(s)
            m.chr = "1"
            m.ref_allele = refs[s - startWindow]
            m.alt_allele = alts[s - startWindow]

            m = gafDatasource.annotate_mutation(m)

            # Only used for the diagnostic output below.
            distanceFromSpliceSite = abs(28233793 - int(m.start))
            vc = m['variant_classification']
            # No per-position "not Silent" check here: the totals asserted
            # below expect 11 Silent classifications in this window.

            if vc.lower() == "splice_site":
                numSpliceSites += 1
            if vc.lower() == "silent":
                numSilent += 1
            # Single-argument print() behaves identically on Python 2 and 3.
            print(vc + "  " + m.start + "  " + str(distanceFromSpliceSite))

        self.assertEqual(numSpliceSites, 4, "Should have seen 4 splice site mutations, but saw: " + str(numSpliceSites))
        self.assertEqual(numSilent, 11, "Should have seen 11 Silent mutations, but saw: " + str(numSilent))
Beispiel #5
0
    def testFlank(self):
        """Test that we can see a Flank mutation.

        Annotates one SNP per position of a 26-base window on chr1 and prints
        each variant classification.

        NOTE(review): no assertion is made on the classifications, so this
        test only fails if annotation raises.
        """
        # NOTE(review): the original comment here gave the window as
        # chr1:28,233,780-28,233,805 (junction at 28,233,793 & 94), which does
        # not match startWindow below -- presumably copied from another test.
        refs = "TGGGCTCGGGCTCTCTGAAAAGAAAA"
        alts = "TGGGCTCAGGCTCTCTGAAAAGAAAA"
        gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)
        startWindow = 11042200
        for s in range(startWindow, startWindow + len(refs)):
            m = MutationDataFactory.default_create()
            m.start = str(s)
            m.end = str(s)
            m.chr = "1"
            m.ref_allele = refs[s - startWindow]
            m.alt_allele = alts[s - startWindow]

            m = gafDatasource.annotate_mutation(m)

            vc = m['variant_classification']
            # Single-argument print() behaves identically on Python 2 and 3.
            print(vc + "  " + m.start)
Beispiel #6
0
    def test_effect_tx_mode(self):
        """Check that the transcript mode changes the reported variant classification.

        The same chr2 SNP is annotated once in best-effect mode and once in
        canonical mode; the gene must be PNKD in both, while the variant
        classification differs between the modes.
        """
        gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)

        # Canonical mutation was Intron
        # (tx mode, expected variant classification, failure message or None)
        expectations = [
            (TranscriptProvider.TX_MODE_BEST_EFFECT, "Missense_Mutation", None),
            (TranscriptProvider.TX_MODE_CANONICAL, "Intron",
             "Canonical no longer is Intron.  This test is no longer valid.  This failure can come up when changing the GAF datasource."),
        ]
        for txMode, expectedVc, failureMsg in expectations:
            gafDatasource.set_tx_mode(txMode)
            m = MutationDataFactory.default_create()
            m.chr = '2'
            m.start = '219137340'
            m.end = '219137340'
            m.ref_allele = 'G'
            m.alt_allele = 'T'
            m = gafDatasource.annotate_mutation(m)
            # assertEqual reports the actual value on failure.
            self.assertEqual(m['gene'], "PNKD")
            self.assertEqual(m['variant_classification'], expectedVc, failureMsg)
    def test_effect_tx_mode(self):
        """Check that the transcript mode changes the reported variant classification.

        The same chr2 SNP is annotated once in best-effect mode and once in
        canonical mode; the gene must be PNKD in both, while the variant
        classification differs between the modes.
        """
        gafDatasource = TestUtils.createTranscriptProviderDatasource(
            self.config)

        # Canonical mutation was Intron
        # (tx mode, expected variant classification, failure message or None)
        expectations = [
            (TranscriptProvider.TX_MODE_BEST_EFFECT, "Missense_Mutation", None),
            (TranscriptProvider.TX_MODE_CANONICAL, "Intron",
             "Canonical no longer is Intron.  This test is no longer valid.  This failure can come up when changing the GAF datasource."),
        ]
        for txMode, expectedVc, failureMsg in expectations:
            gafDatasource.set_tx_mode(txMode)
            m = MutationDataFactory.default_create()
            m.chr = '2'
            m.start = '219137340'
            m.end = '219137340'
            m.ref_allele = 'G'
            m.alt_allele = 'T'
            m = gafDatasource.annotate_mutation(m)
            # assertEqual reports the actual value on failure.
            self.assertEqual(m['gene'], "PNKD")
            self.assertEqual(
                m['variant_classification'], expectedVc, failureMsg)
    def test_denovo(self):
        """GAF de novo test.

        Annotates three mutations on chr22, each starting at 22221735 with an
        empty reference allele, and checks the de novo start classification
        reported for each.
        """
        gafDatasource = TestUtils.createTranscriptProviderDatasource(
            self.config)

        # (end position, alt allele, expected variant classification)
        cases = [
            (22221737, 'CAT', 'De_novo_Start_OutOfFrame'),
            (22221740, 'AACATAA', 'De_novo_Start_OutOfFrame'),
            (22221739, 'ACATAA', 'De_novo_Start_InFrame'),
        ]
        for endPos, altAllele, expectedVc in cases:
            m = MutationDataFactory.default_create()
            m.start = str(22221735)
            m.end = str(endPos)
            m.chr = "22"
            m.ref_allele = ''
            m.alt_allele = altAllele
            m = gafDatasource.annotate_mutation(m)
            # The message identifies which case failed.
            self.assertEqual(
                m['variant_classification'], expectedVc,
                "Unexpected classification for alt allele " + altAllele)
Beispiel #9
0
    def test_denovo(self):
        """GAF de novo test.

        Annotates three mutations on chr22, each starting at 22221735 with an
        empty reference allele, and checks the de novo start classification
        reported for each.
        """
        gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)

        # (end position, alt allele, expected variant classification)
        cases = [
            (22221737, 'CAT', 'De_novo_Start_OutOfFrame'),
            (22221740, 'AACATAA', 'De_novo_Start_OutOfFrame'),
            (22221739, 'ACATAA', 'De_novo_Start_InFrame'),
        ]
        for endPos, altAllele, expectedVc in cases:
            m = MutationDataFactory.default_create()
            m.start = str(22221735)
            m.end = str(endPos)
            m.chr = "22"
            m.ref_allele = ''
            m.alt_allele = altAllele
            m = gafDatasource.annotate_mutation(m)
            # The message identifies which case failed.
            self.assertEqual(m['variant_classification'], expectedVc,
                             "Unexpected classification for alt allele " + altAllele)
Beispiel #10
0
    def testFlank(self):
        """Test that we can see a Flank mutation.

        Annotates one SNP per position of a 26-base window on chr1 and prints
        each variant classification.

        NOTE(review): no assertion is made on the classifications, so this
        test only fails if annotation raises.
        """
        # NOTE(review): the original comment here gave the window as
        # chr1:28,233,780-28,233,805 (junction at 28,233,793 & 94), which does
        # not match startWindow below -- presumably copied from another test.
        refs = "TGGGCTCGGGCTCTCTGAAAAGAAAA"
        alts = "TGGGCTCAGGCTCTCTGAAAAGAAAA"
        gafDatasource = TestUtils.createTranscriptProviderDatasource(
            self.config)
        startWindow = 11042200
        for s in range(startWindow, startWindow + len(refs)):
            m = MutationDataFactory.default_create()
            m.start = str(s)
            m.end = str(s)
            m.chr = "1"
            m.ref_allele = refs[s - startWindow]
            m.alt_allele = alts[s - startWindow]

            m = gafDatasource.annotate_mutation(m)

            vc = m['variant_classification']
            # Single-argument print() behaves identically on Python 2 and 3.
            print(vc + "  " + m.start)
Beispiel #11
0
    def testBasicAnnotation(self):
        """Test annotation from a generic TSV based on a transcript annotation.

        Only confirms that the expected headers appear in the rendered output.
        """
        # We need a gaf data source to annotate gene
        gafDatasource = TestUtils.createTranscriptProviderDatasource(
            config=self.config)
        transcriptDS = DatasourceFactory.createDatasource(
            "testdata/small_transcript_tsv_ds/small_transcript_tsv_ds.config",
            "testdata/small_transcript_tsv_ds/")
        outputFilename = 'out/genericTranscriptTest.out.tsv'

        annotator = Annotator()
        annotator.setInputCreator(
            MafliteInputMutationCreator(
                'testdata/maflite/Patient0.snp.maf.txt'))
        annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
        annotator.addDatasource(gafDatasource)
        annotator.addDatasource(transcriptDS)
        # The filename returned by annotate() is the one read back below.
        outputFilename = annotator.annotate()

        tsvReader = GenericTsvReader(outputFilename)
        headers = tsvReader.getFieldNames()
        for expectedHeader in ("refseq_test_mRNA_Id", "refseq_test_prot_Id"):
            self.assertIn(
                expectedHeader, headers,
                expectedHeader + " not found in headers: " + str(headers))
    def testRealWorld(self):
        """Test that the full COSMIC datasource can retrieve entries by both gp and gpp.

        Two chr1 SNPs are annotated first with the transcript datasource and
        then with the COSMIC datasource.
        """
        gafDS = TestUtils.createTranscriptProviderDatasource(self.config)
        cosmicDS = TestUtils.createCosmicDatasource(self.config)

        # These values are not taken from a real world scenario, but are cooked for this test.
        m = MutationDataFactory.default_create()
        m.chr = '1'
        m.start = '12941796'
        m.end = '12941796'
        m.ref_allele = "G"
        m.alt_allele = "T"
        m = gafDS.annotate_mutation(m)
        m = cosmicDS.annotate_mutation(m)

        # assertEqual shows the actual count on failure.
        self.assertEqual(m['COSMIC_n_overlapping_mutations'], '0')

        # chr1:150483621-150483621
        m = MutationDataFactory.default_create()
        m.chr = '1'
        m.start = '150483621'
        m.end = '150483621'
        m.ref_allele = "G"
        m.alt_allele = "T"
        m = gafDS.annotate_mutation(m)
        m = cosmicDS.annotate_mutation(m)
        # NOTE(review): nothing is asserted for this second mutation in the
        # code visible here -- it only verifies that annotation does not raise.
    def testCreationAndAnnotation(self):
        """Create a gene-protein-position datasource and run a simple annotation.

        A tiny maflite input is annotated with the transcript datasource and
        the simple UniProt natural-variation datasource.  The rendered TSV
        must exist and contain exactly one row carrying the expected
        natural-variation values.
        """
        outputFilename = 'out/genericGeneProteinPositionTest.out.tsv'
        colName = "UniProt_NatVar_natural_variations"
        # The pipe-separated values are compared as sorted lists, so their
        # order in the output does not matter.
        expectedVariations = sorted("R -> RR (in EDMD2).|R -> Q (in EDMD2).".split("|"))

        gafDS = TestUtils.createTranscriptProviderDatasource(self.config)
        gppDS = DatasourceFactory.createDatasource(
            "testdata/simple_uniprot_natvar/simple_uniprot_natvar.config",
            "testdata/simple_uniprot_natvar/")

        annotator = Annotator()
        annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/tiny_maflite_natvar.maf.tsv'))
        annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
        annotator.addDatasource(gafDS)
        annotator.addDatasource(gppDS)
        testFilename = annotator.annotate()

        # Make sure that some values were populated
        self.assertTrue(os.path.exists(testFilename),
                        "Output file was not created: " + str(testFilename))
        tsvReader = GenericTsvReader(testFilename)

        ctr = 0
        for lineDict in tsvReader:
            # assertEqual reports both lists on failure, unlike assertTrue(a == b).
            self.assertEqual(sorted(lineDict[colName].split("|")), expectedVariations,
                             "Annotation value did not match: " + lineDict[colName])
            ctr += 1

        self.assertEqual(ctr, 1, "Number of mutations incorrect (1): " + str(ctr))
Beispiel #14
0
 def testExonRetrievalForGene(self):
     """Make sure that the GAF datasource can retrieve exons, given a gene.

     Only checks that a non-None result comes back for each test gene; the
     retrieved exons are printed for manual inspection.
     """
     gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)
     for gene in ('CEBPA', 'BRCA1', 'PIK3CA'):
         exons = gafDatasource.retrieveExons(gene, isCodingOnly=True)
         # Name the gene so a failure points at the offending lookup.
         self.assertIsNotNone(exons, "No exon result returned for gene: " + gene)
         print(str(exons))
Beispiel #15
0
 def testExonRetrievalForGene(self):
     """Make sure that the GAF datasource can retrieve exons, given a gene.

     Only checks that a non-None result comes back for each test gene; the
     retrieved exons are printed for manual inspection.
     """
     gafDatasource = TestUtils.createTranscriptProviderDatasource(
         self.config)
     for gene in ('CEBPA', 'BRCA1', 'PIK3CA'):
         exons = gafDatasource.retrieveExons(gene, isCodingOnly=True)
         # Name the gene so a failure points at the offending lookup.
         self.assertIsNotNone(
             exons, "No exon result returned for gene: " + gene)
         print(str(exons))
Beispiel #16
0
    def testChangeInTxModeChangesHashcode(self):
        """Test that a change in the tx-mode will change the hashcode.

        Reads the datasource hashcode in best-effect mode and again in
        canonical mode; the two values must differ.
        """
        gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)

        gafDatasource.set_tx_mode(TranscriptProvider.TX_MODE_BEST_EFFECT)
        bestEffectHashcode = gafDatasource.get_hashcode()
        gafDatasource.set_tx_mode(TranscriptProvider.TX_MODE_CANONICAL)
        canonicalHashcode = gafDatasource.get_hashcode()
        # assertNotEqual prints the offending value when the hashcodes collide.
        self.assertNotEqual(bestEffectHashcode, canonicalHashcode)
Beispiel #17
0
 def testVersionHeader(self):
     """Check that building the annotator's header string does not raise.

     Minimal checking that the returned string is actually correct: it must
     mention either a Gaf or a GENCODE version, and the word Oncotator.
     Does not attempt to initialize input or output.  Only a gaf datasource.
     """
     annotator = Annotator()
     annotator.addDatasource(TestUtils.createTranscriptProviderDatasource(self.config))
     headerString = annotator.createHeaderString()
     self.assertTrue("Gaf " in headerString or "GENCODE" in headerString, "Could not find Gaf or GENCODE version in header string.")
     self.assertIn("Oncotator", headerString, "Could not find the word Oncotator in header string.")
    def testBasicDatasourceSorting(self):
        """Test that the GAF datasource is sorted before a gene-based datasource.

        Hands the factory a list in the wrong order (gene datasource first)
        and expects the gene datasource to come second after sorting.
        """
        gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)
        geneDS = DatasourceFactory.createDatasource("testdata/small_tsv_ds/small_tsv_ds.config", "testdata/small_tsv_ds/")

        incorrectSortList = [geneDS, gafDatasource]
        guessSortList = DatasourceFactory.sortDatasources(incorrectSortList)
        # Check the length first so a dropped datasource is reported as a test
        # failure instead of an IndexError on the lookup below.
        self.assertEqual(len(guessSortList), 2, "Sorting altered number of datasources (gt: 2): " + str(len(guessSortList)))
        self.assertEqual(guessSortList[1], geneDS, "Sorting is incorrect.")
Beispiel #19
0
 def testNoUnknownAnnotations(self):
     """Make sure that the gaf 3.0 datasource does not annotate anything with source set to Unknown.

     Every mutation from the Patient0 maflite file is annotated, validated,
     and checked for an empty list of unknown annotations.
     """
     inputCreator = MafliteInputMutationCreator('testdata/maflite/Patient0.snp.maf.txt')
     gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)
     for m in inputCreator.createMutations():
         m = gafDatasource.annotate_mutation(m)
         MutUtils.validateMutation(m)
         unknownAnnotations = MutUtils.getUnknownAnnotations(m)
         self.assertEqual(len(unknownAnnotations), 0, "Unknown annotations exist in mutation: " + str(unknownAnnotations))
Beispiel #20
0
 def testVersionHeader(self):
     """Check that building the annotator's header string does not raise.

     Minimal checking that the returned string is actually correct: it must
     mention either a Gaf or a GENCODE version, and the word Oncotator.
     Does not attempt to initialize input or output.  Only a gaf datasource.
     """
     annotator = Annotator()
     annotator.addDatasource(TestUtils.createTranscriptProviderDatasource(self.config))
     headerString = annotator.createHeaderString()
     self.assertTrue("Gaf " in headerString or "GENCODE" in headerString, "Could not find Gaf or GENCODE version in header string.")
     self.assertIn("Oncotator", headerString, "Could not find the word Oncotator in header string.")
Beispiel #21
0
    def testChangeInTxModeChangesHashcode(self):
        """Test that a change in the tx-mode will change the hashcode.

        Reads the datasource hashcode in best-effect mode and again in
        canonical mode; the two values must differ.
        """
        gafDatasource = TestUtils.createTranscriptProviderDatasource(
            self.config)

        gafDatasource.set_tx_mode(TranscriptProvider.TX_MODE_BEST_EFFECT)
        bestEffectHashcode = gafDatasource.get_hashcode()
        gafDatasource.set_tx_mode(TranscriptProvider.TX_MODE_CANONICAL)
        canonicalHashcode = gafDatasource.get_hashcode()
        # assertNotEqual prints the offending value when the hashcodes collide.
        self.assertNotEqual(bestEffectHashcode, canonicalHashcode)
    def testBasicAnnotation(self):
        """Annotate from a basic tsv gene file.

        Use the Gaf to annotate before trying the tsv -- required since the
        gene annotation must be populated.  Using trimmed CancerGeneCensus as
        basis for this test.

        Checks that the CGC columns appear in the output header and that every
        row whose gene is in the known list has a non-empty
        'CGC_Abridged_GeneID'.  At least one such row must exist, otherwise
        the input data cannot exercise the check.
        """

        # cut -f 1 oncotator/test/testdata/small_tsv_ds/CancerGeneCensus_Table_1_full_2012-03-15_trim.txt | egrep -v Symbol | sed -r "s/^/'/g" | sed ':a;N;$!ba;s/\n/,/g' | sed -r "s/,'/','/g"
        genesAvailable = [
            'ABL1', 'ABL2', 'ACSL3', 'AF15Q14', 'AF1Q', 'AF3p21', 'AF5q31',
            'AKAP9', 'AKT1', 'AKT2', 'ALDH2', 'ALK', 'ALO17', 'APC',
            'ARHGEF12', 'ARHH', 'ARID1A', 'ARID2', 'ARNT', 'ASPSCR1', 'ASXL1',
            'ATF1', 'ATIC', 'ATM', 'ATRX', 'BAP1', 'BCL10', 'BCL11A', 'BCL11B'
        ]

        # We need a gaf data source to annotate gene

        gafDatasource = TestUtils.createTranscriptProviderDatasource(
            config=self.config)
        geneDS = DatasourceFactory.createDatasource(
            "testdata/small_tsv_ds/small_tsv_ds.config",
            "testdata/small_tsv_ds/")
        outputFilename = 'out/genericGeneTest.out.tsv'

        annotator = Annotator()
        annotator.setInputCreator(
            MafliteInputMutationCreator(
                'testdata/maflite/Patient0.snp.maf.txt'))
        annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
        annotator.addDatasource(gafDatasource)
        annotator.addDatasource(geneDS)
        annotator.annotate()

        # Check that there were actual annotations performed.
        tsvReader = GenericTsvReader(outputFilename)

        fields = tsvReader.getFieldNames()
        # The message now names the column that is actually checked.
        self.assertIn(
            'CGC_Abridged_Other Syndrome/Disease', fields,
            "'CGC_Abridged_Other Syndrome/Disease' was not present in the header")
        self.assertIn(
            'CGC_Abridged_Mutation Type', fields,
            "'CGC_Abridged_Mutation Type' was not present in the header")

        linesThatShouldBeAnnotated = 0
        # Line numbers are 1-based in the failure message.
        for lineNumber, lineDict in enumerate(tsvReader, 1):
            self.assertIn('gene', lineDict)
            if lineDict['gene'] in genesAvailable:
                self.assertNotEqual(
                    lineDict['CGC_Abridged_GeneID'], '',
                    "'CGC_Abridged_GeneID' was missing on a row that should have been populated.  Line: "
                    + str(lineNumber))
                linesThatShouldBeAnnotated += 1
        self.assertGreater(linesThatShouldBeAnnotated, 0,
                           "Bad data -- cannot test missed detects.")
Beispiel #23
0
    def test_start_codon(self):
        """Test a start codon hit in a GAF datasource.

        Annotates an A>T SNP at chr22:22221729 and expects the missense
        variant classification.
        """
        gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)

        m = MutationDataFactory.default_create()
        m.start = str(22221729)
        m.end = str(22221729)
        m.chr = "22"
        m.ref_allele = 'A'
        m.alt_allele = 'T'
        m = gafDatasource.annotate_mutation(m)
        # assertEqual reports the actual classification on failure.
        self.assertEqual(m['variant_classification'], VariantClassification.MISSENSE)
Beispiel #24
0
 def testAKT1(self):
     """Test that this version of the GAF produces the up to date gene for a position given from a website user.

     Annotates a G>A SNP at chr14:105246407 and expects the gene AKT1.
     """
     m = MutationDataFactory.default_create()
     m.chr = '14'
     m.start = '105246407'
     m.end = '105246407'
     m.ref_allele = 'G'
     m.alt_allele = 'A'
     gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)
     m = gafDatasource.annotate_mutation(m)
     self.assertEqual(m['gene'], "AKT1", "Incorrect gene found: " + m['gene'] + "  If updating GAF, this may not be an error, but should be confirmed manually.")
Beispiel #25
0
 def testChrGLs(self):
     """Test that mutations on unaligned transcripts can be annotated properly.  I.e. when chromosome = GL.....

     Any exception raised while annotating or validating a mutation is
     reported as a test failure, with the traceback in the message.  Every
     annotated mutation must carry a non-empty gene.
     """
     inputCreator = MafliteInputMutationCreator('testdata/maflite/chrGLs.maf.tsv', "configs/maflite_input.config")
     gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)
     for m in inputCreator.createMutations():
         try:
             m = gafDatasource.annotate_mutation(m)
             MutUtils.validateMutation(m)
         except Exception as e:
             # Deliberately broad: any error here should fail the test
             # rather than surface as an unexplained test error.
             self.fail("Erroneous exception was thrown: " + str(e) + "\n" + traceback.format_exc())
         self.assertNotEqual(m['gene'], '')
Beispiel #26
0
    def testMC1R(self):
        """Test that this version of the GAF produces a MC1R, instead of TUBB gene.

        Annotates a G>A SNP at chr16:89985913.  The assertion currently pins
        the gene to TUBB3 (see the comment at the assertion).
        """
        m = MutationDataFactory.default_create()
        m.chr = '16'
        m.start = '89985913'
        m.end = '89985913'
        m.ref_allele = 'G'
        m.alt_allele = 'A'
        gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)
        m = gafDatasource.annotate_mutation(m)

        # At some point, we would expect this to be MC1R, not TUBB3
        self.assertEqual(m['gene'], "TUBB3", "Incorrect gene found: " + m['gene'] + "  If updating GAF, this may not be an error, but should be confirmed manually.")
Beispiel #27
0
 def testExonRetrievalForGenesFromFile(self):
     """Make sure that the GAF datasource can retrieve exons, given a list of genes from a simple file.

     Reads one gene per line, writes the first four fields of each retrieved
     exon as a tab-separated row to the output file, and records genes with
     no exons in the error file.

     NOTE(review): no assertions are made; the test only fails if a lookup
     or a file operation raises.
     """
     # open() in a with-block works on Python 2 and 3 and closes all three
     # handles even if a lookup raises.
     with open('testdata/testGeneList.txt', 'r') as inputGeneList, \
             open("out/testGeneListExons.txt", 'w') as outputFileFP, \
             open("out/testGeneListExons.err.txt", 'w') as errorFileFP:
         gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)
         for line in inputGeneList:
             gene = line.strip()
             exons = gafDatasource.retrieveExons(gene, isCodingOnly=True)
             if len(exons) == 0:
                 errorFileFP.write("Could not locate " + gene + "\n")
             for e in exons:
                 outputFileFP.write('%s\t%s\t%s\t%s\n' % (e[0], e[1], e[2], e[3]))
Beispiel #28
0
    def testNoLostMutations(self):
        """Does a simple gaf datasource annotation run and makes sure that no mutations were lost.

        The expected count is the number of lines in the input file minus one
        (presumably the header line -- confirm against the maflite format).
        """
        inputFilename = 'testdata/maflite/Patient0.snp.maf.txt'
        inputCreator = MafliteInputMutationCreator(inputFilename, "configs/maflite_input.config")
        gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)

        # open() in a with-block works on Python 2 and 3 and closes the handle.
        with open(inputFilename, 'r') as inputFP:
            numMutsInput = len(inputFP.readlines()) - 1

        ctr = 0
        for m in inputCreator.createMutations():
            m = gafDatasource.annotate_mutation(m)
            MutUtils.validateMutation(m)
            ctr += 1
        self.assertEqual(ctr, numMutsInput, "Gaf data source altered mutation count.")
Beispiel #29
0
    def test_start_codon(self):
        """Test a start codon hit in a GAF datasource.

        Annotates an A>T SNP at chr22:22221729 and expects the missense
        variant classification.
        """
        gafDatasource = TestUtils.createTranscriptProviderDatasource(
            self.config)

        m = MutationDataFactory.default_create()
        m.start = str(22221729)
        m.end = str(22221729)
        m.chr = "22"
        m.ref_allele = 'A'
        m.alt_allele = 'T'
        m = gafDatasource.annotate_mutation(m)
        # assertEqual reports the actual classification on failure.
        self.assertEqual(
            m['variant_classification'], VariantClassification.MISSENSE)
Beispiel #30
0
    def testFlank2(self):
        """Test a second real-world flank scenario.

        Annotates a C>T SNP at chr1:228646357 and expects gene HIST3H2A with
        a 5'Flank variant classification.
        """
        gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)

        # 1	228646357 nearest Gene=HIST3H2A C>T
        m = MutationDataFactory.default_create()
        m.start = str(228646357)
        m.end = str(228646357)
        m.chr = "1"
        m.ref_allele = 'C'
        m.alt_allele = 'T'
        m = gafDatasource.annotate_mutation(m)

        self.assertEqual(m['gene'], "HIST3H2A", "Wrong gene (GT: HIST3H2A): " + m['gene'] + "   -- if updating GAF, this test may fail as this gene may not be appropriate.")
        self.assertEqual(m['variant_classification'], "5'Flank", "Should be 5'Flank, but was " + m['variant_classification'] + " -- if updating GAF, this test may fail as this test is data specific.  Also, this may fail if padding parameters are changed.")
Beispiel #31
0
 def testNoUnknownAnnotations(self):
     """Make sure that the gaf 3.0 datasource does not annotate anything with source set to Unknown.

     Every mutation from the Patient0 maflite file is annotated, validated,
     and checked for an empty list of unknown annotations.
     """
     inputCreator = MafliteInputMutationCreator(
         'testdata/maflite/Patient0.snp.maf.txt')
     gafDatasource = TestUtils.createTranscriptProviderDatasource(
         self.config)
     for m in inputCreator.createMutations():
         m = gafDatasource.annotate_mutation(m)
         MutUtils.validateMutation(m)
         unknownAnnotations = MutUtils.getUnknownAnnotations(m)
         self.assertEqual(
             len(unknownAnnotations), 0,
             "Unknown annotations exist in mutation: " +
             str(unknownAnnotations))
Beispiel #32
0
 def testExonRetrievalForGenesFromFile(self):
     """Make sure that the GAF datasource can retrieve exons, given a list of genes from a simple file.

     Reads one gene per line, writes the first four fields of each retrieved
     exon as a tab-separated row to the output file, and records genes with
     no exons in the error file.

     NOTE(review): no assertions are made; the test only fails if a lookup
     or a file operation raises.
     """
     # open() in a with-block works on Python 2 and 3 and closes all three
     # handles even if a lookup raises.
     with open('testdata/testGeneList.txt', 'r') as inputGeneList, \
             open("out/testGeneListExons.txt", 'w') as outputFileFP, \
             open("out/testGeneListExons.err.txt", 'w') as errorFileFP:
         gafDatasource = TestUtils.createTranscriptProviderDatasource(
             self.config)
         for line in inputGeneList:
             gene = line.strip()
             exons = gafDatasource.retrieveExons(gene, isCodingOnly=True)
             if len(exons) == 0:
                 errorFileFP.write("Could not locate " + gene + "\n")
             for e in exons:
                 outputFileFP.write('%s\t%s\t%s\t%s\n' %
                                    (e[0], e[1], e[2], e[3]))
Beispiel #33
0
    def testMicroRNA(self):
        """Test proper annotation of miRNA.

        Annotates an A>G SNP at chr12:31379268 and expects the gene name to
        match hsa-mir-3194-3p case-insensitively.  Start and end are assigned
        as integers here.
        """
        #uc021qwk.1	chr12:31379258-31379277:-	hsa-miR-3194-3p|?	chr12:31379258-31379277:-		Confidence=100
        gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)
        m = MutationDataFactory.default_create()
        m.start = 31379268
        m.end = 31379268
        m.chr = "12"
        m.alt_allele = 'G'

        # This is accurate
        m.ref_allele = 'A'
        m = gafDatasource.annotate_mutation(m)
        self.assertEqual(m['gene'].lower(), "hsa-mir-3194-3p", "Wrong gene (GT: hsa-mir-3194-3p): " + m['gene'] + "   -- if updating GAF, this test may fail as this result may not be appropriate.")
Beispiel #34
0
    def testSimpleAnnotationWithExampleVcf(self):
        """
        Tests the ability to do a simple Gaf 3.0 annotation.

        Runs the annotator over the example VCF with only the transcript
        datasource attached and renders to a TSV; passing means no exception
        was raised.
        """
        vcfPath = os.path.join("testdata", "vcf", "example.vcf")
        tsvPath = os.path.join("out", "simpleVCF.Gaf.annotated.out.tsv")

        vcfCreator = VcfInputMutationCreator(vcfPath)
        # The result is discarded, exactly as in the original call sequence.
        vcfCreator.createMutations()

        transcriptDS = TestUtils.createTranscriptProviderDatasource(self.config)

        annotator = Annotator()
        annotator.setInputCreator(vcfCreator)
        annotator.setOutputRenderer(SimpleOutputRenderer(tsvPath, []))
        annotator.addDatasource(transcriptDS)
        annotator.annotate()
Beispiel #35
0
 def testAKT1(self):
     """Test that this version of the GAF produces the up to date gene for a position given from a website user.

     Annotates a G>A SNP at chr14:105246407 and expects the gene AKT1.
     """
     m = MutationDataFactory.default_create()
     m.chr = '14'
     m.start = '105246407'
     m.end = '105246407'
     m.ref_allele = 'G'
     m.alt_allele = 'A'
     gafDatasource = TestUtils.createTranscriptProviderDatasource(
         self.config)
     m = gafDatasource.annotate_mutation(m)
     self.assertEqual(
         m['gene'], "AKT1", "Incorrect gene found: " + m['gene'] +
         "  If updating GAF, this may not be an error, but should be confirmed manually."
     )
    def testBasicDatasourceSorting(self):
        """Test that the GAF datasource is sorted before a gene-based datasource.

        Hands the factory a list in the wrong order (gene datasource first)
        and expects the gene datasource to come second after sorting.
        """
        gafDatasource = TestUtils.createTranscriptProviderDatasource(
            self.config)
        geneDS = DatasourceFactory.createDatasource(
            "testdata/small_tsv_ds/small_tsv_ds.config",
            "testdata/small_tsv_ds/")

        incorrectSortList = [geneDS, gafDatasource]
        guessSortList = DatasourceFactory.sortDatasources(incorrectSortList)
        # Check the length first so a dropped datasource is reported as a test
        # failure instead of an IndexError on the lookup below.
        self.assertEqual(
            len(guessSortList), 2,
            "Sorting altered number of datasources (gt: 2): " +
            str(len(guessSortList)))
        self.assertEqual(guessSortList[1], geneDS, "Sorting is incorrect.")
Beispiel #37
0
    def testNoLostMutations(self):
        """Run a simple gaf datasource annotation and make sure no mutations were lost.

        Every mutation created from the maflite input is annotated and
        validated.  The number processed must equal the number of lines in
        the input file minus one (presumably the header line).
        """
        inputFilename = 'testdata/maflite/Patient0.snp.maf.txt'
        inputCreator = MafliteInputMutationCreator(
            inputFilename, "configs/maflite_input.config")
        gafDatasource = TestUtils.createTranscriptProviderDatasource(
            self.config)

        # A context manager guarantees the handle is closed, and open() works
        # on both Python 2 and 3 (the file() builtin exists only on Python 2).
        with open(inputFilename, 'r') as inputFile:
            numMutsInput = sum(1 for _ in inputFile) - 1

        numAnnotated = 0
        for mutation in inputCreator.createMutations():
            annotated = gafDatasource.annotate_mutation(mutation)
            MutUtils.validateMutation(annotated)
            numAnnotated += 1

        self.assertEqual(numAnnotated, numMutsInput,
                         "Gaf data source altered mutation count.")
Beispiel #38
0
    def testMC1R(self):
        """Pin the gene this version of the GAF reports for 16:89985913 (G>A).

        The locus is expected to resolve to MC1R at some point, but for now
        the GAF yields TUBB3, and that is the value asserted here.
        """
        mut = MutationDataFactory.default_create()
        mut.chr = '16'
        mut.start = mut.end = '89985913'
        mut.ref_allele = 'G'
        mut.alt_allele = 'A'

        transcriptDS = TestUtils.createTranscriptProviderDatasource(self.config)
        mut = transcriptDS.annotate_mutation(mut)

        # At some point, we would expect this to be MC1R, not TUBB3
        geneFound = mut['gene']
        failureText = (
            "Incorrect gene found: " + geneFound +
            "  If updating GAF, this may not be an error, but should be confirmed manually.")
        self.assertTrue(geneFound == "TUBB3", failureText)
Beispiel #39
0
 def testChrGLs(self):
     """Annotate mutations on unaligned transcripts, i.e. when chromosome = GL.....

     Each mutation read from chrGLs.maf.tsv must annotate and validate
     without raising, and must end up with a non-empty gene annotation.
     """
     mafCreator = MafliteInputMutationCreator(
         'testdata/maflite/chrGLs.maf.tsv', "configs/maflite_input.config")
     transcriptDS = TestUtils.createTranscriptProviderDatasource(self.config)

     for mutation in mafCreator.createMutations():
         try:
             mutation = transcriptDS.annotate_mutation(mutation)
             MutUtils.validateMutation(mutation)
         except Exception as err:
             # Report any exception as a test failure, traceback included.
             failureText = ("Erroneous exception was thrown: " + str(err) + "\n" +
                            traceback.format_exc())
             self.assertTrue(False, failureText)
         self.assertTrue(mutation['gene'] != '')
Beispiel #40
0
    def testMicroRNA(self):
        """Check that a position inside a miRNA is annotated with that miRNA as gene.

        Reference record:
        uc021qwk.1  chr12:31379258-31379277:-  hsa-miR-3194-3p|?  chr12:31379258-31379277:-  Confidence=100
        """
        transcriptDS = TestUtils.createTranscriptProviderDatasource(self.config)

        mut = MutationDataFactory.default_create()
        position = 31379268
        mut.start = position
        mut.end = position
        mut.chr = "12"
        mut.alt_allele = 'G'
        # This is accurate
        mut.ref_allele = 'A'

        mut = transcriptDS.annotate_mutation(mut)

        # The comparison is case-insensitive; the message shows the raw value.
        geneIsExpected = mut['gene'].lower() == "hsa-mir-3194-3p"
        failureText = (
            "Wrong gene (GT: hsa-mir-3194-3p): " + mut['gene'] +
            "   -- if updating GAF, this test may fail as this result may not be appropriate.")
        self.assertTrue(geneIsExpected, failureText)
Beispiel #41
0
    def testSilentMutationGoingToSpliceSite(self):
        """Test that a silent mutation within 10 bp of a splice junction should become a splice site

        Walks the window chr1:28,233,780-28,233,805 (junction is at
        chr1:28,233,793 & 94) one base at a time, annotates a mutation at each
        position and counts the resulting variant classifications.
        """
        refs = "TGGGCTCGGGCTCTCTGAAAAGAAAA"
        alts = "TGGGCTCAGGCTCGCTGAAAAGAAAA"
        startWindow = 28233780
        junction = 28233793
        gafDatasource = TestUtils.createTranscriptProviderDatasource(
            self.config)

        vcs = []
        # refs and alts hold one base per window position (26 each), so walking
        # them in lockstep visits exactly startWindow .. startWindow + 25.
        for offset, (ref, alt) in enumerate(zip(refs, alts)):
            position = str(startWindow + offset)
            m = MutationDataFactory.default_create()
            m.start = position
            m.end = position
            m.chr = "1"
            m.ref_allele = ref
            m.alt_allele = alt

            m = gafDatasource.annotate_mutation(m)

            distanceFromSpliceSite = abs(junction - int(m.start))
            vc = m['variant_classification']
            vcs.append(vc)
            # Single-argument print() behaves the same on Python 2 and 3.
            print(vc + "  " + m.start + "  " + str(distanceFromSpliceSite))

        # Classifications are compared case-insensitively.
        numSpliceSites = sum(1 for vc in vcs if vc.lower() == "splice_site")
        numSilent = sum(1 for vc in vcs if vc.lower() == "silent")

        self.assertTrue(
            numSpliceSites == 4,
            "Should have seen 4 splice site mutations, but saw: " +
            str(numSpliceSites))
        self.assertTrue(
            numSilent == 11,
            "Should have seen 11 Silent mutations, but saw: " + str(numSilent))
    def testBasicAnnotation(self):
        """Annotate from a generic, transcript-keyed TSV datasource.

        A gaf datasource runs first so the transcript annotation is available.
        Only the presence of the expected output headers is confirmed.
        """
        # We need a gaf data source to annotate gene
        transcriptProviderDS = TestUtils.createTranscriptProviderDatasource(config=self.config)
        transcriptTsvDS = DatasourceFactory.createDatasource(
            "testdata/small_transcript_tsv_ds/small_transcript_tsv_ds.config",
            "testdata/small_transcript_tsv_ds/",
        )

        annotator = Annotator()
        annotator.setInputCreator(MafliteInputMutationCreator("testdata/maflite/Patient0.snp.maf.txt"))
        annotator.setOutputRenderer(SimpleOutputRenderer("out/genericTranscriptTest.out.tsv"))
        annotator.addDatasource(transcriptProviderDS)
        annotator.addDatasource(transcriptTsvDS)
        # The reader is opened on whatever filename annotate() returns.
        annotatedFilename = annotator.annotate()

        headers = GenericTsvReader(annotatedFilename).getFieldNames()
        for expectedHeader in ("refseq_test_mRNA_Id", "refseq_test_prot_Id"):
            self.assertTrue(
                expectedHeader in headers,
                expectedHeader + " not found in headers: " + str(headers))
    def testBasicAnnotation(self):
        """Annotate from a basic tsv gene file.

        Use the Gaf to annotate before trying the tsv -- required since the
        gene annotation must be populated.  Using trimmed CancerGeneCensus as
        basis for this test.
        """
        # Gene symbols in the trimmed Cancer Gene Census file, generated with:
        # cut -f 1 oncotator/test/testdata/small_tsv_ds/CancerGeneCensus_Table_1_full_2012-03-15_trim.txt | egrep -v Symbol | sed -r "s/^/'/g" | sed ':a;N;$!ba;s/\n/,/g' | sed -r "s/,'/','/g"
        genesAvailable = [
            'ABL1', 'ABL2', 'ACSL3', 'AF15Q14', 'AF1Q', 'AF3p21', 'AF5q31',
            'AKAP9', 'AKT1', 'AKT2', 'ALDH2', 'ALK', 'ALO17', 'APC',
            'ARHGEF12', 'ARHH', 'ARID1A', 'ARID2', 'ARNT', 'ASPSCR1', 'ASXL1',
            'ATF1', 'ATIC', 'ATM', 'ATRX', 'BAP1', 'BCL10', 'BCL11A', 'BCL11B',
        ]

        # We need a gaf data source to annotate gene
        gafDatasource = TestUtils.createTranscriptProviderDatasource(config=self.config)
        geneDS = DatasourceFactory.createDatasource("testdata/small_tsv_ds/small_tsv_ds.config", "testdata/small_tsv_ds/")
        outputFilename = 'out/genericGeneTest.out.tsv'

        annotator = Annotator()
        annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/Patient0.snp.maf.txt'))
        annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
        annotator.addDatasource(gafDatasource)
        annotator.addDatasource(geneDS)
        annotator.annotate()

        # Check that there were actual annotations performed.
        tsvReader = GenericTsvReader(outputFilename)

        fields = tsvReader.getFieldNames()
        # The failure message now names the column that is actually checked.
        self.assertTrue('CGC_Abridged_Other Syndrome/Disease' in fields, "'CGC_Abridged_Other Syndrome/Disease' was not present in the header")
        self.assertTrue('CGC_Abridged_Mutation Type' in fields, "'CGC_Abridged_Mutation Type' was not present in the header")

        linesThatShouldBeAnnotated = 0
        # ctr is the 1-based row number, used only in the failure message.
        for ctr, lineDict in enumerate(tsvReader, 1):
            self.assertTrue('gene' in lineDict.keys())
            if lineDict['gene'] in genesAvailable:
                # '!=' replaces the '<>' operator, which Python 3 no longer accepts.
                self.assertTrue(lineDict['CGC_Abridged_GeneID'] != '', "'CGC_Abridged_GeneID' was missing on a row that should have been populated.  Line: " + str(ctr))
                linesThatShouldBeAnnotated += 1
        # Guard against vacuous success: at least one row must hit a known gene.
        self.assertTrue(linesThatShouldBeAnnotated > 0, "Bad data -- cannot test missed detects.")
Beispiel #44
0
    def testFlank2(self):
        """Second real-world flank scenario: 1:228646357 C>T, nearest gene HIST3H2A.

        The mutation must be annotated with gene HIST3H2A and classified as
        5'Flank.
        """
        transcriptDS = TestUtils.createTranscriptProviderDatasource(self.config)

        # 1	228646357 nearest Gene=HIST3H2A C>T
        position = str(228646357)
        mut = MutationDataFactory.default_create()
        mut.start = position
        mut.end = position
        mut.chr = "1"
        mut.ref_allele = 'C'
        mut.alt_allele = 'T'
        mut = transcriptDS.annotate_mutation(mut)

        geneFailureText = (
            "Wrong gene (GT: HIST3H2A): " + mut['gene'] +
            "   -- if updating GAF, this test may fail as this gene may not be appropriate.")
        self.assertTrue(mut['gene'] == "HIST3H2A", geneFailureText)

        vcFailureText = (
            "Should be 5'Flank, but was " + mut['variant_classification'] +
            " -- if updating GAF, this test may fail as this test is data specific.  Also, this may fail if padding parameters are changed.")
        self.assertTrue(mut['variant_classification'] == "5'Flank", vcFailureText)
Beispiel #45
0
    def testSpliceSiteWithinNBases(self):
        """Test that a silent mutation is changed to splice site w/in 10 bases of a splice site

        Window: chr21:10,998,326-10,998,346.  10,998,336 is a splice site
        (junction between 10998335 and 336).  Reference sequence:
        AGTTCTCCTT C TGGAAAAAAG
        """
        refs = 'AGTTCTCCTTCTGGAAAAAAG'
        alts = 'TCAGACTGAAAATACCCCCCT'
        startWindow = 10998326
        gafDatasource = TestUtils.createTranscriptProviderDatasource(
            self.config)

        vcs = []
        # refs and alts hold one base per window position (21 each), so walking
        # them in lockstep visits exactly 10998326 .. 10998346.
        for offset, (ref, alt) in enumerate(zip(refs, alts)):
            position = str(startWindow + offset)
            m = MutationDataFactory.default_create()
            m.start = position
            m.end = position
            m.chr = "21"
            m.ref_allele = ref
            m.alt_allele = alt

            m = gafDatasource.annotate_mutation(m)

            vc = m['variant_classification']
            self.assertTrue(
                vc != 'Silent',
                'Silent mutation found when it should be a splice site.')

            vcs.append(vc)
            # Single-argument print() behaves the same on Python 2 and 3.
            print(vc + "  " + m.start)

        # Indices 8..11 are the four positions 10998334..10998337 around the junction.
        self.assertTrue(
            all([tmp == "Splice_Site" for tmp in vcs[8:12]]),
            "Not all vcs within 2 bases were splice site: " + str(vcs[8:12]))
        self.assertTrue(all([tmp != "Splice_Site" for tmp in vcs[0:8]]),
                        "No splice sites should be seen: " + str(vcs[0:8]))
        # NOTE(review): vcs has 21 entries, so [12:20] leaves the last position
        # (10998346, index 20) unchecked -- confirm whether that is intended.
        self.assertTrue(all([tmp != "Splice_Site" for tmp in vcs[12:20]]),
                        "No splice sites should be seen: " + str(vcs[12:20]))
    def testBasicAnnotation(self):
        """Annotate from a basic tsv gene file.

        Use the Gaf to annotate before trying the tsv -- required since the
        gene annotation must be populated.  Using trimmed CancerGeneCensus as
        basis for this test.
        """

        # Gene symbols in the trimmed Cancer Gene Census file, generated with:
        # cut -f 1 oncotator/test/testdata/small_tsv_ds/CancerGeneCensus_Table_1_full_2012-03-15_trim.txt | egrep -v Symbol | sed -r "s/^/'/g" | sed ':a;N;$!ba;s/\n/,/g' | sed -r "s/,'/','/g"
        genesAvailable = [
            "ABL1",
            "ABL2",
            "ACSL3",
            "AF15Q14",
            "AF1Q",
            "AF3p21",
            "AF5q31",
            "AKAP9",
            "AKT1",
            "AKT2",
            "ALDH2",
            "ALK",
            "ALO17",
            "APC",
            "ARHGEF12",
            "ARHH",
            "ARID1A",
            "ARID2",
            "ARNT",
            "ASPSCR1",
            "ASXL1",
            "ATF1",
            "ATIC",
            "ATM",
            "ATRX",
            "BAP1",
            "BCL10",
            "BCL11A",
            "BCL11B",
        ]

        # We need a gaf data source to annotate gene

        gafDatasource = TestUtils.createTranscriptProviderDatasource(config=self.config)
        geneDS = DatasourceFactory.createDatasource(
            "testdata/small_tsv_ds/small_tsv_ds.config", "testdata/small_tsv_ds/"
        )
        outputFilename = "out/genericGeneTest.out.tsv"

        annotator = Annotator()
        annotator.setInputCreator(MafliteInputMutationCreator("testdata/maflite/Patient0.snp.maf.txt"))
        annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
        annotator.addDatasource(gafDatasource)
        annotator.addDatasource(geneDS)
        annotator.annotate()

        # Check that there were actual annotations performed.
        tsvReader = GenericTsvReader(outputFilename)

        fields = tsvReader.getFieldNames()
        # The failure message now names the column that is actually checked.
        self.assertTrue(
            "CGC_Abridged_Other Syndrome/Disease" in fields,
            "'CGC_Abridged_Other Syndrome/Disease' was not present in the header",
        )
        self.assertTrue(
            "CGC_Abridged_Mutation Type" in fields, "'CGC_Abridged_Mutation Type' was not present in the header"
        )

        linesThatShouldBeAnnotated = 0
        # ctr is the 1-based row number, used only in the failure message.
        for ctr, lineDict in enumerate(tsvReader, 1):
            self.assertTrue("gene" in lineDict.keys())
            if lineDict["gene"] in genesAvailable:
                self.assertTrue(
                    lineDict["CGC_Abridged_GeneID"] != "",
                    "'CGC_Abridged_GeneID' was missing on a row that should have been populated.  Line: " + str(ctr),
                )
                linesThatShouldBeAnnotated += 1
        # Guard against vacuous success: at least one row must hit a known gene.
        self.assertTrue(linesThatShouldBeAnnotated > 0, "Bad data -- cannot test missed detects.")
    def testRealWorld(self):
        """Test that the full COSMIC datasource can retrieve entries by both gp and gpp.

        Three positional lookups are checked first (each annotated by the GAF
        and then by COSMIC), followed by a gpp record for ABL1 at protein
        position 255 that is annotated by COSMIC only.
        """
        gafDS = TestUtils.createTranscriptProviderDatasource(self.config)
        cosmicDS = TestUtils.createCosmicDatasource(self.config)

        def annotateSnp(chrom, position, altAllele):
            """Build a single-base G>altAllele mutation and annotate it with GAF, then COSMIC."""
            snp = MutationDataFactory.default_create()
            snp.chr = chrom
            snp.start = position
            snp.end = position
            snp.ref_allele = "G"
            snp.alt_allele = altAllele
            snp = gafDS.annotate_mutation(snp)
            return cosmicDS.annotate_mutation(snp)

        # These values are not taken from a real world scenario, but are cooked for this test.
        m = annotateSnp('1', '12941796', "T")
        self.assertTrue(m['COSMIC_n_overlapping_mutations'] == '0')

        # 7:140481411-140481411, G>T
        # As a reminder, the COSMIC datasource does not check the ref and alt allele for a match.
        m = annotateSnp('7', '140481411', "T")
        self.assertTrue(m['COSMIC_n_overlapping_mutations'] == '22')

        # 7:140481411-140481411, G>A -- same position, so the same count is expected.
        m = annotateSnp('7', '140481411', "A")
        self.assertTrue(m['COSMIC_n_overlapping_mutations'] == '22')

        # Test a gpp record
        m = MutationDataFactory.default_create()
        m.createAnnotation("gene", "ABL1")
        for annotationName in ("transcript_protein_position_start", "transcript_protein_position_end"):
            m.createAnnotation(annotationName, "255")

        # These values (chr, start, and end) are incorrect.  This is by design -- just being used as dummy values.
        m.chr = '9'
        m.start = '1'
        m.end = '1'

        m = cosmicDS.annotate_mutation(m)

        # Where the expected numbers come from:
        #
        # $ grep -P "^ABL1\t" ~/broad_oncotator_configs/CosmicCompleteTargetedScreensMutantExport.tsv/v76/CosmicCompleteTargetedScreensMutantExport.tsv | cut -f 19,24 | sort | egrep "[A-Z]255[A-Z]" | wc
        # 157
        #
        # The rest:
        # $ grep -P "^ABL1\t" ~/broad_oncotator_configs/CosmicCompleteTargetedScreensMutantExport.tsv/v76/CosmicCompleteTargetedScreensMutantExport.tsv | cut -f 19,24 | sort | uniq | egrep "_"
        # p.H295_P296insH	9:133747575-133747576
        # p.K356_K357insE	9:133748407-133748408
        # p.K357_N358insK	9:133748407-133748408
        # p.L184_K274del	9:133738150-133738422
        # p.L248_K274del
        #
        # $ grep -P "^ABL1\t" ~/broad_oncotator_configs/CosmicCompleteTargetedScreensMutantExport.tsv/v76/CosmicCompleteTargetedScreensMutantExport.tsv | cut -f 19,24 | sort | egrep "_K274"
        # p.L184_K274del	9:133738150-133738422
        # p.L184_K274del	9:133738150-133738422
        # p.L248_K274del
        # p.L248_K274del
        #
        # four overlap p.255 + 157 that are exact matches for p.255.
        # 155 + 4 were hematopoietic_and_lymphoid_tissue

        self.assertTrue(m['COSMIC_n_overlapping_mutations'] == str(157 + 4))

        for aaFragment in ('255', 'p.L184_K274del', 'p.L248_K274del'):
            self.assertTrue(m['COSMIC_overlapping_mutation_AAs'].find(aaFragment) != -1)

        expectedSite = "haematopoietic_and_lymphoid_tissue(159)"
        siteFailureText = (
            "Did not have the correct primary sites annotation (haematopoietic_and_lymphoid_tissue(159)): "
            + m['COSMIC_overlapping_primary_sites'])
        self.assertTrue(
            m['COSMIC_overlapping_primary_sites'].find(expectedSite) != -1,
            siteFailureText)
    def testRealWorld(self):
        """Test that the full COSMIC datasource can retrieve entries by both gp and gpp.

        A table of positional cases is run through the GAF and then COSMIC,
        checking the overlap count of each.  Afterwards a gpp record (ABL1,
        protein position 255) is annotated by COSMIC alone and its overlap
        annotations are checked.
        """
        gafDS = TestUtils.createTranscriptProviderDatasource(self.config)
        cosmicDS = TestUtils.createCosmicDatasource(self.config)

        # (chr, position, alt allele, expected COSMIC_n_overlapping_mutations); ref is always G.
        # These values are not taken from a real world scenario, but are cooked for this test.
        # As a reminder, the COSMIC datasource does not check the ref and alt allele for a
        # match, so both alt alleles at 7:140481411 expect the same count.
        positionalCases = (
            ('1', '12941796', "T", '0'),
            ('7', '140481411', "T", '22'),
            ('7', '140481411', "A", '22'),
        )
        for chrom, position, altAllele, expectedCount in positionalCases:
            m = MutationDataFactory.default_create()
            m.chr = chrom
            m.start = position
            m.end = position
            m.ref_allele = "G"
            m.alt_allele = altAllele
            m = gafDS.annotate_mutation(m)
            m = cosmicDS.annotate_mutation(m)

            self.assertTrue(m['COSMIC_n_overlapping_mutations'] == expectedCount)

        # Test a gpp record
        m = MutationDataFactory.default_create()
        m.createAnnotation("gene", "ABL1")
        m.createAnnotation("transcript_protein_position_start", "255")
        m.createAnnotation("transcript_protein_position_end", "255")

        # These values (chr, start, and end) are incorrect.  This is by design -- just being used as dummy values.
        m.chr, m.start, m.end = '9', '1', '1'

        m = cosmicDS.annotate_mutation(m)

        # Where the expected numbers come from:
        #
        # $ grep -P "^ABL1\t" ~/broad_oncotator_configs/CosmicCompleteTargetedScreensMutantExport.tsv/v76/CosmicCompleteTargetedScreensMutantExport.tsv | cut -f 19,24 | sort | egrep "[A-Z]255[A-Z]" | wc
        # 157
        #
        # The rest:
        # $ grep -P "^ABL1\t" ~/broad_oncotator_configs/CosmicCompleteTargetedScreensMutantExport.tsv/v76/CosmicCompleteTargetedScreensMutantExport.tsv | cut -f 19,24 | sort | uniq | egrep "_"
        # p.H295_P296insH	9:133747575-133747576
        # p.K356_K357insE	9:133748407-133748408
        # p.K357_N358insK	9:133748407-133748408
        # p.L184_K274del	9:133738150-133738422
        # p.L248_K274del
        #
        # $ grep -P "^ABL1\t" ~/broad_oncotator_configs/CosmicCompleteTargetedScreensMutantExport.tsv/v76/CosmicCompleteTargetedScreensMutantExport.tsv | cut -f 19,24 | sort | egrep "_K274"
        # p.L184_K274del	9:133738150-133738422
        # p.L184_K274del	9:133738150-133738422
        # p.L248_K274del
        # p.L248_K274del
        #
        # four overlap p.255 + 157 that are exact matches for p.255.
        # 155 + 4 were hematopoietic_and_lymphoid_tissue

        expectedOverlaps = str(157 + 4)
        self.assertTrue(m['COSMIC_n_overlapping_mutations'] == expectedOverlaps)

        self.assertTrue(m['COSMIC_overlapping_mutation_AAs'].find('255') != -1)
        self.assertTrue(
            m['COSMIC_overlapping_mutation_AAs'].find('p.L184_K274del') != -1)
        self.assertTrue(
            m['COSMIC_overlapping_mutation_AAs'].find('p.L248_K274del') != -1)

        siteWasFound = m['COSMIC_overlapping_primary_sites'].find(
            "haematopoietic_and_lymphoid_tissue(159)") != -1
        self.assertTrue(
            siteWasFound,
            "Did not have the correct primary sites annotation (haematopoietic_and_lymphoid_tissue(159)): "
            + m['COSMIC_overlapping_primary_sites'])
Beispiel #49
0
 def _createGafDataSource(self):
     """Log that gaf 3.0 is being initialized and return the transcript provider datasource.

     The datasource is built by TestUtils from this test's config.
     """
     self.logger.info("Initializing gaf 3.0")
     gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)
     return gafDatasource