def test_simple_seg_file_input(self):
        """Read a seg file, apply no annotation, and render it as SIMPLE_TSV.

        First checks that the input creator yields 27 segments, then renders
        the segments and verifies that the output file has the required
        columns and non-blank start/end values on every row.
        """
        inputFilename = "testdata/seg/Patient0.seg.txt"
        output_filename = "out/test_simple_seg_file_input.tsv"
        # Drop any output left over from an earlier run before rendering.
        if os.path.exists(output_filename):
            os.remove(output_filename)
        ic = MafliteInputMutationCreator(inputFilename, 'configs/seg_file_input.config')

        # Count explicitly: a pre-seeded loop index would report a bogus
        # count in the failure message when no segments are produced.
        num_segs = sum(1 for _ in ic.createMutations())
        self.assertEqual(num_segs, 27, "Found %d segments when there should have been 27." % num_segs)

        # The counting pass has already iterated the first result, so a fresh
        # creator is built for the rendering pass.
        ic = MafliteInputMutationCreator(inputFilename, 'configs/seg_file_input.config')
        segs = ic.createMutations()

        outputRenderer = SimpleOutputRenderer(output_filename, '')
        outputRenderer.renderMutations(segs)

        # Now check the output
        output_reader = GenericTsvReader(output_filename)

        headers = output_reader.getFieldNames()
        for rcol in ("Sample", "Num_Probes", "Segment_Mean"):
            self.assertIn(rcol, headers)

        for line_dict in output_reader:
            for coord in ('start', 'end'):
                self.assertIsNotNone(line_dict[coord])
                self.assertNotEqual(line_dict[coord].strip(), "")
Beispiel #2
0
 def test_onp_ignore_indels(self):
     """Indels must come out of the ONP combiner unchanged (never merged into ONPs)."""
     maf_path = 'testdata/maflite/onp.indel.maf.txt'
     combiner = OnpCombiner(MafliteInputMutationCreator(maf_path))
     # The plain, uncombined mutations are the expected result.
     uncombined = list(MafliteInputMutationCreator(maf_path).createMutations())
     combined = list(combiner.createMutations())
     self.assert_mutations_match_expected(expected=uncombined, result=combined)
    def testFullIndelVcf(self):
        """Run an indel maflite through the annotator into a TCGA VCF and check positions.

        Deletions are expected one base earlier in the VCF than in the
        maflite (e.g. 1:36643701 becomes 1:36643700); every other variant
        must keep its maflite start position.
        """
        maflite_filename = "testdata/maflite/Patient0.indel.maf.txt"
        outputFilename = "out/TCGAVCFTest.indel.vcf"
        callStatsIn = MafliteInputMutationCreator(maflite_filename)
        vcfOR = TcgaVcfOutputRenderer(outputFilename)
        datasources = self._createDatasourcesForTesting()

        annotator = Annotator()
        annotator.setInputCreator(callStatsIn)
        annotator.setOutputRenderer(vcfOR)
        annotator.setManualAnnotations(self._createManualAnnotations())
        for ds in datasources:
            annotator.addDatasource(ds)
        annotator.annotate()

        self.assertTrue(os.path.exists(outputFilename))

        # Re-read the maflite so the input positions can be compared with the VCF.
        maflite_ic = MafliteInputMutationCreator(maflite_filename)
        muts = maflite_ic.createMutations()

        # Read all positions inside a context manager so the VCF handle is
        # closed; a set keeps the membership checks below cheap.
        with open(outputFilename, 'r') as vcf_fp:
            vcf_pos = {int(rec.POS) for rec in vcf.Reader(vcf_fp)}

        for m in muts:
            # If the variant is a deletion, then the vcf position should be the same as maflite minus one.  Otherwise, the same.
            is_variant_deletion = m.alt_allele in ("", "-", ".")
            if is_variant_deletion:
                self.assertTrue((int(m.start) - 1) in vcf_pos, "Deletion was not correct for " + m.chr + ":" + m.start)
            else:
                self.assertTrue(int(m.start) in vcf_pos, "Insertion was not correct for " + m.chr + ":" + m.start)
    def testFullIndelVcf(self):
        """Run an indel maflite through the annotator into a TCGA VCF and check positions.

        Deletions are expected one base earlier in the VCF than in the
        maflite (e.g. 1:36643701 becomes 1:36643700); every other variant
        must keep its maflite start position.
        """
        maflite_filename = "testdata/maflite/Patient0.indel.maf.txt"
        outputFilename = "out/TCGAVCFTest.indel.vcf"
        callStatsIn = MafliteInputMutationCreator(maflite_filename)
        vcfOR = TcgaVcfOutputRenderer(outputFilename)
        datasources = self._createDatasourcesForTesting()

        annotator = Annotator()
        annotator.setInputCreator(callStatsIn)
        annotator.setOutputRenderer(vcfOR)
        annotator.setManualAnnotations(self._createManualAnnotations())
        for ds in datasources:
            annotator.addDatasource(ds)
        annotator.annotate()

        self.assertTrue(os.path.exists(outputFilename))

        # Re-read the maflite so the input positions can be compared with the VCF.
        maflite_ic = MafliteInputMutationCreator(maflite_filename)
        muts = maflite_ic.createMutations()

        # Read all positions inside a context manager so the VCF handle is
        # closed; a set keeps the membership checks below cheap.
        with open(outputFilename, 'r') as vcf_fp:
            vcf_pos = {int(rec.POS) for rec in vcf.Reader(vcf_fp)}

        for m in muts:
            # If the variant is a deletion, then the vcf position should be the same as maflite minus one.  Otherwise, the same.
            is_variant_deletion = m.alt_allele in ("", "-", ".")
            if is_variant_deletion:
                self.assertTrue((int(m.start) - 1) in vcf_pos, "Deletion was not correct for " + m.chr + ":" + m.start)
            else:
                self.assertTrue(int(m.start) in vcf_pos, "Insertion was not correct for " + m.chr + ":" + m.start)
Beispiel #5
0
    def testFullSnpVcf(self):
        """Run SNP call stats (maflite) through the annotator into a TCGA VCF.

        Checks that the VCF file is created and that the records read back
        from it carry a QUAL value (i.e. not ".").
        """
        outputFilename = "out/TCGAVCFTest.snp.vcf"
        callStatsIn = MafliteInputMutationCreator(
            "testdata/Test.call_stats.trim.txt")
        vcfOR = TcgaVcfOutputRenderer(outputFilename)
        datasources = self._createDatasourcesForTesting()

        annotator = Annotator()
        annotator.setInputCreator(callStatsIn)
        annotator.setOutputRenderer(vcfOR)
        annotator.setManualAnnotations(self._createManualAnnotations())
        for ds in datasources:
            annotator.addDatasource(ds)
        annotator.annotate()

        self.assertTrue(os.path.exists(outputFilename))

        # NOTE(review): the number of VCF records inspected is paced by the
        # *indel* maflite, although the VCF was rendered from the call stats
        # file above -- confirm whether that mismatch is intentional.
        maflite_ic = MafliteInputMutationCreator(
            "testdata/maflite/Patient0.indel.maf.txt")
        muts = maflite_ic.createMutations()

        # Keep the VCF handle in a context manager so it is always closed.
        with open(outputFilename, 'r') as vcf_fp:
            vcf_reader = vcf.Reader(vcf_fp)
            for _ in muts:
                # Builtin next() works on both Python 2 and Python 3 iterators.
                rec = next(vcf_reader)

                # All records should have QUAL with a value (i.e. NOT ".")
                self.assertIsNotNone(rec.QUAL)
 def test_alt1_vs_alt2(self):
     """When both alternates are given, the one that differs from the reference must be used."""
     creator = MafliteInputMutationCreator("testdata/maflite/alt1_vs_alt2.maflite")
     # Number the mutations from 1 so the failure message points at the offending one.
     for line_no, mutation in enumerate(creator.createMutations(), 1):
         failure_msg = "Did not properly populate the alternate allele in line " + str(line_no) + "  " + mutation.alt_allele
         self.assertTrue(mutation.alt_allele == "C", failure_msg)
 def testChromosomeM(self):
     """Every mutation from the chrM maflite must report chromosome M rather than MT."""
     creator = MafliteInputMutationCreator("testdata/maflite/chrM.maf.txt")
     for mutation in creator.createMutations():
         failure_msg = "mitochondria chromosome should be M, not " + mutation.chr
         self.assertTrue(mutation.chr == "M", failure_msg)
 def testSimpleRead(self):
     """Every mutation read from a well-formed maflite file must validate."""
     creator = MafliteInputMutationCreator("testdata/maflite/Patient0.indel.maf.txt", 'configs/maflite_input.config')

     # There is no explicit assertion: the test passes when validation
     # never throws for any mutation.
     for mutation in creator.createMutations():
         MutUtils.validateMutation(mutation)
 def testSampleNameSelectorWithMaf(self):
     """A selector built from the first maflite mutation must name every mutation's sample pair."""
     creator = MafliteInputMutationCreator("testdata/maflite/tiny_maflite.maf.txt")
     # Seed the selector with the first mutation only.
     selector = SampleNameSelector(next(creator.createMutations()))
     # createMutations() is called again to walk the mutations from the start.
     for mutation in creator.createMutations():
         self.assertEqual("Patient0-Normal-Patient0-Tumor", selector.getSampleName(mutation))
     self.assertEqual(selector.getAnnotationSource(), "OUTPUT")
     self.assertEqual(selector.getOutputAnnotationName(), MutUtils.SAMPLE_NAME_ANNOTATION_NAME)
 def testTCGAMAFAsInput(self):
     """Read a TCGA MAF through the MAFLITE input creator and render it with no annotation.

     There is no explicit assertion; the test passes when rendering completes
     without raising.
     """
     creator = MafliteInputMutationCreator("testdata/maf/Patient0.maf.annotated", 'configs/maflite_input.config')
     mutations = creator.createMutations()

     renderer = TcgaMafOutputRenderer("out/testTCGAMAFAsInput.tsv", 'configs/tcgaMAF2.4_output.config')
     # The input comments are handed to the renderer together with the mutations.
     renderer.renderMutations(mutations, creator.getComments())
 def testGetMetadata(self):
     """Metadata must be retrievable even though createMutations has not been called."""
     creator = MafliteInputMutationCreator("testdata/maflite/tiny_maflite.maf.txt")
     expected_keys = {
         'build', 'chr', 'start', 'end', 'ref_allele', 'alt_allele', 'tumor_barcode', 'normal_barcode',
         'tumor_f', 'init_t_lod', 't_lod_fstar', 't_alt_count', 't_ref_count', 'judgement',
     }
     actual_keys = set(creator.getMetadata().keys())
     # Any key present on only one side (missing or unexpected) is a failure.
     mismatched = expected_keys ^ actual_keys
     self.assertTrue(not mismatched, "Missing keys that should have been seen in the metadata: " + str(mismatched))
 def testNumberOfMuts(self):
     """The input creator must yield one mutation per data line of the maflite file."""
     inputFilename = "testdata/maflite/Patient0.snp.maf.txt"
     tmp = MafliteInputMutationCreator(inputFilename, 'configs/maflite_input.config')
     muts = tmp.createMutations()
     # Count lines with a context manager so the handle is closed; the
     # "- 1" discounts the single header line.
     with open(inputFilename, 'r') as fp:
         numMutsInput = sum(1 for _ in fp) - 1
     ctr = sum(1 for _ in muts)
     self.assertEqual(ctr, numMutsInput, "Did not see the proper number of mutations.")
    def testSimpleRead(self):
        """Every mutation read from a well-formed maflite file must validate."""
        creator = MafliteInputMutationCreator(
            "testdata/maflite/Patient0.indel.maf.txt", None,
            'configs/maflite_input.config')

        # There is no explicit assertion: the test passes when validation
        # never throws for any mutation.
        for mutation in creator.createMutations():
            MutUtils.validateMutation(mutation)
Beispiel #14
0
 def testNoUnknownAnnotations(self):
     """The gaf 3.0 datasource must never leave an annotation whose source is Unknown."""
     creator = MafliteInputMutationCreator('testdata/maflite/Patient0.snp.maf.txt')
     datasource = TestUtils.createTranscriptProviderDatasource(self.config)
     for mutation in creator.createMutations():
         annotated = datasource.annotate_mutation(mutation)
         MutUtils.validateMutation(annotated)
         unknowns = MutUtils.getUnknownAnnotations(annotated)
         failure_msg = "Unknown annotations exist in mutation: " + str(unknowns)
         self.assertTrue(len(unknowns) == 0, failure_msg)
 def testSampleNameSelectorWithMaf(self):
     """A selector built from the first maflite mutation must name every mutation's sample pair."""
     creator = MafliteInputMutationCreator(
         "testdata/maflite/tiny_maflite.maf.txt")
     # Seed the selector with the first mutation only.
     selector = SampleNameSelector(next(creator.createMutations()))
     # createMutations() is called again to walk the mutations from the start.
     for mutation in creator.createMutations():
         sample_name = selector.getSampleName(mutation)
         self.assertEqual("Patient0-Normal-Patient0-Tumor", sample_name)
     self.assertEqual(selector.getAnnotationSource(), "OUTPUT")
     annotation_name = selector.getOutputAnnotationName()
     self.assertEqual(annotation_name, MutUtils.SAMPLE_NAME_ANNOTATION_NAME)
    def testTCGAMAFAsInput(self):
        """Read a TCGA MAF through the MAFLITE input creator and render it with no annotation.

        There is no explicit assertion; the test passes when rendering
        completes without raising.
        """
        creator = MafliteInputMutationCreator(
            "testdata/maf/Patient0.maf.annotated", None,
            'configs/maflite_input.config')
        mutations = creator.createMutations()

        renderer = TcgaMafOutputRenderer(
            "out/testTCGAMAFAsInput.tsv", 'configs/tcgaMAF2.4_output.config')
        # The input comments are handed to the renderer together with the mutations.
        renderer.renderMutations(mutations, creator.getComments())
 def testNumberOfMuts(self):
     """The input creator must yield one mutation per data line of the maflite file."""
     inputFilename = "testdata/maflite/Patient0.snp.maf.txt"
     tmp = MafliteInputMutationCreator(inputFilename)
     muts = tmp.createMutations()
     # Count lines with a context manager so the handle is closed; the
     # "- 1" discounts the single header line.
     with open(inputFilename, 'r') as fp:
         numMutsInput = sum(1 for _ in fp) - 1
     ctr = sum(1 for _ in muts)
     self.assertEqual(ctr, numMutsInput,
                      "Did not see the proper number of mutations.")
 def test_alt1_vs_alt2(self):
     """When both alternates are given, the one that differs from the reference must be used."""
     creator = MafliteInputMutationCreator(
         "testdata/maflite/alt1_vs_alt2.maflite")
     # Number the mutations from 1 so the failure message points at the offending one.
     for line_no, mutation in enumerate(creator.createMutations(), 1):
         failure_msg = ("Did not properly populate the alternate allele in line " +
                        str(line_no) + "  " + mutation.alt_allele)
         self.assertTrue(mutation.alt_allele == "C", failure_msg)
Beispiel #19
0
 def testChrGLs(self):
     """Mutations on unaligned transcripts (chromosome = GL.....) must annotate and get a gene."""
     creator = MafliteInputMutationCreator('testdata/maflite/chrGLs.maf.tsv', "configs/maflite_input.config")
     datasource = TestUtils.createTranscriptProviderDatasource(self.config)
     for mutation in creator.createMutations():
         try:
             mutation = datasource.annotate_mutation(mutation)
             MutUtils.validateMutation(mutation)
         except Exception as err:
             # Any exception here is a test failure; attach the traceback for diagnosis.
             failure_msg = "Erroneous exception was thrown: " + str(err) + "\n" + traceback.format_exc()
             self.assertTrue(False, failure_msg)
         gene_name = mutation['gene']
         self.assertTrue(gene_name != '')
Beispiel #20
0
    def testNoLostMutations(self):
        """Annotate with the gaf datasource and verify that no mutations are lost."""
        inputFilename = 'testdata/maflite/Patient0.snp.maf.txt'
        inputCreator = MafliteInputMutationCreator(inputFilename, "configs/maflite_input.config")
        gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)

        # Count lines with a context manager so the handle is closed; the
        # "- 1" discounts the single header line.
        with open(inputFilename, 'r') as fp:
            numMutsInput = sum(1 for _ in fp) - 1

        ctr = 0
        for m in inputCreator.createMutations():
            m = gafDatasource.annotate_mutation(m)
            MutUtils.validateMutation(m)
            ctr += 1
        self.assertEqual(ctr, numMutsInput, "Gaf data source altered mutation count.")
Beispiel #21
0
 def testNoUnknownAnnotations(self):
     """The gaf 3.0 datasource must never leave an annotation whose source is Unknown."""
     creator = MafliteInputMutationCreator(
         'testdata/maflite/Patient0.snp.maf.txt')
     datasource = TestUtils.createTranscriptProviderDatasource(
         self.config)
     for mutation in creator.createMutations():
         annotated = datasource.annotate_mutation(mutation)
         MutUtils.validateMutation(annotated)
         unknowns = MutUtils.getUnknownAnnotations(annotated)
         failure_msg = ("Unknown annotations exist in mutation: " +
                        str(unknowns))
         self.assertTrue(len(unknowns) == 0, failure_msg)
Beispiel #22
0
    def testManualAnnotations(self):
        """Manual annotations set on the Annotator must appear verbatim on every output row."""
        overrides = {'source': 'Capture', 'status': 'Somatic', 'phase': 'Phase_I', 'sequencer': 'Illumina GAIIx'}
        annotator = Annotator()
        annotator.setManualAnnotations(overrides)
        maf_creator = MafliteInputMutationCreator('testdata/maflite/Patient0.snp.maf.txt')
        tsv_renderer = SimpleOutputRenderer("out/testManualAnnotationsFile.tsv")
        annotator.setInputCreator(maf_creator)
        annotator.setOutputRenderer(tsv_renderer)

        rendered_path = annotator.annotate()

        override_keys = overrides.keys()

        # An empty output file would make the row checks below pass vacuously.
        file_size = os.stat(rendered_path).st_size
        self.assertTrue(file_size > 0, "Generated TSV file (" + rendered_path + ") is empty.")

        # Rows are numbered from 1 in the failure messages.
        for row_no, row in enumerate(GenericTsvReader(rendered_path), 1):
            where = str(row_no)
            for key in override_keys:
                self.assertTrue(row[key] != "__UNKNOWN__",
                                "__UNKNOWN__ value seen on line " + where + ", when it should be populated: " + key)
                self.assertTrue(row[key] != "",
                                "Blank value seen on line " + where + ", when it should be populated: " + key)
                self.assertTrue(row[key] == overrides[key],
                                "Value for " + key + " on line " + where + " did not match override: " + str(
                                    row[key]) + " <> " + str(overrides[key]))
Beispiel #23
0
    def testBlankAnnotatorInit(self):
        """Annotate with no datasources at all and check the line count of the rendered file."""
        self.logger.info("Starting Blank Annotator Init Test...")

        inputCreator = MafliteInputMutationCreator(
            'testdata/maflite/tiny_maflite.maf.txt')
        outputRenderer = SimpleOutputRenderer(
            "out/testBlankAnnotatorTestFile.tsv")

        # Wire the input creator and output renderer into a fresh Annotator
        # and run it without adding any datasource.
        annotator = Annotator()
        annotator.setInputCreator(inputCreator)
        annotator.setOutputRenderer(outputRenderer)
        testOutputFilename = annotator.annotate()

        # Expected line count: one line per variant per sample, plus one
        # header line and two comment lines.
        numSamples = 1
        numExtraLines = 3  # one for header, two for comment lines
        numDoubleLines = 0  # Number of lines with two alt alleles
        numVariants = 9
        gt = numSamples * numVariants + numDoubleLines * numSamples + numExtraLines

        # The context manager closes the file even if counting raises.
        with open(testOutputFilename, 'r') as fp:
            ctr = sum(1 for _ in fp)
        self.assertEqual(
            ctr, gt, "Number of lines read was not correct: " + str(ctr) +
            " -- should have been: " + str(gt))
 def testGetMetadata(self):
     """Metadata must be retrievable even though createMutations has not been called."""
     creator = MafliteInputMutationCreator(
         "testdata/maflite/tiny_maflite.maf.txt")
     expected_keys = {
         'build', 'chr', 'start', 'end', 'ref_allele', 'alt_allele',
         'tumor_barcode', 'normal_barcode', 'tumor_f', 'init_t_lod',
         't_lod_fstar', 't_alt_count', 't_ref_count', 'judgement',
     }
     actual_keys = set(creator.getMetadata().keys())
     # Any key present on only one side (missing or unexpected) is a failure.
     mismatched = expected_keys ^ actual_keys
     failure_msg = ("Missing keys that should have been seen in the metadata: " +
                    str(mismatched))
     self.assertTrue(not mismatched, failure_msg)
    def testCreationAndAnnotation(self):
        """Create the UniProt natural-variation datasource and run one simple annotation.

        The single rendered mutation must carry both expected natural
        variation entries, in any order.
        """
        colName = "UniProt_NatVar_natural_variations"
        outputFilename = 'out/genericGeneProteinPositionTest.out.tsv'

        gafDS = TestUtils.createTranscriptProviderDatasource(self.config)
        gppDS = DatasourceFactory.createDatasource("testdata/simple_uniprot_natvar/simple_uniprot_natvar.config", "testdata/simple_uniprot_natvar/")

        annotator = Annotator()
        annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/tiny_maflite_natvar.maf.tsv'))
        annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
        for datasource in (gafDS, gppDS):
            annotator.addDatasource(datasource)
        rendered_path = annotator.annotate()

        # Make sure that some values were populated
        self.assertTrue(os.path.exists(rendered_path))

        # Entries are "|"-separated; sort both sides so ordering is ignored.
        expected_entries = sorted("R -> RR (in EDMD2).|R -> Q (in EDMD2).".split("|"))
        num_rows = 0
        for row in GenericTsvReader(rendered_path):
            found_entries = sorted(row[colName].split("|"))
            self.assertTrue(found_entries == expected_entries, "Annotation value did not match: " + row[colName])
            num_rows += 1

        self.assertTrue(num_rows == 1, "Number of mutations incorrect (1): " + str(num_rows) )
Beispiel #26
0
    def testBasicAnnotation(self):
        """Annotate from a generic transcript-keyed TSV and confirm only the output headers."""
        # A gaf datasource is added first because it supplies the gene annotation.
        gaf_ds = TestUtils.createTranscriptProviderDatasource(
            config=self.config)
        transcript_ds = DatasourceFactory.createDatasource(
            "testdata/small_transcript_tsv_ds/small_transcript_tsv_ds.config",
            "testdata/small_transcript_tsv_ds/")

        annotator = Annotator()
        annotator.setInputCreator(
            MafliteInputMutationCreator(
                'testdata/maflite/Patient0.snp.maf.txt'))
        annotator.setOutputRenderer(
            SimpleOutputRenderer('out/genericTranscriptTest.out.tsv'))
        for datasource in (gaf_ds, transcript_ds):
            annotator.addDatasource(datasource)
        rendered_path = annotator.annotate()

        headers = GenericTsvReader(rendered_path).getFieldNames()
        # Both refseq columns must be present in the rendered header.
        for required in ("refseq_test_mRNA_Id", "refseq_test_prot_Id"):
            self.assertTrue(
                required in headers,
                required + " not found in headers: " + str(headers))
Beispiel #27
0
    def testMulticoreAnnotateFromChunkedFile(self):
        """Annotate a maflite in chunks across a process pool and check the rendered row count.

        Mutations are drawn from the input creator in rounds of ``numChunks``
        chunks holding up to ``chunkSize`` mutations each, every round is
        mapped over the pool, and the flattened results are rendered to a TSV
        that must contain 730 rows.
        """
        #TODO: Add unit test that Mutation data is pickle-able
        inputFile = "testdata/maflite/Patient0.snp.maf.txt"
        outputFile = "out/testGAFMulticorePatient0.snp.maf.txt"
        chunkSize = 200
        numChunks = 4

        gafDatasource = TestUtils.createGafDatasourceProxy(self.config)
        ic = MafliteInputMutationCreator(inputFile)
        oc = SimpleOutputRenderer(outputFile)

        # createChunks
        muts = ic.createMutations()

        allAnnotatedChunksFlat = []
        are_mutations_remaining = True
        p = LoggingPool(processes=numChunks)
        try:
            while are_mutations_remaining:
                chunks = []
                for _ in range(numChunks):
                    chunk = []
                    for _ in range(chunkSize):
                        try:
                            # Builtin next() works on Python 2 and 3 iterators.
                            chunk.append(next(muts))
                        except StopIteration:
                            # Input exhausted: finish this round, then stop.
                            are_mutations_remaining = False
                            break

                    # Each work item pairs a chunk with the datasource to apply.
                    chunks.append((chunk, gafDatasource))

                annotatedChunks = p.map(annotate_mutations_global, chunks)
                annotatedChunksFlat = self._flattenChunks(annotatedChunks)
                allAnnotatedChunksFlat.append(annotatedChunksFlat)
        finally:
            # Always shut the pool down, even when annotation raises.
            p.close()
            p.join()

        annotatedMuts = chain.from_iterable(allAnnotatedChunksFlat)

        oc.renderMutations(annotatedMuts, Metadata())
        ctr = sum(1 for _ in GenericTsvReader(outputFile))
        self.assertTrue(ctr == 730, "Should have read 730 variants, but read " + str(ctr))
Beispiel #28
0
    def testMulticoreAnnotateFromChunkedFile(self):
        """Annotate a maflite in chunks across a process pool and check the rendered row count.

        Mutations are drawn from the input creator in rounds of ``numChunks``
        chunks holding up to ``chunkSize`` mutations each, every round is
        mapped over the pool, and the flattened results are rendered to a TSV
        that must contain 730 rows.
        """
        #TODO: Add unit test that Mutation data is pickle-able
        inputFile = "testdata/maflite/Patient0.snp.maf.txt"
        outputFile = "out/testGAFMulticorePatient0.snp.maf.txt"
        chunkSize = 200
        numChunks = 4

        gafDatasource = TestUtils.createGafDatasourceProxy(self.config)
        ic = MafliteInputMutationCreator(inputFile)
        oc = SimpleOutputRenderer(outputFile)

        # createChunks
        muts = ic.createMutations()

        allAnnotatedChunksFlat = []
        are_mutations_remaining = True
        p = LoggingPool(processes=numChunks)
        try:
            while are_mutations_remaining:
                chunks = []
                for _ in range(numChunks):
                    chunk = []
                    for _ in range(chunkSize):
                        try:
                            # Builtin next() works on Python 2 and 3 iterators.
                            chunk.append(next(muts))
                        except StopIteration:
                            # Input exhausted: finish this round, then stop.
                            are_mutations_remaining = False
                            break

                    # Each work item pairs a chunk with the datasource to apply.
                    chunks.append((chunk, gafDatasource))

                annotatedChunks = p.map(annotate_mutations_global, chunks)
                annotatedChunksFlat = self._flattenChunks(annotatedChunks)
                allAnnotatedChunksFlat.append(annotatedChunksFlat)
        finally:
            # Always shut the pool down, even when annotation raises.
            p.close()
            p.join()

        annotatedMuts = chain.from_iterable(allAnnotatedChunksFlat)

        oc.renderMutations(annotatedMuts, Metadata())
        ctr = sum(1 for _ in GenericTsvReader(outputFile))
        self.assertTrue(ctr == 730,
                        "Should have read 730 variants, but read " + str(ctr))
Beispiel #29
0
    def testNoLostMutations(self):
        """Annotate with the gaf datasource and verify that no mutations are lost."""
        inputFilename = 'testdata/maflite/Patient0.snp.maf.txt'
        inputCreator = MafliteInputMutationCreator(
            inputFilename, "configs/maflite_input.config")
        gafDatasource = TestUtils.createTranscriptProviderDatasource(
            self.config)

        # Count lines with a context manager so the handle is closed; the
        # "- 1" discounts the single header line.
        with open(inputFilename, 'r') as fp:
            numMutsInput = sum(1 for _ in fp) - 1

        ctr = 0
        for m in inputCreator.createMutations():
            m = gafDatasource.annotate_mutation(m)
            MutUtils.validateMutation(m)
            ctr += 1
        self.assertEqual(ctr, numMutsInput,
                         "Gaf data source altered mutation count.")
Beispiel #30
0
 def testChrGLs(self):
     """Mutations on unaligned transcripts (chromosome = GL.....) must annotate and get a gene."""
     creator = MafliteInputMutationCreator(
         'testdata/maflite/chrGLs.maf.tsv', "configs/maflite_input.config")
     datasource = TestUtils.createTranscriptProviderDatasource(
         self.config)
     for mutation in creator.createMutations():
         try:
             mutation = datasource.annotate_mutation(mutation)
             MutUtils.validateMutation(mutation)
         except Exception as err:
             # Any exception here is a test failure; attach the traceback for diagnosis.
             failure_msg = ("Erroneous exception was thrown: " + str(err) + "\n" +
                            traceback.format_exc())
             self.assertTrue(False, failure_msg)
         gene_name = mutation['gene']
         self.assertTrue(gene_name != '')
    def testBasicAnnotation(self):
        """Annotate from a basic gene-keyed TSV datasource.

        The gaf datasource is added first because the gene annotation must be
        populated before the TSV datasource can match on it.  The TSV is a
        trimmed CancerGeneCensus table.  The test checks that the CGC columns
        appear in the header and that rows for known genes carry a GeneID.
        """

        # cut -f 1 oncotator/test/testdata/small_tsv_ds/CancerGeneCensus_Table_1_full_2012-03-15_trim.txt | egrep -v Symbol | sed -r "s/^/'/g" | sed ':a;N;$!ba;s/\n/,/g' | sed -r "s/,'/','/g"
        # Kept as a set: it is only ever used for per-row membership checks.
        genesAvailable = {
            'ABL1', 'ABL2', 'ACSL3', 'AF15Q14', 'AF1Q', 'AF3p21', 'AF5q31',
            'AKAP9', 'AKT1', 'AKT2', 'ALDH2', 'ALK', 'ALO17', 'APC',
            'ARHGEF12', 'ARHH', 'ARID1A', 'ARID2', 'ARNT', 'ASPSCR1', 'ASXL1',
            'ATF1', 'ATIC', 'ATM', 'ATRX', 'BAP1', 'BCL10', 'BCL11A', 'BCL11B',
        }

        # We need a gaf data source to annotate gene

        gafDatasource = TestUtils.createTranscriptProviderDatasource(
            config=self.config)
        geneDS = DatasourceFactory.createDatasource(
            "testdata/small_tsv_ds/small_tsv_ds.config",
            "testdata/small_tsv_ds/")
        outputFilename = 'out/genericGeneTest.out.tsv'

        annotator = Annotator()
        annotator.setInputCreator(
            MafliteInputMutationCreator(
                'testdata/maflite/Patient0.snp.maf.txt'))
        annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
        annotator.addDatasource(gafDatasource)
        annotator.addDatasource(geneDS)
        annotator.annotate()

        # Check that there were actual annotations performed.
        tsvReader = GenericTsvReader(outputFilename)

        fields = tsvReader.getFieldNames()
        # Each failure message names exactly the column that was checked.
        self.assertTrue(
            'CGC_Abridged_Other Syndrome/Disease' in fields,
            "'CGC_Abridged_Other Syndrome/Disease' was not present in the header")
        self.assertTrue(
            'CGC_Abridged_Mutation Type' in fields,
            "'CGC_Abridged_Mutation Type' was not present in the header")

        ctr = 1
        linesThatShouldBeAnnotated = 0
        for lineDict in tsvReader:
            self.assertTrue('gene' in lineDict)
            if lineDict['gene'] in genesAvailable:
                self.assertTrue(
                    lineDict['CGC_Abridged_GeneID'] != '',
                    "'CGC_Abridged_GeneID' was missing on a row that should have been populated.  Line: "
                    + str(ctr))
                linesThatShouldBeAnnotated += 1
            ctr += 1
        # Guard against a vacuous pass when no row matched a known gene.
        self.assertTrue((linesThatShouldBeAnnotated) > 0,
                        "Bad data -- cannot test missed detects.")
Beispiel #32
0
 def test_multi_sample_maflite(self):
     """Combine ONPs in a multi-sample maf that contains several unaligned ONPs."""
     combiner = OnpCombiner(MafliteInputMutationCreator('testdata/maflite/onp_combination.maf.txt'))
     combined = list(combiner.createMutations())
     # Tuples are passed through to the _tuples_to_MutationData helper unchanged.
     expected_tuples = [
         (1, 1, 1, "G", "A", "hg19"),
         (1, 1, 4, "GTTT", "CGAA", "hg19"),
         (1, 2, 3, "TT", "CC", "hg19"),
         (1, 12, 12, "T", "G", "hg19"),
         (2, 13, 13, "A", "G", "hg19"),
     ]
     expected = self._tuples_to_MutationData(expected_tuples)
     expected_pair_names = ['P2', 'P1', 'P3', 'P1', 'P1']
     self._assert_mutation_lists_equal(combined, expected)
     # Pair names are checked position by position against the combined output.
     for mutation, pair_name in zip(combined, expected_pair_names):
         self.assertEqual(mutation["Pair_Name"], pair_name)
    def testMafInput(self):
        """Render a TCGA VCF from an annotated TCGA MAF without adding any datasource."""
        outputFilename = "out/maf2tcgavcf.vcf"
        maf_creator = MafliteInputMutationCreator("testdata/maf/Patient1.snp.maf.annotated")
        vcf_renderer = TcgaVcfOutputRenderer(outputFilename)

        annotator = Annotator()
        annotator.setInputCreator(maf_creator)
        annotator.setOutputRenderer(vcf_renderer)
        annotator.setManualAnnotations(self._createManualAnnotations())
        annotator.annotate()

        # The VCF must exist and be non-empty.
        self.assertTrue(os.path.exists(outputFilename))
        size_in_bytes = os.stat(outputFilename).st_size
        self.assertTrue(size_in_bytes > 0, "Generated VCF file (" + outputFilename + ") is empty.")
 def testMissingRequiredHeaders(self):
     """Constructing an input creator on a maflite lacking required headers must raise.

     The exception text must mention each of the missing columns
     (alt_allele, end, ref_allele).
     """
     try:
         MafliteInputMutationCreator(
             "testdata/maflite/brokenMaflite.tsv", None,
             'configs/maflite_input.config')
     except MafliteMissingRequiredHeaderException as e:
         error_text = str(e)
         missingCols = ['alt_allele', 'end', 'ref_allele']
         # Substring test with "in" replaces the find(...) <> -1 form,
         # whose "<>" operator does not exist in Python 3.
         self.assertTrue(
             all(c in error_text for c in missingCols),
             "Incorrect columns identified as missing in maflite check: " +
             error_text)
     else:
         # Reaching here means construction succeeded, which is the failure case.
         self.fail(" Exception was not thrown")
    def testFullSnpVcf(self):
        """Run SNP call stats (maflite) through the annotator into a TCGA VCF.

        Only verifies that the output file was created.
        """
        outputFilename = "out/TCGAVCFTest.snp.vcf"
        call_stats_creator = MafliteInputMutationCreator("testdata/Test.call_stats.trim.txt")
        vcf_renderer = TcgaVcfOutputRenderer(outputFilename)
        test_datasources = self._createDatasourcesForTesting()

        annotator = Annotator()
        annotator.setInputCreator(call_stats_creator)
        annotator.setOutputRenderer(vcf_renderer)
        annotator.setManualAnnotations(self._createManualAnnotations())
        for datasource in test_datasources:
            annotator.addDatasource(datasource)
        annotator.annotate()

        vcf_was_written = os.path.exists(outputFilename)
        self.assertTrue(vcf_was_written)
Beispiel #36
0
    def testTCGAMAFAsInputAndQuickAnnotate(self):
        """Read a TCGA MAF through the maflite reader, annotate it, and render a TCGA MAF.

        Checks that the output is non-empty, carries a '#version' comment,
        exposes the two new i_TJ_Data_* columns, and has exactly two more
        lines (rows plus comments) than the input.
        """
        inputFilename = "testdata/maf/Patient0.maf.annotated"
        input_creator = MafliteInputMutationCreator(
            inputFilename, 'configs/maflite_input.config')
        outputFilename = "out/testTCGAMAFAsInputAndQuickAnnotate.tsv"
        maf_renderer = TcgaMafOutputRenderer(
            outputFilename, 'configs/tcgaMAF2.4_output.config')

        pipeline = Annotator()
        pipeline.setInputCreator(input_creator)
        pipeline.setOutputRenderer(maf_renderer)
        gene_ds = DatasourceFactory.createDatasource(
            "testdata/thaga_janakari_gene_ds/hg19/tj_data.config",
            "testdata/thaga_janakari_gene_ds/hg19/")
        pipeline.addDatasource(gene_ds)
        pipeline.annotate()

        output_size = os.stat(outputFilename).st_size
        self.assertTrue(
            output_size > 0,
            "Generated MAF file (" + outputFilename + ") is empty.")

        reader_in = GenericTsvReader(inputFilename)
        reader_out = GenericTsvReader(outputFilename)

        self.assertTrue('#version' in reader_out.getComments(),
                        "First line did not specify a version number")
        expected_fields = (
            ("i_TJ_Data_Why", "New field missing (i_TJ_Data_Why) from header"),
            ("i_TJ_Data_Who", "New field missing (i_TJ_Data_Who) from header"),
        )
        for field_name, failure_message in expected_fields:
            self.assertTrue(field_name in reader_out.getFieldNames(),
                            failure_message)

        # Count data rows first (output, then input), then add comment lines.
        lines_out = sum(1 for _ in reader_out)
        lines_in = sum(1 for _ in reader_in)
        lines_in += len(reader_in.getCommentsAsList())
        lines_out += len(reader_out.getCommentsAsList())

        self.assertTrue(
            lines_out == (lines_in + 2),
            "Output file should have same number of lines plus two (for maf version and Oncotator version comments) as input file.  (In,Out): "
            + str(lines_in) + ", " + str(lines_out))
    def testDoubleAnnotationError(self):
        """Annotate a maf that used to trigger a duplicate annotation exception.

        No exception of any kind should be raised, and the file name returned
        by annotate() must exist afterwards.
        """
        outputFilename = 'out/genericGenomePositionDoubleAnnotationTest.out.tsv'

        position_ds = DatasourceFactory.createDatasource(
            "testdata/small_genome_position_tsv_ds/oreganno_trim.config",
            "testdata/small_genome_position_tsv_ds/")

        pipeline = Annotator()
        input_creator = MafliteInputMutationCreator(
            'testdata/maflite/testDoubleAnnotate.maf.tsv')
        pipeline.setInputCreator(input_creator)
        renderer = SimpleOutputRenderer(outputFilename)
        pipeline.setOutputRenderer(renderer)
        pipeline.addDatasource(position_ds)
        rendered_path = pipeline.annotate()

        # Reaching this point without an exception is the main check.
        self.assertTrue(os.path.exists(rendered_path))
    def testAnotherFullSNP(self):
        """Run a second SNP call stats file through TCGA VCF rendering.

        The run must finish without raising and leave a non-empty VCF on disk.
        """
        call_stats_path = "testdata/maflite/Another.call_stats.txt"
        vcf_path = "out/Another.call_stats.out.vcf"
        input_creator = MafliteInputMutationCreator(call_stats_path)
        vcf_renderer = TcgaVcfOutputRenderer(vcf_path)
        test_datasources = self._createDatasourcesForTesting()

        pipeline = Annotator()
        pipeline.setInputCreator(input_creator)
        pipeline.setOutputRenderer(vcf_renderer)
        pipeline.setManualAnnotations(self._createManualAnnotations())
        for datasource in test_datasources:
            pipeline.addDatasource(datasource)
        pipeline.annotate()

        self.assertTrue(os.path.exists(vcf_path))
        vcf_size = os.stat(vcf_path).st_size
        self.assertTrue(vcf_size > 0, "Generated VCF file (" + vcf_path + ") is empty.")
    def testEmptyInput(self):
        """Render a VCF from an empty maflite.

        Even with no input mutations the output file must exist and be
        non-empty.
        """
        maflite_path = "testdata/maflite/empty.maflite"
        vcf_path = "out/empty.vcf"
        input_creator = MafliteInputMutationCreator(maflite_path)
        vcf_renderer = TcgaVcfOutputRenderer(vcf_path)
        test_datasources = self._createDatasourcesForTesting()

        pipeline = Annotator()
        pipeline.setInputCreator(input_creator)
        pipeline.setOutputRenderer(vcf_renderer)
        pipeline.setManualAnnotations(self._createManualAnnotations())
        for datasource in test_datasources:
            pipeline.addDatasource(datasource)
        pipeline.annotate()

        self.assertTrue(os.path.exists(vcf_path))
        vcf_size = os.stat(vcf_path).st_size
        self.assertTrue(vcf_size > 0, "Generated VCF file (" + vcf_path + ") is empty.")
    def test_simple_seg_file_annotations(self):
        """Test that we can read in a seg file, do GENCODE annotation, and output as SIMPLE_TSV.

        The input is read twice: once to confirm it holds exactly 27 segments,
        and once to annotate each segment with the test GENCODE v19 datasource
        and render the result.  The rendered TSV must keep the seg columns,
        have non-blank 'start' and 'end' values, and carry a 'genes' column.
        """
        inputFilename = "testdata/seg/Patient0.seg.txt"
        output_filename = "out/test_simple_seg_file_annotations.tsv"
        seg_config = 'configs/seg_file_input.config'
        if os.path.exists(output_filename):
            os.remove(output_filename)

        # Count directly rather than via a leftover enumerate() index, so an
        # empty input is reported as 0 segments instead of a bogus value.
        ic = MafliteInputMutationCreator(inputFilename, None, seg_config)
        num_segs = sum(1 for _ in ic.createMutations())
        self.assertEqual(
            num_segs, 27,
            "Found %d segments when there should have been 27." % num_segs)

        # A fresh creator is built for the second pass (presumably because
        # createMutations() yields a one-shot iterator -- confirm).
        ic = MafliteInputMutationCreator(inputFilename, None, seg_config)
        segs = ic.createMutations()

        gencode_ds = TestUtils._create_test_gencode_v19_ds(
            "out/seg_file_gencode_ds")
        segs_annotated = [gencode_ds.annotate_segment(seg) for seg in segs]

        outputRenderer = SimpleOutputRenderer(output_filename, '')
        outputRenderer.renderMutations(iter(segs_annotated))

        # Now check the output
        output_reader = GenericTsvReader(output_filename)

        required_cols = ["Sample", "Num_Probes", "Segment_Mean"]
        headers = output_reader.getFieldNames()
        for rcol in required_cols:
            self.assertIn(rcol, headers)

        for line_dict in output_reader:
            for position_col in ('start', 'end'):
                self.assertIsNotNone(line_dict[position_col])
                self.assertNotEqual(line_dict[position_col].strip(), "")
            self.assertIn("genes", line_dict)
    def testBasicAnnotation(self):
        """Annotate from a basic tsv of genomic positions.

        Covers both single- and multiple-nucleotide variants.  The tsv
        datasource is already installed (i.e. its config file exists).
        Output rows come in groups of three: the first two must carry the
        ORegAnno id 'OREG0013034' and the third must be blank.
        """
        outputFilename = 'out/genericGenomePositionTest.out.tsv'

        position_ds = DatasourceFactory.createDatasource(
            "testdata/small_genome_position_tsv_ds/oreganno_trim.config",
            "testdata/small_genome_position_tsv_ds/")

        pipeline = Annotator()
        input_creator = MafliteInputMutationCreator(
            'testdata/maflite/tiny_maflite.maf.txt')
        pipeline.setInputCreator(input_creator)
        renderer = SimpleOutputRenderer(outputFilename)
        pipeline.setOutputRenderer(renderer)
        pipeline.addDatasource(position_ds)
        rendered_path = pipeline.annotate()

        # Make sure that some values were populated
        self.assertTrue(os.path.exists(rendered_path))
        rows = GenericTsvReader(rendered_path)

        # Two overlap, one does not.  Repeat...
        for row_number, row in enumerate(rows, start=1):
            oreg_id = row["ORegAnno_hg19.oreganno.id"]
            row_label = "Line " + str(row_number)
            if row_number % 3 != 0:
                self.assertFalse(
                    oreg_id == '',
                    row_label + " should not have had blank value, but did.")
                self.assertTrue(
                    oreg_id == 'OREG0013034',
                    row_label + " did not have correct value: " + oreg_id)
            else:
                self.assertTrue(
                    oreg_id == '',
                    row_label + " should have had blank value, but did not: " +
                    oreg_id)
 def testChromosomeM(self):
     """Check that every mutation read from the chrM maflite reports chromosome "M", not "MT"."""
     creator = MafliteInputMutationCreator("testdata/maflite/chrM.maf.txt", 'configs/maflite_input.config')
     for mutation in creator.createMutations():
         chrom = mutation.chr
         self.assertTrue(chrom == "M", "mitochondria chromosome should be M, not " + chrom)