def testBasicAnnotation(self):
    '''
    Annotate tiny_maflite with the ORegAnno genome-position TSV datasource.

    The input covers both single- and multiple-nucleotide variants and the
    datasource config file already exists on disk. Output rows are checked in
    groups of three: the first two must carry the ORegAnno id OREG0013034 and
    the third must be blank.
    '''
    idColumn = "ORegAnno_hg19.oreganno.id"
    positionDS = DatasourceFactory.createDatasource(
        "testdata/small_genome_position_tsv_ds/oreganno_trim.config",
        "testdata/small_genome_position_tsv_ds/")

    annotator = Annotator()
    annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/tiny_maflite.maf.txt'))
    annotator.setOutputRenderer(SimpleOutputRenderer('out/genericGenomePositionTest.out.tsv'))
    annotator.addDatasource(positionDS)
    renderedFilename = annotator.annotate()

    # The rendered file has to exist before its rows can be inspected.
    self.assertTrue(os.path.exists(renderedFilename))

    # Two overlap, one does not. Repeat...
    for lineNumber, row in enumerate(GenericTsvReader(renderedFilename), 1):
        if lineNumber % 3 != 0:
            self.assertFalse(
                row[idColumn] == '',
                "Line " + str(lineNumber) + " should not have had blank value, but did.")
            self.assertTrue(
                row[idColumn] == 'OREG0013034',
                "Line " + str(lineNumber) + " did not have correct value: " + row[idColumn])
        else:
            self.assertTrue(
                row[idColumn] == '',
                "Line " + str(lineNumber) + " should have had blank value, but did not: " + row[idColumn])
def testBasicAnnotation(self):
    '''
    Annotate Patient0 SNPs with a transcript-keyed generic TSV datasource.

    Only the output header is inspected: both refseq_test columns must be
    present.
    '''
    # A gaf (transcript provider) datasource is needed to annotate the gene.
    transcriptProvider = TestUtils.createTranscriptProviderDatasource(config=self.config)
    tsvDatasource = DatasourceFactory.createDatasource(
        "testdata/small_transcript_tsv_ds/small_transcript_tsv_ds.config",
        "testdata/small_transcript_tsv_ds/")

    annotator = Annotator()
    annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/Patient0.snp.maf.txt'))
    annotator.setOutputRenderer(SimpleOutputRenderer('out/genericTranscriptTest.out.tsv'))
    for datasource in (transcriptProvider, tsvDatasource):
        annotator.addDatasource(datasource)

    # annotate() returns the name of the file that was actually written.
    headers = GenericTsvReader(annotator.annotate()).getFieldNames()
    for expectedHeader in ("refseq_test_mRNA_Id", "refseq_test_prot_Id"):
        self.assertTrue(
            expectedHeader in headers,
            expectedHeader + " not found in headers: " + str(headers))
def testFullIndelVcf(self):
    """
    Perform test of an indel maflite all the way through TCGA VCF creation.

    After rendering, every mutation of the maflite is looked up in the VCF by
    position. A deletion (alt allele "", "-" or ".") must appear one base
    before its maflite start, e.g. 1:36643701 in the maflite becomes
    1:36643700 in the VCF; any other variant must keep its maflite start.
    """
    inputFilename = "testdata/maflite/Patient0.indel.maf.txt"
    outputFilename = "out/TCGAVCFTest.indel.vcf"

    callStatsIn = MafliteInputMutationCreator(inputFilename)
    vcfOR = TcgaVcfOutputRenderer(outputFilename)
    datasources = self._createDatasourcesForTesting()

    annotator = Annotator()
    annotator.setInputCreator(callStatsIn)
    annotator.setOutputRenderer(vcfOR)
    annotator.setManualAnnotations(self._createManualAnnotations())
    for ds in datasources:
        annotator.addDatasource(ds)
    annotator.annotate()

    self.assertTrue(os.path.exists(outputFilename))

    # A second input creator re-reads the same maflite for the comparison.
    maflite_ic = MafliteInputMutationCreator(inputFilename)
    muts = maflite_ic.createMutations()

    # Collect the positions inside a context manager so the VCF handle is
    # closed even when parsing raises.
    with open(outputFilename, 'r') as vcf_fp:
        vcf_pos = set(int(rec.POS) for rec in vcf.Reader(vcf_fp))

    for m in muts:
        # str() keeps the failure message buildable whatever type the
        # mutation stores for chr/start.
        location = str(m.chr) + ":" + str(m.start)
        is_variant_deletion = m.alt_allele in ("", "-", ".")
        if is_variant_deletion:
            self.assertTrue((int(m.start) - 1) in vcf_pos, "Deletion was not correct for " + location)
        else:
            self.assertTrue(int(m.start) in vcf_pos, "Insertion was not correct for " + location)
def testCreationAndAnnotation(self):
    """
    Create the UniProt natural-variation datasource and annotate with it.

    Exactly one output row is expected, and its natural-variation column must
    hold both expected entries ('|' separated, order ignored).
    """
    natVarColumn = "UniProt_NatVar_natural_variations"
    expectedEntries = sorted("R -> RR (in EDMD2).|R -> Q (in EDMD2).".split("|"))

    transcriptProvider = TestUtils.createTranscriptProviderDatasource(self.config)
    natVarDS = DatasourceFactory.createDatasource(
        "testdata/simple_uniprot_natvar/simple_uniprot_natvar.config",
        "testdata/simple_uniprot_natvar/")

    annotator = Annotator()
    annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/tiny_maflite_natvar.maf.tsv'))
    annotator.setOutputRenderer(SimpleOutputRenderer('out/genericGeneProteinPositionTest.out.tsv'))
    annotator.addDatasource(transcriptProvider)
    annotator.addDatasource(natVarDS)
    renderedFilename = annotator.annotate()

    # The rendered file has to exist before its rows can be inspected.
    self.assertTrue(os.path.exists(renderedFilename))

    rowCount = 0
    for row in GenericTsvReader(renderedFilename):
        observedEntries = sorted(row[natVarColumn].split("|"))
        self.assertTrue(
            observedEntries == expectedEntries,
            "Annotation value did not match: " + row[natVarColumn])
        rowCount += 1
    self.assertTrue(rowCount == 1, "Number of mutations incorrect (1): " + str(rowCount))
def testBlankAnnotatorInit(self):
    """
    Test an extremely simple scenario, where no additional annotations are
    needed. I.e. no data sources.

    The rendered file must contain one line per variant per sample plus
    three extra lines (one header line and two comment lines).
    """
    self.logger.info("Starting Blank Annotator Init Test...")

    inputCreator = MafliteInputMutationCreator('testdata/maflite/tiny_maflite.maf.txt')
    outputRenderer = SimpleOutputRenderer("out/testBlankAnnotatorTestFile.tsv")

    # Wire the input creator and output renderer into a fresh Annotator.
    annotator = Annotator()
    annotator.setInputCreator(inputCreator)
    annotator.setOutputRenderer(outputRenderer)
    testOutputFilename = annotator.annotate()

    # Expected number of lines in the rendered file.
    numSamples = 1
    numExtraLines = 3  # one for header, two for comment lines
    numDoubleLines = 0  # Number of lines with two alt alleles
    numVariants = 9
    gt = numSamples * numVariants + numDoubleLines * numSamples + numExtraLines

    # open() in a context manager replaces the Python-2-only file() builtin
    # and closes the handle even if reading raises.
    with open(testOutputFilename, 'r') as fp:
        ctr = sum(1 for _ in fp)

    self.assertEqual(ctr, gt, "Number of lines read was not correct: " + str(ctr) + " -- should have been: " + str(gt))
def testManualAnnotations(self):
    """
    Check that manually supplied annotations reach every rendered row.

    Each override key must hold exactly the override value on each output
    line, which also rules out blank and "__UNKNOWN__" values.
    """
    annotator = Annotator()
    manualValues = {'source': 'Capture', 'status': 'Somatic', 'phase': 'Phase_I', 'sequencer': 'Illumina GAIIx'}
    annotator.setManualAnnotations(manualValues)

    mafliteInput = MafliteInputMutationCreator('testdata/maflite/Patient0.snp.maf.txt')
    tsvRenderer = SimpleOutputRenderer("out/testManualAnnotationsFile.tsv")
    annotator.setInputCreator(mafliteInput)
    annotator.setOutputRenderer(tsvRenderer)
    renderedFilename = annotator.annotate()

    checkedKeys = manualValues.keys()
    self.assertTrue(
        os.stat(renderedFilename).st_size > 0,
        "Generated TSV file (" + renderedFilename + ") is empty.")

    for lineNumber, row in enumerate(GenericTsvReader(renderedFilename), 1):
        lineText = str(lineNumber)
        for key in checkedKeys:
            self.assertTrue(
                row[key] != "__UNKNOWN__",
                "__UNKNOWN__ value seen on line " + lineText + ", when it should be populated: " + key)
            self.assertTrue(
                row[key] != "",
                "Blank value seen on line " + lineText + ", when it should be populated: " + key)
            self.assertTrue(
                row[key] == manualValues[key],
                "Value for " + key + " on line " + lineText + " did not match override: "
                + str(row[key]) + " <> " + str(manualValues[key]))
def testSNPsAndIndelStartAndEndPos(self):
    """
    Check start/end positions of SNPs and indels read from a VCF against the
    NCI MAF specification
    (https://wiki.nci.nih.gov/display/TCGA/Mutation+Annotation+Format+(MAF)+Specification).

    Only rows whose start matches one of three known positions are checked;
    all other rows are ignored.
    """
    vcfPath = os.path.join("testdata", "vcf", "example.snps.indels.vcf")
    tsvPath = os.path.join("out", "example.snps.indels.out.tsv")

    creator = VcfInputMutationCreator(vcfPath)
    creator.createMutations()
    renderer = SimpleOutputRenderer(tsvPath)
    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    annotator.annotate()

    # Known start position -> end position the rendered row must report.
    expectedEndByStart = {
        "16890445": "16890445",
        "154524458": "154524459",
        "114189432": "114189433",
    }
    for row in GenericTsvReader(tsvPath):
        expectedEnd = expectedEndByStart.get(row['start'])
        if expectedEnd is None:
            continue
        self.assertEqual(
            row["end"], expectedEnd,
            "The value should be %s but it was %s." % (expectedEnd, row["end"]))
def testTCGAMAFAsInputAndQuickAnnotate(self):
    """
    Read a TCGA MAF through the maflite input creator, annotate it with one
    gene datasource, and render it as a TCGA MAF again.

    The output must be non-empty, mention '#version' in its comments, expose
    both new i_TJ_Data columns, and be exactly two lines longer than the
    input (data rows plus comment lines).
    """
    inputFilename = "testdata/maf/Patient0.maf.annotated"
    mafInput = MafliteInputMutationCreator(inputFilename, 'configs/maflite_input.config')
    outputFilename = "out/testTCGAMAFAsInputAndQuickAnnotate.tsv"
    mafRenderer = TcgaMafOutputRenderer(outputFilename, 'configs/tcgaMAF2.4_output.config')

    annotator = Annotator()
    annotator.setInputCreator(mafInput)
    annotator.setOutputRenderer(mafRenderer)
    annotator.addDatasource(DatasourceFactory.createDatasource(
        "testdata/thaga_janakari_gene_ds/hg19/tj_data.config",
        "testdata/thaga_janakari_gene_ds/hg19/"))
    annotator.annotate()

    self.assertTrue(
        os.stat(outputFilename).st_size > 0,
        "Generated MAF file (" + outputFilename + ") is empty.")

    inputReader = GenericTsvReader(inputFilename)
    outputReader = GenericTsvReader(outputFilename)

    self.assertTrue(
        outputReader.getComments().find('#version') != -1,
        "First line did not specify a version number")
    for newField in ("i_TJ_Data_Why", "i_TJ_Data_Who"):
        self.assertTrue(
            newField in outputReader.getFieldNames(),
            "New field missing (" + newField + ") from header")

    # Line totals are data rows plus comment lines, for each file.
    outputLines = sum(1 for _ in outputReader)
    inputLines = sum(1 for _ in inputReader)
    inputLines += len(inputReader.getCommentsAsList())
    outputLines += len(outputReader.getCommentsAsList())

    self.assertTrue(
        outputLines == (inputLines + 2),
        "Output file should have same number of lines plus two (for maf version and Oncotator version comments) as input file. (In,Out): "
        + str(inputLines) + ", " + str(outputLines))
def testDuplicateAnnotation(self):
    """
    Tests that the duplicate annotations are parsed correctly.

    Both variant_status and sample_variant_status must appear in the header
    and in the first row, with values "2" and "0" respectively.
    """
    vcfPath = os.path.join("testdata", "vcf", "example.duplicate_annotation.vcf")
    tsvPath = os.path.join("out", "example.duplicate_annotation.out.tsv")

    creator = VcfInputMutationCreator(vcfPath)
    creator.createMutations()
    renderer = SimpleOutputRenderer(tsvPath)
    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    annotator.annotate()

    reader = GenericTsvReader(tsvPath)

    header = reader.getFieldNames()
    headerChecks = (
        ("variant_status", "variant_status field is missing in the header."),
        ("sample_variant_status", "sample_variant_status is missing in the header."),
    )
    for fieldName, failureText in headerChecks:
        self.assertTrue(fieldName in header, failureText)

    # Only the first data row is examined.
    firstRow = reader.next()
    rowChecks = (
        ("variant_status", "variant_status field is missing in the row."),
        ("sample_variant_status", "sample_variant_status is missing in the row."),
    )
    for fieldName, failureText in rowChecks:
        self.assertTrue(fieldName in firstRow, failureText)

    self.assertEqual("2", firstRow["variant_status"], "Incorrect value of variant_status.")
    self.assertEqual("0", firstRow["sample_variant_status"], "Incorrect value of sample_variant_status")
def testFullSnpVcf(self):
    """
    Perform test of a SNP call stats (maflite) all the way through TCGA VCF
    creation.

    Checks that the VCF file was created and that the records read back from
    it carry a QUAL value.
    """
    outputFilename = "out/TCGAVCFTest.snp.vcf"
    callStatsIn = MafliteInputMutationCreator("testdata/Test.call_stats.trim.txt")
    vcfOR = TcgaVcfOutputRenderer(outputFilename)
    datasources = self._createDatasourcesForTesting()

    annotator = Annotator()
    annotator.setInputCreator(callStatsIn)
    annotator.setOutputRenderer(vcfOR)
    annotator.setManualAnnotations(self._createManualAnnotations())
    for ds in datasources:
        annotator.addDatasource(ds)
    annotator.annotate()

    self.assertTrue(os.path.exists(outputFilename))

    # NOTE(review): the number of VCF records inspected is driven by the
    # *indel* maflite, not by the call-stats file annotated above. This looks
    # like a copy/paste from the indel test -- confirm the intended input.
    maflite_ic = MafliteInputMutationCreator("testdata/maflite/Patient0.indel.maf.txt")
    muts = maflite_ic.createMutations()

    # Keep the VCF handle in a context manager so it is closed when the
    # checks finish or an assertion fails.
    with open(outputFilename, 'r') as vcf_fp:
        vcf_reader = vcf.Reader(vcf_fp)
        for _ in muts:
            rec = vcf_reader.next()
            # All records should have QUAL with a value (i.e. NOT ".")
            self.assertIsNotNone(rec.QUAL)
def testBlankAnnotatorInit(self):
    """
    Test an extremely simple scenario, where no additional annotations are
    needed. I.e. no data sources.

    The rendered file must contain one line per variant per sample plus
    three extra lines (one header line and two comment lines).
    """
    self.logger.info("Starting Blank Annotator Init Test...")

    inputCreator = MafliteInputMutationCreator(
        'testdata/maflite/tiny_maflite.maf.txt')
    outputRenderer = SimpleOutputRenderer(
        "out/testBlankAnnotatorTestFile.tsv")

    # Wire the input creator and output renderer into a fresh Annotator.
    annotator = Annotator()
    annotator.setInputCreator(inputCreator)
    annotator.setOutputRenderer(outputRenderer)
    testOutputFilename = annotator.annotate()

    # Expected number of lines in the rendered file.
    numSamples = 1
    numExtraLines = 3  # one for header, two for comment lines
    numDoubleLines = 0  # Number of lines with two alt alleles
    numVariants = 9
    gt = numSamples * numVariants + numDoubleLines * numSamples + numExtraLines

    # open() in a context manager replaces the Python-2-only file() builtin
    # and closes the handle even if reading raises.
    with open(testOutputFilename, 'r') as fp:
        ctr = sum(1 for _ in fp)

    self.assertEqual(
        ctr, gt,
        "Number of lines read was not correct: " + str(ctr) +
        " -- should have been: " + str(gt))
def testBasicAnnotation(self):
    '''
    Annotate from a basic tsv gene file, using a trimmed CancerGeneCensus
    table as the basis for the test.

    The Gaf (transcript) datasource is added before the gene tsv because the
    gene annotation must be populated first. The test checks that the
    CGC_Abridged columns reach the header and that every row whose gene is in
    the trimmed table has a non-blank CGC_Abridged_GeneID.
    '''
    # Gene list generated with:
    # cut -f 1 oncotator/test/testdata/small_tsv_ds/CancerGeneCensus_Table_1_full_2012-03-15_trim.txt | egrep -v Symbol | sed -r "s/^/'/g" | sed ':a;N;$!ba;s/\n/,/g' | sed -r "s/,'/','/g"
    genesAvailable = frozenset([
        'ABL1', 'ABL2', 'ACSL3', 'AF15Q14', 'AF1Q', 'AF3p21', 'AF5q31',
        'AKAP9', 'AKT1', 'AKT2', 'ALDH2', 'ALK', 'ALO17', 'APC', 'ARHGEF12',
        'ARHH', 'ARID1A', 'ARID2', 'ARNT', 'ASPSCR1', 'ASXL1', 'ATF1',
        'ATIC', 'ATM', 'ATRX', 'BAP1', 'BCL10', 'BCL11A', 'BCL11B'
    ])

    # We need a gaf data source to annotate gene
    gafDatasource = TestUtils.createTranscriptProviderDatasource(
        config=self.config)
    geneDS = DatasourceFactory.createDatasource(
        "testdata/small_tsv_ds/small_tsv_ds.config",
        "testdata/small_tsv_ds/")
    outputFilename = 'out/genericGeneTest.out.tsv'

    annotator = Annotator()
    annotator.setInputCreator(
        MafliteInputMutationCreator('testdata/maflite/Patient0.snp.maf.txt'))
    annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
    annotator.addDatasource(gafDatasource)
    annotator.addDatasource(geneDS)
    annotator.annotate()

    # Check that there were actual annotations performed.
    tsvReader = GenericTsvReader(outputFilename)
    fields = tsvReader.getFieldNames()
    # Each message names the exact column that is being checked.
    self.assertTrue(
        'CGC_Abridged_Other Syndrome/Disease' in fields,
        "'CGC_Abridged_Other Syndrome/Disease' was not present in the header")
    self.assertTrue(
        'CGC_Abridged_Mutation Type' in fields,
        "'CGC_Abridged_Mutation Type' was not present in the header")

    linesThatShouldBeAnnotated = 0
    for ctr, lineDict in enumerate(tsvReader, 1):
        self.assertTrue('gene' in lineDict)
        if lineDict['gene'] in genesAvailable:
            self.assertTrue(
                lineDict['CGC_Abridged_GeneID'] != '',
                "'CGC_Abridged_GeneID' was missing on a row that should have been populated. Line: "
                + str(ctr))
            linesThatShouldBeAnnotated += 1

    # Without at least one matching gene the loop above verified nothing.
    self.assertTrue(linesThatShouldBeAnnotated > 0,
                    "Bad data -- cannot test missed detects.")
def testNumberGRenderingOfRandomVcf(self):
    """
    Annotate number_g.random.vcf and render it as a simple TSV.

    No assertions are made on the output; the test passes when annotation
    completes without raising.
    """
    vcfPath = os.path.join("testdata", "vcf", "number_g.random.vcf")
    tsvPath = os.path.join("out", "number_g.random.out.tsv")

    vcfInput = VcfInputMutationCreator(vcfPath)
    vcfInput.createMutations()
    tsvRenderer = SimpleOutputRenderer(tsvPath)

    annotator = Annotator()
    annotator.setInputCreator(vcfInput)
    annotator.setOutputRenderer(tsvRenderer)
    annotator.annotate()
def testAnnotationWithNoSampleNameExampleVcf(self):
    """
    Annotate a VCF file that has no samples and render it as a simple TSV.

    No assertions are made on the output; the test passes when annotation
    completes without raising.
    """
    vcfPath = os.path.join("testdata", "vcf", "example.sampleName.removed.vcf")
    tsvPath = os.path.join("out", "example.sampleName.removed.out.tsv")

    vcfInput = VcfInputMutationCreator(vcfPath)
    tsvRenderer = SimpleOutputRenderer(tsvPath)

    annotator = Annotator()
    annotator.setInputCreator(vcfInput)
    annotator.setOutputRenderer(tsvRenderer)
    annotator.annotate()
def testSwitchedFieldsWithExampleVcf(self):
    """
    Tests whether the switched tags are ignored.

    Builds the VCF input creator, the TSV renderer and the annotator for
    example.bad.switched.fields.vcf.
    """
    vcfPath = os.path.join("testdata", "vcf", "example.bad.switched.fields.vcf")
    tsvPath = os.path.join("out", "example.switched.out.tsv")

    vcfInput = VcfInputMutationCreator(vcfPath)
    vcfInput.createMutations()
    tsvRenderer = SimpleOutputRenderer(tsvPath, [])

    # NOTE(review): annotate() is never invoked here, so only construction
    # and wiring are exercised -- confirm whether that is intentional.
    annotator = Annotator()
    annotator.setInputCreator(vcfInput)
    annotator.setOutputRenderer(tsvRenderer)
def testAnnotationWithDuplicateValuesInVcf(self):
    """
    Annotate example.duplicate_fields.vcf, a VCF described as declaring
    fields with the same name in more than one section (INFO and FILTER).

    No assertions are made on the output; the test passes when annotation
    completes without raising.
    """
    vcfPath = os.path.join("testdata", "vcf", "example.duplicate_fields.vcf")
    tsvPath = os.path.join("out", "example.duplicate_fields2.tsv")

    vcfInput = VcfInputMutationCreator(vcfPath)
    vcfInput.createMutations()
    tsvRenderer = SimpleOutputRenderer(tsvPath, [])

    annotator = Annotator()
    annotator.setInputCreator(vcfInput)
    annotator.setOutputRenderer(tsvRenderer)
    annotator.annotate()
def testSimpleAnnotationWithAComplexVcf(self):
    """
    Annotate the rather complex random.vcf and render it as a simple TSV.

    No assertions are made on the output; the test passes when annotation
    completes without raising.
    """
    vcfPath = os.path.join("testdata", "vcf", "random.vcf")
    tsvPath = os.path.join("out", "random.tsv")

    vcfInput = VcfInputMutationCreator(vcfPath)
    vcfInput.createMutations()
    tsvRenderer = SimpleOutputRenderer(tsvPath, [])

    annotator = Annotator()
    annotator.setInputCreator(vcfInput)
    annotator.setOutputRenderer(tsvRenderer)
    annotator.annotate()
def testMafInput(self):
    """
    Render a TCGA VCF from a TCGA MAF without adding any datasources.

    The output file must exist and be non-empty.
    """
    mafPath = "testdata/maf/Patient1.snp.maf.annotated"
    vcfPath = "out/maf2tcgavcf.vcf"

    mafInput = MafliteInputMutationCreator(mafPath)
    vcfRenderer = TcgaVcfOutputRenderer(vcfPath)

    annotator = Annotator()
    annotator.setInputCreator(mafInput)
    annotator.setOutputRenderer(vcfRenderer)
    annotator.setManualAnnotations(self._createManualAnnotations())
    annotator.annotate()

    self.assertTrue(os.path.exists(vcfPath))
    self.assertTrue(
        os.stat(vcfPath).st_size > 0,
        "Generated VCF file (" + vcfPath + ") is empty.")
def testSimpleAnnotationWithExampleVcf(self):
    """
    Tests the ability to do a simple Gaf 3.0 annotation of example.vcf.

    No assertions are made on the output; the test passes when annotation
    completes without raising.
    """
    vcfPath = os.path.join("testdata", "vcf", "example.vcf")
    tsvPath = os.path.join("out", "simpleVCF.Gaf.annotated.out.tsv")

    vcfInput = VcfInputMutationCreator(vcfPath)
    vcfInput.createMutations()
    tsvRenderer = SimpleOutputRenderer(tsvPath, [])

    annotator = Annotator()
    annotator.setInputCreator(vcfInput)
    annotator.setOutputRenderer(tsvRenderer)
    transcriptProvider = TestUtils.createTranscriptProviderDatasource(self.config)
    annotator.addDatasource(transcriptProvider)
    annotator.annotate()
def testDoubleAnnotationError(self):
    '''
    Annotate a maf file that used to trigger a duplicate annotation
    exception; neither that nor any other exception may be raised.

    The only explicit check is that the rendered file exists.
    '''
    positionDS = DatasourceFactory.createDatasource(
        "testdata/small_genome_position_tsv_ds/oreganno_trim.config",
        "testdata/small_genome_position_tsv_ds/")

    annotator = Annotator()
    annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/testDoubleAnnotate.maf.tsv'))
    annotator.setOutputRenderer(SimpleOutputRenderer('out/genericGenomePositionDoubleAnnotationTest.out.tsv'))
    annotator.addDatasource(positionDS)
    renderedFilename = annotator.annotate()

    # annotate() returns the name of the file it wrote.
    self.assertTrue(os.path.exists(renderedFilename))
def testMissingFilter(self):
    """
    Tests that the missing FILTER fields are parsed correctly.

    The rendered TSV is compared with a stored expected TSV: both must have
    the same column names and the same number of rows, and within each
    column every cell must be equal or null in both files.
    """
    inputFilename = os.path.join("testdata", "vcf", "example.missing_filters.vcf")
    outputFilename = os.path.join("out", "example.missing_filters.out.tsv")
    expectedOutputFilename = os.path.join(
        "testdata", "vcf", "example.expected.missing_filters.out.tsv")

    creator = VcfInputMutationCreator(inputFilename)
    creator.createMutations()
    renderer = SimpleOutputRenderer(outputFilename)
    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    annotator.annotate()

    # The header row index handed to pandas is the number of comment lines
    # reported by the TSV reader for the rendered file.
    tsvReader = GenericTsvReader(outputFilename)
    current = pandas.read_csv(outputFilename, sep='\t',
                              header=len(tsvReader.getCommentsAsList()))
    expected = pandas.read_csv(expectedOutputFilename, sep='\t')

    currentColNames = set(current.columns)
    expectedColNames = set(expected.columns)

    # Compare by value: "is 0" tested object identity and only worked
    # through CPython's small-integer caching.
    self.assertTrue(
        len(currentColNames.symmetric_difference(expectedColNames)) == 0,
        "Should have the same columns")
    self.assertTrue(
        len(current.index) == len(expected.index),
        "Should have the same number of rows")

    for colName in currentColNames:
        sameValue = current[colName] == expected[colName]
        bothNull = pandas.isnull(current[colName]) & pandas.isnull(expected[colName])
        self.assertTrue(
            sum(sameValue | bothNull) == len(current.index),
            "Should have the same values in column " + colName)
def testAnnotationWithExampleVcf(self):
    """
    Tests whether parsed annotations match the actual annotations in a
    simple TSV.

    Missing format fields yield -->""   ".,." --> ","

    The rendered TSV is compared with a stored expected TSV: both must have
    the same column names and the same number of rows, and within each
    column every cell must be equal or null in both files.
    """
    inputFilename = os.path.join("testdata", "vcf", "example.vcf")
    outputFilename = os.path.join("out", "example.out.tsv")
    expectedOutputFilename = os.path.join(
        "testdata", "vcf", "example.expected.out.tsv")

    creator = VcfInputMutationCreator(inputFilename)
    creator.createMutations()
    renderer = SimpleOutputRenderer(outputFilename)
    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    annotator.annotate()

    # The header row index handed to pandas is the number of comment lines
    # reported by the TSV reader for the rendered file.
    tsvReader = GenericTsvReader(outputFilename)
    current = pandas.read_csv(outputFilename, sep='\t',
                              header=len(tsvReader.getCommentsAsList()))
    expected = pandas.read_csv(expectedOutputFilename, sep='\t')

    currentColNames = set(current.columns)
    expectedColNames = set(expected.columns)

    # Compare by value: "is 0" tested object identity and only worked
    # through CPython's small-integer caching.
    self.assertTrue(
        len(currentColNames.symmetric_difference(expectedColNames)) == 0,
        "Should have the same columns")
    self.assertTrue(
        len(current.index) == len(expected.index),
        "Should have the same number of rows")

    for colName in currentColNames:
        sameValue = current[colName] == expected[colName]
        bothNull = pandas.isnull(current[colName]) & pandas.isnull(expected[colName])
        self.assertTrue(
            sum(sameValue | bothNull) == len(current.index),
            "Should have the same values in column " + colName + ": \n" +
            str(current[colName]) + "\nvs\n" + str(expected[colName]))
def testFullSnpVcf(self):
    """
    Run SNP call stats (maflite) all the way through TCGA VCF creation.

    Only checks that the output file was created.
    """
    vcfPath = "out/TCGAVCFTest.snp.vcf"
    callStatsInput = MafliteInputMutationCreator("testdata/Test.call_stats.trim.txt")
    vcfRenderer = TcgaVcfOutputRenderer(vcfPath)
    testDatasources = self._createDatasourcesForTesting()

    annotator = Annotator()
    annotator.setInputCreator(callStatsInput)
    annotator.setOutputRenderer(vcfRenderer)
    annotator.setManualAnnotations(self._createManualAnnotations())
    for datasource in testDatasources:
        annotator.addDatasource(datasource)
    annotator.annotate()

    self.assertTrue(os.path.exists(vcfPath))
def testTCGAMAFAsInputAndQuickAnnotate(self):
    """
    Take in a TCGA MAF (using MAFLITE), annotate it with one gene
    datasource, and render it as a TCGA MAF again.

    The output must be non-empty, mention '#version' in its comments, expose
    both new i_TJ_Data columns, and be exactly two lines longer than the
    input (data rows plus comment lines).
    """
    inputFilename = "testdata/maf/Patient0.maf.annotated"
    mafInput = MafliteInputMutationCreator(inputFilename,
                                           'configs/maflite_input.config')
    outputFilename = "out/testTCGAMAFAsInputAndQuickAnnotate.tsv"
    mafRenderer = TcgaMafOutputRenderer(
        outputFilename, 'configs/tcgaMAF2.4_output.config')

    annotator = Annotator()
    annotator.setInputCreator(mafInput)
    annotator.setOutputRenderer(mafRenderer)
    geneDS = DatasourceFactory.createDatasource(
        "testdata/thaga_janakari_gene_ds/hg19/tj_data.config",
        "testdata/thaga_janakari_gene_ds/hg19/")
    annotator.addDatasource(geneDS)
    annotator.annotate()

    outputSize = os.stat(outputFilename).st_size
    self.assertTrue(
        outputSize > 0,
        "Generated MAF file (" + outputFilename + ") is empty.")

    inputReader = GenericTsvReader(inputFilename)
    outputReader = GenericTsvReader(outputFilename)

    self.assertTrue(outputReader.getComments().find('#version') != -1,
                    "First line did not specify a version number")
    for newField in ("i_TJ_Data_Why", "i_TJ_Data_Who"):
        self.assertTrue(newField in outputReader.getFieldNames(),
                        "New field missing (" + newField + ") from header")

    # Count data rows first (output, then input), then add comment lines.
    outputLines = len([row for row in outputReader])
    inputLines = len([row for row in inputReader])
    inputLines += len(inputReader.getCommentsAsList())
    outputLines += len(outputReader.getCommentsAsList())

    self.assertTrue(
        outputLines == (inputLines + 2),
        "Output file should have same number of lines plus two (for maf version and Oncotator version comments) as input file. (In,Out): "
        + str(inputLines) + ", " + str(outputLines))
def testDoubleAnnotationError(self):
    '''
    Annotate a maf file that used to cause a duplicate annotation exception
    and make sure that no exception (of any kind) is thrown.

    The only explicit check is that the rendered file exists.
    '''
    datasourceDir = "testdata/small_genome_position_tsv_ds/"
    positionDS = DatasourceFactory.createDatasource(
        "testdata/small_genome_position_tsv_ds/oreganno_trim.config",
        datasourceDir)

    annotator = Annotator()
    mafInput = MafliteInputMutationCreator(
        'testdata/maflite/testDoubleAnnotate.maf.tsv')
    annotator.setInputCreator(mafInput)
    tsvRenderer = SimpleOutputRenderer(
        'out/genericGenomePositionDoubleAnnotationTest.out.tsv')
    annotator.setOutputRenderer(tsvRenderer)
    annotator.addDatasource(positionDS)

    # annotate() returns the name of the file it wrote.
    renderedFilename = annotator.annotate()
    self.assertTrue(os.path.exists(renderedFilename))
def testAnotherFullSNP(self):
    """
    Render a second SNP call stats file as a TCGA VCF.

    Mainly guards against exceptions; the output file must also exist and be
    non-empty.
    """
    callStatsPath = "testdata/maflite/Another.call_stats.txt"
    vcfPath = "out/Another.call_stats.out.vcf"

    callStatsInput = MafliteInputMutationCreator(callStatsPath)
    vcfRenderer = TcgaVcfOutputRenderer(vcfPath)
    testDatasources = self._createDatasourcesForTesting()

    annotator = Annotator()
    annotator.setInputCreator(callStatsInput)
    annotator.setOutputRenderer(vcfRenderer)
    annotator.setManualAnnotations(self._createManualAnnotations())
    for datasource in testDatasources:
        annotator.addDatasource(datasource)
    annotator.annotate()

    self.assertTrue(os.path.exists(vcfPath))
    self.assertTrue(
        os.stat(vcfPath).st_size > 0,
        "Generated VCF file (" + vcfPath + ") is empty.")
def testEmptyInput(self):
    """
    Make sure that a vcf can be generated from an empty maflite.

    The output file must exist and must not be zero bytes long.
    """
    maflitePath = "testdata/maflite/empty.maflite"
    vcfPath = "out/empty.vcf"

    emptyInput = MafliteInputMutationCreator(maflitePath)
    vcfRenderer = TcgaVcfOutputRenderer(vcfPath)
    testDatasources = self._createDatasourcesForTesting()

    annotator = Annotator()
    annotator.setInputCreator(emptyInput)
    annotator.setOutputRenderer(vcfRenderer)
    annotator.setManualAnnotations(self._createManualAnnotations())
    for datasource in testDatasources:
        annotator.addDatasource(datasource)
    annotator.annotate()

    self.assertTrue(os.path.exists(vcfPath))
    self.assertTrue(
        os.stat(vcfPath).st_size > 0,
        "Generated VCF file (" + vcfPath + ") is empty.")
def testTCGAMAFRendering(self):
    """
    Tests the ability to render a germline VCF file as a TCGA MAF file.

    The rendered file is handed to _validateTcgaMafContents for checking.
    """
    vcfPath = os.path.join("testdata", "vcf", "example.vcf")
    mafPath = os.path.join("out", "example.vcf.maf.annotated")

    vcfInput = VcfInputMutationCreator(vcfPath)
    vcfInput.createMutations()
    mafRenderer = TcgaMafOutputRenderer(mafPath)

    annotator = Annotator()
    annotator.setInputCreator(vcfInput)
    annotator.setOutputRenderer(mafRenderer)
    annotator.setManualAnnotations(self._createTCGAMAFOverridesForVCF())
    for datasource in self._createDatasourceCorpus():
        annotator.addDatasource(datasource)

    renderedFilename = annotator.annotate()
    self._validateTcgaMafContents(renderedFilename)
def testBasicAnnotation(self):
    """
    Test annotation from a generic TSV that is keyed on a transcript
    annotation.

    Only the output header is inspected: both refseq_test columns must be
    present.
    """
    # A gaf (transcript provider) datasource is needed to annotate the gene.
    transcriptProvider = TestUtils.createTranscriptProviderDatasource(config=self.config)
    tsvDatasource = DatasourceFactory.createDatasource(
        "testdata/small_transcript_tsv_ds/small_transcript_tsv_ds.config",
        "testdata/small_transcript_tsv_ds/")
    requestedFilename = "out/genericTranscriptTest.out.tsv"

    annotator = Annotator()
    mafInput = MafliteInputMutationCreator("testdata/maflite/Patient0.snp.maf.txt")
    annotator.setInputCreator(mafInput)
    tsvRenderer = SimpleOutputRenderer(requestedFilename)
    annotator.setOutputRenderer(tsvRenderer)
    annotator.addDatasource(transcriptProvider)
    annotator.addDatasource(tsvDatasource)

    # The header is read from the filename that annotate() reports back.
    renderedFilename = annotator.annotate()
    headers = GenericTsvReader(renderedFilename).getFieldNames()

    for expectedHeader in ["refseq_test_mRNA_Id", "refseq_test_prot_Id"]:
        failureText = expectedHeader + " not found in headers: " + str(headers)
        self.assertTrue(expectedHeader in headers, failureText)
def testBasicAnnotation(self):
    '''
    Annotate from a basic tsv of genomic positions (ORegAnno).

    Covers both single- and multiple-nucleotide variants; the tsv datasource
    is already installed, i.e. its config file exists. Output rows are
    checked in groups of three: the first two must carry the ORegAnno id
    OREG0013034 and the third must be blank.
    '''
    idColumn = "ORegAnno_hg19.oreganno.id"
    requestedFilename = 'out/genericGenomePositionTest.out.tsv'
    positionDS = DatasourceFactory.createDatasource(
        "testdata/small_genome_position_tsv_ds/oreganno_trim.config",
        "testdata/small_genome_position_tsv_ds/")

    annotator = Annotator()
    mafInput = MafliteInputMutationCreator(
        'testdata/maflite/tiny_maflite.maf.txt')
    annotator.setInputCreator(mafInput)
    tsvRenderer = SimpleOutputRenderer(requestedFilename)
    annotator.setOutputRenderer(tsvRenderer)
    annotator.addDatasource(positionDS)
    renderedFilename = annotator.annotate()

    # The rendered file has to exist before its rows can be inspected.
    self.assertTrue(os.path.exists(renderedFilename))

    lineNumber = 0
    for row in GenericTsvReader(renderedFilename):
        lineNumber += 1
        # Two overlap, one does not. Repeat...
        shouldBeBlank = (lineNumber % 3 == 0)
        if shouldBeBlank:
            self.assertTrue(
                row[idColumn] == '',
                "Line " + str(lineNumber) +
                " should have had blank value, but did not: " +
                row[idColumn])
            continue
        self.assertFalse(
            row[idColumn] == '',
            "Line " + str(lineNumber) +
            " should not have had blank value, but did.")
        self.assertTrue(
            row[idColumn] == 'OREG0013034',
            "Line " + str(lineNumber) + " did not have correct value: " +
            row[idColumn])
def testAnnotationWithExampleVcf(self):
    """
    Tests whether parsed annotations match the actual annotations in a
    simple TSV.

    Missing format fields yield -->""   ".,." --> ","

    The rendered TSV is compared with a stored expected TSV: both must have
    the same column names and the same number of rows, and within each
    column every cell must be equal or null in both files.
    """
    inputFilename = os.path.join("testdata", "vcf", "example.vcf")
    outputFilename = os.path.join("out", "example.out.tsv")
    expectedOutputFilename = os.path.join("testdata", "vcf", "example.expected.out.tsv")

    creator = VcfInputMutationCreator(inputFilename)
    creator.createMutations()
    renderer = SimpleOutputRenderer(outputFilename)
    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    annotator.annotate()

    # The header row index handed to pandas is the number of comment lines
    # reported by the TSV reader for the rendered file.
    tsvReader = GenericTsvReader(outputFilename)
    current = pandas.read_csv(outputFilename, sep='\t', header=len(tsvReader.getCommentsAsList()))
    expected = pandas.read_csv(expectedOutputFilename, sep='\t')

    currentColNames = set(current.columns)
    expectedColNames = set(expected.columns)

    # Compare by value: "is 0" tested object identity and only worked
    # through CPython's small-integer caching.
    self.assertTrue(len(currentColNames.symmetric_difference(expectedColNames)) == 0, "Should have the same columns")
    self.assertTrue(len(current.index) == len(expected.index), "Should have the same number of rows")

    for colName in currentColNames:
        sameValue = current[colName] == expected[colName]
        bothNull = pandas.isnull(current[colName]) & pandas.isnull(expected[colName])
        self.assertTrue(
            sum(sameValue | bothNull) == len(current.index),
            "Should have the same values in column " + colName + ": \n" + str(current[colName]) + "\nvs\n" + str(expected[colName]))
def testMissingFilter(self):
    """
    Tests that the missing FILTER fields are parsed correctly.

    A VCF with missing FILTER values is rendered to a simple TSV and compared
    with a stored expected file: same column names, same number of rows, and
    identical values in every column (two missing values count as equal).
    """
    inputFilename = os.path.join("testdata", "vcf", "example.missing_filters.vcf")
    outputFilename = os.path.join("out", "example.missing_filters.out.tsv")
    expectedOutputFilename = os.path.join("testdata", "vcf", "example.expected.missing_filters.out.tsv")

    creator = VcfInputMutationCreator(inputFilename)
    creator.createMutations()
    renderer = SimpleOutputRenderer(outputFilename)
    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    annotator.annotate()

    # The header row index passed to pandas is the number of comment lines
    # that the TSV reader reports for the rendered file.
    tsvReader = GenericTsvReader(outputFilename)
    current = pandas.read_csv(outputFilename, sep='\t', header=len(tsvReader.getCommentsAsList()))
    expected = pandas.read_csv(expectedOutputFilename, sep='\t')

    currentColNames = set(current.columns)
    expectedColNames = set(expected.columns)

    # Compare by value. The previous `len(...) is 0` relied on integer object
    # identity, which is an implementation detail and not an equality test.
    self.assertEqual(currentColNames, expectedColNames, "Should have the same columns")
    self.assertEqual(len(current.index), len(expected.index), "Should have the same number of rows")

    for colName in currentColNames:
        # NaN != NaN, so cells that are missing on both sides are matched explicitly.
        bothMissing = pandas.isnull(current[colName]) & pandas.isnull(expected[colName])
        sameValue = (current[colName] == expected[colName]) | bothMissing
        self.assertTrue(sameValue.all(), "Should have the same values in column " + colName)
def testBasicAnnotation(self):
    '''
    Annotate from a basic tsv gene file.

    The Gaf datasource is added before the tsv datasource -- this is required
    since the gene annotation must be populated before the gene tsv can be
    applied. A trimmed CancerGeneCensus is the basis for this test.

    Checks that the expected CGC columns appear in the output header and that
    every output row whose gene is in the trimmed census has a non-blank
    'CGC_Abridged_GeneID'.
    '''
    # Gene symbols present in the trimmed census file. Generated with:
    # cut -f 1 oncotator/test/testdata/small_tsv_ds/CancerGeneCensus_Table_1_full_2012-03-15_trim.txt | egrep -v Symbol | sed -r "s/^/'/g" | sed ':a;N;$!ba;s/\n/,/g' | sed -r "s/,'/','/g"
    genesAvailable = ['ABL1', 'ABL2', 'ACSL3', 'AF15Q14', 'AF1Q', 'AF3p21', 'AF5q31', 'AKAP9',
                      'AKT1', 'AKT2', 'ALDH2', 'ALK', 'ALO17', 'APC', 'ARHGEF12', 'ARHH',
                      'ARID1A', 'ARID2', 'ARNT', 'ASPSCR1', 'ASXL1', 'ATF1', 'ATIC', 'ATM',
                      'ATRX', 'BAP1', 'BCL10', 'BCL11A', 'BCL11B']

    # We need a gaf data source to annotate gene
    gafDatasource = TestUtils.createTranscriptProviderDatasource(config=self.config)
    geneDS = DatasourceFactory.createDatasource("testdata/small_tsv_ds/small_tsv_ds.config",
                                                "testdata/small_tsv_ds/")
    outputFilename = 'out/genericGeneTest.out.tsv'

    annotator = Annotator()
    annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/Patient0.snp.maf.txt'))
    annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
    annotator.addDatasource(gafDatasource)
    annotator.addDatasource(geneDS)
    annotator.annotate()

    # Check that there were actual annotations performed.
    tsvReader = GenericTsvReader(outputFilename)
    fields = tsvReader.getFieldNames()
    # The failure message names the same column that is being checked.
    self.assertTrue('CGC_Abridged_Other Syndrome/Disease' in fields,
                    "'CGC_Abridged_Other Syndrome/Disease' was not present in the header")
    self.assertTrue('CGC_Abridged_Mutation Type' in fields,
                    "'CGC_Abridged_Mutation Type' was not present in the header")

    ctr = 1
    linesThatShouldBeAnnotated = 0
    for lineDict in tsvReader:
        self.assertTrue('gene' in lineDict.keys())
        if lineDict['gene'] in genesAvailable:
            # `!=` replaces the obsolete `<>` operator, which Python 3 rejects.
            self.assertTrue(lineDict['CGC_Abridged_GeneID'] != '',
                            "'CGC_Abridged_GeneID' was missing on a row that should have been populated. \nLine: " + str(ctr))
            linesThatShouldBeAnnotated = linesThatShouldBeAnnotated + 1
        ctr = ctr + 1

    # Guard against vacuous success: the input must hit at least one census gene.
    self.assertTrue((linesThatShouldBeAnnotated) > 0, "Bad data -- cannot test missed detects.")
def testBasicAnnotation(self):
    """
    Annotate from a basic tsv gene file.

    The Gaf datasource is added before the tsv datasource -- required since
    the gene annotation must be populated first. A trimmed CancerGeneCensus
    is the basis for this test.

    Verifies that the CGC columns are present in the output header and that
    each output row whose gene appears in the trimmed census carries a
    non-blank "CGC_Abridged_GeneID".
    """
    # Gene symbols present in the trimmed census file. Generated with:
    # cut -f 1 oncotator/test/testdata/small_tsv_ds/CancerGeneCensus_Table_1_full_2012-03-15_trim.txt | egrep -v Symbol | sed -r "s/^/'/g" | sed ':a;N;$!ba;s/\n/,/g' | sed -r "s/,'/','/g"
    genesAvailable = [
        "ABL1",
        "ABL2",
        "ACSL3",
        "AF15Q14",
        "AF1Q",
        "AF3p21",
        "AF5q31",
        "AKAP9",
        "AKT1",
        "AKT2",
        "ALDH2",
        "ALK",
        "ALO17",
        "APC",
        "ARHGEF12",
        "ARHH",
        "ARID1A",
        "ARID2",
        "ARNT",
        "ASPSCR1",
        "ASXL1",
        "ATF1",
        "ATIC",
        "ATM",
        "ATRX",
        "BAP1",
        "BCL10",
        "BCL11A",
        "BCL11B",
    ]

    # We need a gaf data source to annotate gene
    gafDatasource = TestUtils.createTranscriptProviderDatasource(config=self.config)
    geneDS = DatasourceFactory.createDatasource(
        "testdata/small_tsv_ds/small_tsv_ds.config", "testdata/small_tsv_ds/"
    )
    outputFilename = "out/genericGeneTest.out.tsv"

    annotator = Annotator()
    annotator.setInputCreator(MafliteInputMutationCreator("testdata/maflite/Patient0.snp.maf.txt"))
    annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
    annotator.addDatasource(gafDatasource)
    annotator.addDatasource(geneDS)
    annotator.annotate()

    # Check that there were actual annotations performed.
    tsvReader = GenericTsvReader(outputFilename)
    fields = tsvReader.getFieldNames()
    # The failure message names the same column that is being checked.
    self.assertIn(
        "CGC_Abridged_Other Syndrome/Disease",
        fields,
        "'CGC_Abridged_Other Syndrome/Disease' was not present in the header",
    )
    self.assertIn(
        "CGC_Abridged_Mutation Type",
        fields,
        "'CGC_Abridged_Mutation Type' was not present in the header",
    )

    linesThatShouldBeAnnotated = 0
    # ctr is the 1-based output row number, used only in failure messages.
    for ctr, lineDict in enumerate(tsvReader, start=1):
        self.assertIn("gene", lineDict)
        if lineDict["gene"] in genesAvailable:
            self.assertTrue(
                lineDict["CGC_Abridged_GeneID"] != "",
                "'CGC_Abridged_GeneID' was missing on a row that should have been populated. \nLine: "
                + str(ctr),
            )
            linesThatShouldBeAnnotated += 1

    # Guard against vacuous success: the input must hit at least one census gene.
    self.assertGreater(linesThatShouldBeAnnotated, 0, "Bad data -- cannot test missed detects.")