def testTCGAMAFAsInputAndQuickAnnotate(self):
    """Annotate a TCGA MAF read through the MAFLITE input creator and render it as a TCGA MAF.

    Verifies that the rendered file is non-empty, that its comments contain
    '#version', that both TJ_Data columns appear in the header, and that the
    output holds exactly two more lines (records plus comments) than the input.
    """
    source_maf = "testdata/maf/Patient0.maf.annotated"
    input_creator = MafliteInputMutationCreator(source_maf, 'configs/maflite_input.config')
    rendered_maf = "out/testTCGAMAFAsInputAndQuickAnnotate.tsv"
    output_renderer = TcgaMafOutputRenderer(rendered_maf, 'configs/tcgaMAF2.4_output.config')

    annotator = Annotator()
    annotator.setInputCreator(input_creator)
    annotator.setOutputRenderer(output_renderer)
    gene_ds = DatasourceFactory.createDatasource(
        "testdata/thaga_janakari_gene_ds/hg19/tj_data.config",
        "testdata/thaga_janakari_gene_ds/hg19/")
    annotator.addDatasource(gene_ds)
    annotator.annotate()

    rendered_size = os.stat(rendered_maf).st_size
    self.assertTrue(rendered_size > 0, "Generated MAF file (" + rendered_maf + ") is empty.")

    reader_in = GenericTsvReader(source_maf)
    reader_out = GenericTsvReader(rendered_maf)

    self.assertTrue(reader_out.getComments().find('#version') != -1,
                    "First line did not specify a version number")
    for new_field in ("i_TJ_Data_Why", "i_TJ_Data_Who"):
        self.assertTrue(new_field in reader_out.getFieldNames(),
                        "New field missing (" + new_field + ") from header")

    # Count the records (output first, then input), then add the comment lines.
    lines_out = sum(1 for _ in reader_out)
    lines_in = sum(1 for _ in reader_in)
    lines_in += len(reader_in.getCommentsAsList())
    lines_out += len(reader_out.getCommentsAsList())

    self.assertTrue(
        lines_out == (lines_in + 2),
        "Output file should have same number of lines plus two (for maf version and Oncotator version comments) as input file. (In,Out): "
        + str(lines_in) + ", " + str(lines_out))
def test_basic_rendering(self):
    """Render a basic seg file as a gene list and sanity-check every output row."""
    seg_path = "testdata/seg/Patient0.seg.txt"
    gene_list_path = "out/test_basic_rendering.gene_list.tsv"
    db_dir = self.config.get('DEFAULT', "dbDir")

    # Start from a clean slate: drop output left over from an earlier run.
    if os.path.exists(gene_list_path):
        os.remove(gene_list_path)

    annotator = Annotator()
    run_spec = RunSpecificationFactory.create_run_spec(
        "SEG_FILE", "GENE_LIST", seg_path, gene_list_path,
        datasourceDir=db_dir,
        annotating_type=RunSpecification.ANNOTATE_SEGMENTS)
    annotator.initialize(run_spec)
    annotator.annotate()

    # Now check the output
    reader = GenericTsvReader(gene_list_path)
    header_fields = reader.getFieldNames()  # fetched, but not inspected by this test
    for row in reader:
        for coord in ('segment_start', 'segment_end'):
            self.assertTrue(row[coord] is not None)
            self.assertTrue(row[coord].strip() != "")
        self.assertTrue("gene" in row)
        self.assertTrue(len(row["gene"]) > 0)
        # Must parse as a float and be non-zero.
        self.assertTrue(float(row["segment_num_probes"]))
        self.assertTrue(row['sample'] == "Patient0")
def test_full_seg_file_annotations(self):
    """Read a seg file, run a full segment annotation, and check the SIMPLE_TSV output."""
    seg_path = "testdata/seg/Patient0.seg.txt"
    tsv_path = "out/test_full_seg_file_annotations.tsv"
    db_dir = self.config.get('DEFAULT', "dbDir")

    # Drop output left over from an earlier run.
    if os.path.exists(tsv_path):
        os.remove(tsv_path)

    annotator = Annotator()
    run_spec = RunSpecificationFactory.create_run_spec(
        "SEG_FILE", "SIMPLE_TSV", seg_path, tsv_path,
        datasource_dir=db_dir,
        annotating_type=RunSpecification.ANNOTATE_SEGMENTS)
    annotator.initialize(run_spec)
    annotator.annotate()

    # Now check the output
    reader = GenericTsvReader(tsv_path)
    header_fields = reader.getFieldNames()
    for required in ["Sample", "Num_Probes", "Segment_Mean"]:
        self.assertTrue(required in header_fields)

    for row in reader:
        for coord in ('start', 'end'):
            self.assertTrue(row[coord] is not None)
            self.assertTrue(row[coord].strip() != "")
        self.assertTrue("genes" in row)
        # NOTE(review): str.split always returns at least one element, so this
        # check cannot fail even when the genes field is empty.
        self.assertTrue(len(row["genes"].split(",")) > 0)
def testFullIndelVcf(self):
    """Run an indel maflite all the way through TCGA VCF creation.

    After annotating, checks that the VCF exists and that each deletion in
    the maflite appears in the VCF at (maflite start - 1), while every other
    variant keeps its maflite start position.
    """
    outputFilename = "out/TCGAVCFTest.indel.vcf"
    callStatsIn = MafliteInputMutationCreator("testdata/maflite/Patient0.indel.maf.txt")
    vcfOR = TcgaVcfOutputRenderer(outputFilename)
    datasources = self._createDatasourcesForTesting()

    annotator = Annotator()
    annotator.setInputCreator(callStatsIn)
    annotator.setOutputRenderer(vcfOR)
    annotator.setManualAnnotations(self._createManualAnnotations())
    for ds in datasources:
        annotator.addDatasource(ds)
    annotator.annotate()

    self.assertTrue(os.path.exists(outputFilename))

    # Check that the deletions have position decremented by one from what is present in the maflite.
    # E.g. 1 36643701 in the maflite (a deletion) becomes 1 36643700 in the vcf, but the others are the same.
    maflite_ic = MafliteInputMutationCreator("testdata/maflite/Patient0.indel.maf.txt")
    muts = maflite_ic.createMutations()

    # Read every position up front so the file handle is closed deterministically.
    with open(outputFilename, 'r') as vcf_fp:
        vcf_pos = set(int(rec.POS) for rec in vcf.Reader(vcf_fp))

    for m in muts:
        # A blank, "-" or "." alt allele marks a deletion.
        is_variant_deletion = m.alt_allele in ("", "-", ".")
        # str() keeps the failure message buildable even if start is not a string.
        location = m.chr + ":" + str(m.start)
        if is_variant_deletion:
            self.assertTrue((int(m.start) - 1) in vcf_pos, "Deletion was not correct for " + location)
        else:
            self.assertTrue(int(m.start) in vcf_pos, "Insertion was not correct for " + location)
def testSNPsAndIndelStartAndEndPos(self):
    """Check the end positions rendered for selected SNP and indel rows.

    Positions are expected to follow the NCI's MAF specification
    (https://wiki.nci.nih.gov/display/TCGA/Mutation+Annotation+Format+(MAF)+Specification).
    """
    vcf_path = os.path.join("testdata", "vcf", "example.snps.indels.vcf")
    tsv_path = os.path.join("out", "example.snps.indels.out.tsv")

    creator = VcfInputMutationCreator(vcf_path)
    creator.createMutations()
    renderer = SimpleOutputRenderer(tsv_path)

    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    annotator.annotate()

    # start position -> expected end position; rows with other starts are not checked.
    expected_end_by_start = {
        "16890445": "16890445",
        "154524458": "154524459",
        "114189432": "114189433",
    }
    for row in GenericTsvReader(tsv_path):
        if row['start'] not in expected_end_by_start:
            continue
        wanted = expected_end_by_start[row['start']]
        self.assertEqual(row["end"], wanted,
                         "The value should be %s but it was %s." % (wanted, row["end"]))
def test_overwriting_muts(self):
    """Annotate from a datasource that overwrites an annotation already present on the input.

    With annotation overwriting enabled, no output row may still carry
    "Tromokratis" in TJ_Data_Who.
    """
    # The input already has a "Who" annotation that this datasource will try to write.
    gene_ds = DatasourceFactory.createDatasource(
        "testdata/thaga_janakari_gene_ds/hg19/tj_data.config",
        "testdata/thaga_janakari_gene_ds/hg19/")
    maflite_path = "testdata/maflite/who_alt1_vs_alt2.maflite"
    annotated_path = "out/who_alt1_vs_alt2.maf.annotated"

    overwrite_opts = {
        OptionConstants.ALLOW_ANNOTATION_OVERWRITING: True,
        OptionConstants.NO_PREPEND: True,
    }
    run_spec = RunSpecificationFactory.create_run_spec_given_datasources(
        "MAFLITE", "TCGAMAF", maflite_path, annotated_path,
        datasource_list=[gene_ds], other_opts=overwrite_opts)

    annotator = Annotator()
    annotator.initialize(run_spec)
    annotator.annotate()

    for row in GenericTsvReader(annotated_path):
        self.assertTrue(row.get("TJ_Data_Who", "") != "Tromokratis")
def test_overwriting_muts(self):
    """Check that a datasource may overwrite an existing annotation when configured to do so.

    After annotation, TJ_Data_Who must not equal "Tromokratis" on any row.
    """
    # The input carries a "Who" annotation that the datasource below will try to write.
    config_path = "testdata/thaga_janakari_gene_ds/hg19/tj_data.config"
    ds_dir = "testdata/thaga_janakari_gene_ds/hg19/"
    gene_ds = DatasourceFactory.createDatasource(config_path, ds_dir)

    src = "testdata/maflite/who_alt1_vs_alt2.maflite"
    dest = "out/who_alt1_vs_alt2.maf.annotated"

    opts = dict()
    opts[OptionConstants.ALLOW_ANNOTATION_OVERWRITING] = True
    opts[OptionConstants.NO_PREPEND] = True

    run_spec = RunSpecificationFactory.create_run_spec_given_datasources(
        "MAFLITE", "TCGAMAF", src, dest,
        datasource_list=[gene_ds],
        other_opts=opts)

    annotator = Annotator()
    annotator.initialize(run_spec)
    annotator.annotate()

    reader = GenericTsvReader(dest)
    for record in reader:
        who = record.get('TJ_Data_Who', "")
        self.assertTrue(who != "Tromokratis")
def test_full_seg_file_annotations(self):
    """Annotate a seg file in segment mode and validate the SIMPLE_TSV rendering."""
    seg_in = "testdata/seg/Patient0.seg.txt"
    tsv_out = "out/test_full_seg_file_annotations.tsv"
    db_dir = self.config.get('DEFAULT', "dbDir")

    # Remove any output left behind by a previous run.
    if os.path.exists(tsv_out):
        os.remove(tsv_out)

    annotator = Annotator()
    annotator.initialize(RunSpecificationFactory.create_run_spec(
        "SEG_FILE", "SIMPLE_TSV", seg_in, tsv_out,
        datasourceDir=db_dir,
        annotating_type=RunSpecification.ANNOTATE_SEGMENTS))
    annotator.annotate()

    # Now check the output
    reader = GenericTsvReader(tsv_out)
    required_cols = ["Sample", "Num_Probes", "Segment_Mean"]
    header = reader.getFieldNames()
    for col in required_cols:
        self.assertTrue(col in header)

    for record in reader:
        start, end = record['start'], record['end']
        self.assertTrue(start is not None)
        self.assertTrue(start.strip() != "")
        self.assertTrue(end is not None)
        self.assertTrue(end.strip() != "")
        self.assertTrue("genes" in record)
        # NOTE(review): split() never returns an empty list, so this always passes.
        self.assertTrue(len(record["genes"].split(",")) > 0)
def testAnnotationWithMafliteWithTrailingSpaces(self):
    """Annotate a VCF whose ref and alt alleles contain trailing spaces.

    The rendered VCF must contain the four expected records.

    NOTE(review): despite "Maflite" in the method name, this test runs VCF -> VCF.
    """
    from oncotator.utils.RunSpecification import RunSpecification

    db_dir = self.config.get('DEFAULT', "dbDir")
    inputFilename = os.path.join("testdata", "vcf", "example.trailing_whitespace_in_alleles.vcf")
    outputFilename = os.path.join("out", "example.trailing_whitespace_in_alleles.vcf")

    annotator = Annotator()
    run_spec = RunSpecificationFactory.create_run_spec(
        "VCF", "VCF", inputFilename, outputFilename,
        datasource_dir=db_dir,
        annotating_type=RunSpecification.ANNOTATE_MUTATIONS,
        other_opts={'vcf_out_infer_genotypes': False})
    annotator.initialize(run_spec)
    annotator.annotate()

    # Check output. The context manager closes the handle even if read() fails.
    with open(outputFilename) as vcf_fp:
        vcf_data = vcf_fp.read()

    expected_records = (
        '\n1\t14907\t.\tA\tG\t',
        '\n1\t14930\trs150145850\tA\tG\t',
        '\n1\t14933\trs138566748\tG\tA\t',
        '\n1\t14948\trs148911281\tG\tA\t',
    )
    for record_prefix in expected_records:
        self.assertIn(record_prefix, vcf_data)
def test_rendering_with_exons(self):
    """Render a seg file whose segment end points fall inside exons as a gene list.

    For rows whose segment_end_gene is MAPK1, the end exon must be "8+".
    """
    seg_path = "testdata/seg/Middle_of_exon.seg.txt"
    gene_list_path = "out/test_exon_seg2.gene_list.tsv"
    db_dir = self.config.get('DEFAULT', "dbDir")

    # Remove output left over from an earlier run.
    if os.path.exists(gene_list_path):
        os.remove(gene_list_path)

    annotator = Annotator()
    run_spec = RunSpecificationFactory.create_run_spec(
        "SEG_FILE", "GENE_LIST", seg_path, gene_list_path,
        datasourceDir=db_dir,
        annotating_type=RunSpecification.ANNOTATE_SEGMENTS)
    annotator.initialize(run_spec)
    annotator.annotate()

    # Now check the output
    reader = GenericTsvReader(gene_list_path)
    header_fields = reader.getFieldNames()  # fetched, but not inspected by this test
    for row in reader:
        self.assertTrue(row['segment_start'] is not None)
        self.assertTrue(row['segment_start'].strip() != "")
        if row['segment_end_gene'] != "MAPK1":
            continue
        end_exon = row['segment_end_exon'].strip()
        self.assertTrue(end_exon == "8+", "Should have been 8+, but saw: %s" % end_exon)
def testDuplicateAnnotation(self):
    """Check that duplicate annotations in a VCF are rendered as separate columns.

    Both variant_status and sample_variant_status must be in the header and in
    the first row, with values "2" and "0" respectively.
    """
    vcf_path = os.path.join("testdata", "vcf", "example.duplicate_annotation.vcf")
    tsv_path = os.path.join("out", "example.duplicate_annotation.out.tsv")

    creator = VcfInputMutationCreator(vcf_path)
    creator.createMutations()
    renderer = SimpleOutputRenderer(tsv_path)

    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    annotator.annotate()

    reader = GenericTsvReader(tsv_path)

    header = reader.getFieldNames()
    self.assertTrue("variant_status" in header, "variant_status field is missing in the header.")
    self.assertTrue("sample_variant_status" in header, "sample_variant_status is missing in the header.")

    # Only the first data row is inspected.
    first_row = reader.next()
    self.assertTrue("variant_status" in first_row, "variant_status field is missing in the row.")
    self.assertTrue("sample_variant_status" in first_row, "sample_variant_status is missing in the row.")

    self.assertEqual("2", first_row["variant_status"], "Incorrect value of variant_status.")
    self.assertEqual("0", first_row["sample_variant_status"], "Incorrect value of sample_variant_status")
def test_basic_rendering(self):
    """Annotate a basic seg file in segment mode and validate the GENE_LIST rendering."""
    seg_in = "testdata/seg/Patient0.seg.txt"
    genes_out = "out/test_basic_rendering.gene_list.tsv"
    db_dir = self.config.get('DEFAULT', "dbDir")

    # Remove any output left behind by a previous run.
    if os.path.exists(genes_out):
        os.remove(genes_out)

    annotator = Annotator()
    annotator.initialize(RunSpecificationFactory.create_run_spec(
        "SEG_FILE", "GENE_LIST", seg_in, genes_out,
        datasourceDir=db_dir,
        annotating_type=RunSpecification.ANNOTATE_SEGMENTS))
    annotator.annotate()

    # Now check the output
    reader = GenericTsvReader(genes_out)
    header = reader.getFieldNames()  # fetched, but not inspected by this test
    for record in reader:
        seg_start, seg_end = record['segment_start'], record['segment_end']
        self.assertTrue(seg_start is not None)
        self.assertTrue(seg_start.strip() != "")
        self.assertTrue(seg_end is not None)
        self.assertTrue(seg_end.strip() != "")
        self.assertTrue("gene" in record)
        self.assertTrue(len(record["gene"]) > 0)
        # Must parse as a float and be non-zero.
        self.assertTrue(float(record["segment_num_probes"]))
        self.assertTrue(record['sample'] == "Patient0")
def testFullSnpVcf(self):
    """Run SNP call stats (maflite) all the way through TCGA VCF creation.

    Checks that the VCF file was created and that the records read from it
    have a QUAL value (i.e. not ".").
    """
    outputFilename = "out/TCGAVCFTest.snp.vcf"
    callStatsIn = MafliteInputMutationCreator("testdata/Test.call_stats.trim.txt")
    vcfOR = TcgaVcfOutputRenderer(outputFilename)
    datasources = self._createDatasourcesForTesting()

    annotator = Annotator()
    annotator.setInputCreator(callStatsIn)
    annotator.setOutputRenderer(vcfOR)
    annotator.setManualAnnotations(self._createManualAnnotations())
    for ds in datasources:
        annotator.addDatasource(ds)
    annotator.annotate()

    self.assertTrue(os.path.exists(outputFilename))

    # NOTE(review): the mutations that drive this loop come from the *indel*
    # maflite, not from the call stats file annotated above -- confirm intent.
    maflite_ic = MafliteInputMutationCreator("testdata/maflite/Patient0.indel.maf.txt")
    muts = maflite_ic.createMutations()

    # The context manager closes the VCF handle even when an assertion fails.
    with open(outputFilename, 'r') as vcf_fp:
        vcf_reader = vcf.Reader(vcf_fp)
        # One VCF record is consumed per maflite mutation.
        for _ in muts:
            # Builtin next() replaces the Python-2-only .next() method call.
            rec = next(vcf_reader)
            # All records should have QUAL with a value (i.e. NOT ".")
            self.assertIsNotNone(rec.QUAL)
def _annotate_m2_vcf(self, input_vcf_file, output_tcgamaf_file):
    """Convert input_vcf_file to a TCGA MAF at output_tcgamaf_file.

    Runs the annotator with fixed Patient0 barcodes, ONP inference enabled,
    and "." as the datasource directory.
    """
    # For this conversion, you must specify the barcodes manually
    barcode_overrides = {
        'tumor_barcode': 'Patient0-Tumor',
        'normal_barcode': 'Patient0-Normal',
    }

    render_opts = dict()
    render_opts[OptionConstants.COLLAPSE_FILTER_COLS] = True
    render_opts[OptionConstants.NO_PREPEND] = True
    render_opts[OptionConstants.SPLIT_ALLELIC_DEPTH] = False
    render_opts[OptionConstants.INFER_ONPS] = True

    # Use an empty datasource dir in order to speed this up.
    annotator = Annotator()
    annotator.initialize(RunSpecificationFactory.create_run_spec(
        "VCF", "TCGAMAF", input_vcf_file, output_tcgamaf_file,
        datasource_dir=".",
        global_annotations=barcode_overrides,
        is_skip_no_alts=True,
        other_opts=render_opts))
    annotator.annotate()
def test_rendering_combined_to_tsv(self):
    """Produce a merged-ONP simple TSV from a maflite; passes when nothing raises."""
    maflite_path = os.path.join("testdata", "maflite", "onp_combination.maf.txt")
    tsv_path = os.path.join("out", "onp_combination.tsv")

    onp_opts = {OptionConstants.INFER_ONPS: True}
    spec = RunSpecificationFactory.create_run_spec(
        "MAFLITE", "SIMPLE_TSV", maflite_path, tsv_path, other_opts=onp_opts)

    annotator = Annotator()
    annotator.initialize(spec)
    annotator.annotate()
def testBasicAnnotation(self):
    """Annotate from a basic tsv gene file.

    Uses the Gaf to annotate before trying the tsv -- required since the gene
    annotation must be populated. A trimmed CancerGeneCensus is the basis for
    this test. Checks that the CGC columns are in the header and that rows for
    known genes have a CGC_Abridged_GeneID.
    """
    # cut -f 1 oncotator/test/testdata/small_tsv_ds/CancerGeneCensus_Table_1_full_2012-03-15_trim.txt | egrep -v Symbol | sed -r "s/^/'/g" | sed ':a;N;$!ba;s/\n/,/g' | sed -r "s/,'/','/g"
    # Kept as a set because it is only used for per-row membership tests.
    genesAvailable = {
        'ABL1', 'ABL2', 'ACSL3', 'AF15Q14', 'AF1Q', 'AF3p21', 'AF5q31',
        'AKAP9', 'AKT1', 'AKT2', 'ALDH2', 'ALK', 'ALO17', 'APC', 'ARHGEF12',
        'ARHH', 'ARID1A', 'ARID2', 'ARNT', 'ASPSCR1', 'ASXL1', 'ATF1', 'ATIC',
        'ATM', 'ATRX', 'BAP1', 'BCL10', 'BCL11A', 'BCL11B',
    }

    # We need a gaf data source to annotate gene
    gafDatasource = TestUtils.createTranscriptProviderDatasource(config=self.config)
    geneDS = DatasourceFactory.createDatasource(
        "testdata/small_tsv_ds/small_tsv_ds.config", "testdata/small_tsv_ds/")
    outputFilename = 'out/genericGeneTest.out.tsv'

    annotator = Annotator()
    annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/Patient0.snp.maf.txt'))
    annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
    annotator.addDatasource(gafDatasource)
    annotator.addDatasource(geneDS)
    annotator.annotate()

    # Check that there were actual annotations performed.
    tsvReader = GenericTsvReader(outputFilename)
    fields = tsvReader.getFieldNames()
    # The failure message now names the column that is actually checked.
    self.assertTrue(
        'CGC_Abridged_Other Syndrome/Disease' in fields,
        "'CGC_Abridged_Other Syndrome/Disease' was not present in the header")
    self.assertTrue(
        'CGC_Abridged_Mutation Type' in fields,
        "'CGC_Abridged_Mutation Type' was not present in the header")

    linesThatShouldBeAnnotated = 0
    for ctr, lineDict in enumerate(tsvReader, 1):
        self.assertTrue('gene' in lineDict)
        if lineDict['gene'] in genesAvailable:
            # The literal used to be split across a physical line; the break
            # is now spelled as an escape so the string is valid Python.
            self.assertTrue(
                lineDict['CGC_Abridged_GeneID'] != '',
                "'CGC_Abridged_GeneID' was missing on a row that should have been populated. \nLine: "
                + str(ctr))
            linesThatShouldBeAnnotated += 1

    # Guard against test data in which no row exercises the check above.
    self.assertTrue(linesThatShouldBeAnnotated > 0, "Bad data -- cannot test missed detects.")
def test_single_sample_onp_combiner(self):
    """Create an ONP-combined TCGA MAF from a single-sample maflite; passes when nothing raises."""
    maflite_path = 'testdata/maflite/onp.singlesample.maf.txt'
    maf_path = 'out/testSingleSampleOnpCombiner.maf'

    unit_test_config = TestUtils.createUnitTestConfig()
    db_dir = unit_test_config.get('DEFAULT', "dbDir")

    onp_opts = {OptionConstants.INFER_ONPS: True}
    spec = RunSpecificationFactory.create_run_spec(
        "MAFLITE", "TCGAMAF", maflite_path, maf_path,
        datasourceDir=db_dir,
        other_opts=onp_opts)

    annotator = Annotator()
    annotator.initialize(spec)
    annotator.annotate()
def testNumberGRenderingOfRandomVcf(self):
    """Render number_g.random.vcf to a simple TSV; passes when nothing raises."""
    vcf_path = os.path.join("testdata", "vcf", "number_g.random.vcf")
    tsv_path = os.path.join("out", "number_g.random.out.tsv")

    input_creator = VcfInputMutationCreator(vcf_path)
    input_creator.createMutations()
    output_renderer = SimpleOutputRenderer(tsv_path)

    annotator = Annotator()
    annotator.setInputCreator(input_creator)
    annotator.setOutputRenderer(output_renderer)
    annotator.annotate()
def test_single_sample_onp_combiner(self):
    """Build an ONP-combined TCGA MAF for one sample; the test passes when no exception is raised."""
    src = 'testdata/maflite/onp.singlesample.maf.txt'
    dest = 'out/testSingleSampleOnpCombiner.maf'

    config = TestUtils.createUnitTestConfig()
    defaultdb = config.get('DEFAULT', "dbDir")

    spec = RunSpecificationFactory.create_run_spec(
        "MAFLITE", "TCGAMAF", src, dest,
        datasource_dir=defaultdb,
        other_opts={OptionConstants.INFER_ONPS: True})

    onp_annotator = Annotator()
    onp_annotator.initialize(spec)
    onp_annotator.annotate()
def _annotate_m2_vcf(self, input_vcf_file, output_tcgamaf_file):
    """Annotate input_vcf_file and render the result as a TCGA MAF at output_tcgamaf_file.

    The tumor and normal barcodes are supplied as global annotations, and "."
    is used as the datasource directory.
    """
    # For this conversion, you must specify the barcodes manually
    global_annotations = dict(tumor_barcode='Patient0-Tumor',
                              normal_barcode='Patient0-Normal')

    options = {
        OptionConstants.COLLAPSE_FILTER_COLS: True,
        OptionConstants.NO_PREPEND: True,
        OptionConstants.SPLIT_ALLELIC_DEPTH: False,
        OptionConstants.INFER_ONPS: True,
    }

    # Use an empty datasource dir in order to speed this up.
    annotator = Annotator()
    run_spec = RunSpecificationFactory.create_run_spec(
        "VCF", "TCGAMAF",
        input_vcf_file, output_tcgamaf_file,
        datasource_dir=".",
        global_annotations=global_annotations,
        is_skip_no_alts=True,
        other_opts=options)
    annotator.initialize(run_spec)
    annotator.annotate()
def testAnnotationWithNoSampleNameExampleVcf(self):
    """Annotate a VCF that has no samples and render it as a simple TSV.

    No assertions are made on the output; the test passes when nothing raises.
    """
    vcf_path = os.path.join("testdata", "vcf", "example.sampleName.removed.vcf")
    tsv_path = os.path.join("out", "example.sampleName.removed.out.tsv")

    input_creator = VcfInputMutationCreator(vcf_path)
    output_renderer = SimpleOutputRenderer(tsv_path)

    annotator = Annotator()
    annotator.setInputCreator(input_creator)
    annotator.setOutputRenderer(output_renderer)
    annotator.annotate()
def testAnnotationWithMafliteWithTrailingSpaces(self):
    """Annotate a maflite whose ref and alt alleles contain trailing spaces.

    The test passes when the MAFLITE -> TCGAMAF run completes without raising.
    """
    db_dir = self.config.get('DEFAULT', "dbDir")
    maflite_path = os.path.join("testdata", "maflite", "example.trailing_whitespace_in_alleles.maflite")
    maf_path = os.path.join("out", "example.trailing_whitespace_in_alleles.maf.txt")

    annotator = Annotator()
    annotator.initialize(RunSpecificationFactory.create_run_spec(
        "MAFLITE", "TCGAMAF", maflite_path, maf_path,
        datasource_dir=db_dir,
        annotating_type=RunSpecification.ANNOTATE_MUTATIONS))
    annotator.annotate()
def testSimpleAnnotationWithAComplexVcf(self):
    """Parse a rather complex VCF and render it as a simple TSV without errors."""
    vcf_path = os.path.join("testdata", "vcf", "random.vcf")
    tsv_path = os.path.join("out", "random.tsv")

    input_creator = VcfInputMutationCreator(vcf_path)
    input_creator.createMutations()
    output_renderer = SimpleOutputRenderer(tsv_path, [])

    annotator = Annotator()
    annotator.setInputCreator(input_creator)
    annotator.setOutputRenderer(output_renderer)
    annotator.annotate()
def testAnnotationWithDuplicateValuesInVcf(self):
    """Parse a VCF in which fields of different kinds (e.g. INFO and FILTER) share a name.

    The test passes when rendering to a simple TSV completes without raising.
    """
    vcf_path = os.path.join("testdata", "vcf", "example.duplicate_fields.vcf")
    tsv_path = os.path.join("out", "example.duplicate_fields2.tsv")

    input_creator = VcfInputMutationCreator(vcf_path)
    input_creator.createMutations()
    output_renderer = SimpleOutputRenderer(tsv_path, [])

    annotator = Annotator()
    annotator.setInputCreator(input_creator)
    annotator.setOutputRenderer(output_renderer)
    annotator.annotate()
def testMafInput(self):
    """Render a TCGA VCF from a TCGA MAF using no datasources; the VCF must exist and be non-empty."""
    maf_path = "testdata/maf/Patient1.snp.maf.annotated"
    vcf_path = "out/maf2tcgavcf.vcf"

    maf_creator = MafliteInputMutationCreator(maf_path)
    vcf_renderer = TcgaVcfOutputRenderer(vcf_path)

    annotator = Annotator()
    annotator.setInputCreator(maf_creator)
    annotator.setOutputRenderer(vcf_renderer)
    annotator.setManualAnnotations(self._createManualAnnotations())
    annotator.annotate()

    self.assertTrue(os.path.exists(vcf_path))
    vcf_size = os.stat(vcf_path).st_size
    self.assertTrue(vcf_size > 0, "Generated VCF file (" + vcf_path + ") is empty.")
def testSimpleAnnotationWithExampleVcf(self):
    """Annotate example.vcf with the transcript provider datasource (Gaf 3.0) and render a simple TSV."""
    vcf_path = os.path.join("testdata", "vcf", "example.vcf")
    tsv_path = os.path.join("out", "simpleVCF.Gaf.annotated.out.tsv")

    input_creator = VcfInputMutationCreator(vcf_path)
    input_creator.createMutations()
    output_renderer = SimpleOutputRenderer(tsv_path, [])

    annotator = Annotator()
    annotator.setInputCreator(input_creator)
    annotator.setOutputRenderer(output_renderer)
    transcript_ds = TestUtils.createTranscriptProviderDatasource(self.config)
    annotator.addDatasource(transcript_ds)
    annotator.annotate()
def testBlankAnnotatorInit(self):
    """Test an extremely simple scenario, where no additional annotations are needed (no datasources).

    The rendered TSV must contain one line per variant plus three extra
    lines (one header line and two comment lines).
    """
    self.logger.info("Starting Blank Annotator Init Test...")

    inputCreator = MafliteInputMutationCreator('testdata/maflite/tiny_maflite.maf.txt')
    outputRenderer = SimpleOutputRenderer("out/testBlankAnnotatorTestFile.tsv")

    # 1) Initialize the Annotator with the input creator and output renderer.
    annotator = Annotator()
    annotator.setInputCreator(inputCreator)
    annotator.setOutputRenderer(outputRenderer)
    testOutputFilename = annotator.annotate()

    # Test that the file exists and has the correct number of lines.
    numSamples = 1
    numExtraLines = 3  # one for header, two for comment lines
    numDoubleLines = 0  # Number of lines with two alt alleles
    numVariants = 9
    gt = numSamples * numVariants + numDoubleLines * numSamples + numExtraLines

    # open() in a with-block replaces the Python-2-only file() builtin and
    # closes the handle even if iteration fails.
    with open(testOutputFilename, 'r') as fp:
        ctr = sum(1 for _ in fp)

    self.assertEqual(
        ctr, gt,
        "Number of lines read was not correct: " + str(ctr) + " -- should have been: " + str(gt))
def testAnnotationRoundTripEmpty(self):
    """Read a VCF, annotate it with no datasources, write it, and read it back.

    The re-read of the annotated file must yield at least one mutation.
    """
    vcf_in = os.path.join("testdata", "m2_support", "NA12878.ob_filtered.vcf")
    vcf_out = os.path.join("out", "test_round_trip_empty_annotated.vcf")

    # The same options dict is used for writing and for reading back.
    other_opts = {OptionConstants.COLLAPSE_NUMBER_ANNOTATIONS: True}

    run_spec = RunSpecificationFactory.create_run_spec(
        "VCF", "VCF", vcf_in, vcf_out,
        datasource_dir="THIS_DIR_DOES_NOT_EXIST__",
        genomeBuild="hg19",
        other_opts=other_opts)

    annotator = Annotator()
    annotator.initialize(run_spec)
    annotated_filename = annotator.annotate()

    reread_creator = VcfInputMutationCreator(
        annotated_filename,
        MutationDataFactory(allow_overwriting=True),
        other_options=other_opts)
    reread_muts = list(reread_creator.createMutations())
    self.assertTrue(len(reread_muts) > 0)
def testBasicAnnotation(self):
    """Annotate from a basic tsv of genomic positions.

    This tests both single- and multiple-nucleotide variants. The tsv is
    already installed (i.e. proper config file created). Every third output
    line must have a blank ORegAnno id; all others must be 'OREG0013034'.
    """
    out_path = 'out/genericGenomePositionTest.out.tsv'
    position_ds = DatasourceFactory.createDatasource(
        "testdata/small_genome_position_tsv_ds/oreganno_trim.config",
        "testdata/small_genome_position_tsv_ds/")

    annotator = Annotator()
    annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/tiny_maflite.maf.txt'))
    annotator.setOutputRenderer(SimpleOutputRenderer(out_path))
    annotator.addDatasource(position_ds)
    rendered_path = annotator.annotate()

    # Make sure that some values were populated
    self.assertTrue(os.path.exists(rendered_path))
    reader = GenericTsvReader(rendered_path)

    id_column = "ORegAnno_hg19.oreganno.id"
    # Two overlap, one does not. Repeat...
    for line_no, row in enumerate(reader, 1):
        if line_no % 3 != 0:
            self.assertFalse(
                row[id_column] == '',
                "Line " + str(line_no) + " should not have had blank value, but did.")
            self.assertTrue(
                row[id_column] == 'OREG0013034',
                "Line " + str(line_no) + " did not have correct value: " + row[id_column])
        else:
            self.assertTrue(
                row[id_column] == '',
                "Line " + str(line_no) + " should have had blank value, but did not: " + row[id_column])
def testCreationAndAnnotation(self):
    """Create the gene-protein-position datasource and run a simple annotation with it.

    The single output row must carry both expected natural variations (in any order).
    """
    out_path = 'out/genericGeneProteinPositionTest.out.tsv'

    transcript_ds = TestUtils.createTranscriptProviderDatasource(self.config)
    natvar_ds = DatasourceFactory.createDatasource(
        "testdata/simple_uniprot_natvar/simple_uniprot_natvar.config",
        "testdata/simple_uniprot_natvar/")

    annotator = Annotator()
    annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/tiny_maflite_natvar.maf.tsv'))
    annotator.setOutputRenderer(SimpleOutputRenderer(out_path))
    annotator.addDatasource(transcript_ds)
    annotator.addDatasource(natvar_ds)
    rendered_path = annotator.annotate()

    # Make sure that some values were populated
    self.assertTrue(os.path.exists(rendered_path))
    reader = GenericTsvReader(rendered_path)

    column = "UniProt_NatVar_natural_variations"
    # Compared after sorting, so the order of the "|"-separated values is irrelevant.
    wanted = sorted("R -> RR (in EDMD2).|R -> Q (in EDMD2).".split("|"))

    num_rows = 0
    for row in reader:
        found = sorted(row[column].split("|"))
        self.assertTrue(found == wanted, "Annotation value did not match: " + row[column])
        num_rows += 1

    self.assertTrue(num_rows == 1, "Number of mutations incorrect (1): " + str(num_rows))
def testBasicAnnotation(self):
    """Annotate from a generic TSV keyed on a transcript annotation.

    Only confirms that the expected columns are in the output header.
    """
    # We need a gaf data source to annotate gene
    transcript_provider = TestUtils.createTranscriptProviderDatasource(config=self.config)
    transcript_tsv_ds = DatasourceFactory.createDatasource(
        "testdata/small_transcript_tsv_ds/small_transcript_tsv_ds.config",
        "testdata/small_transcript_tsv_ds/")

    annotator = Annotator()
    annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/Patient0.snp.maf.txt'))
    annotator.setOutputRenderer(SimpleOutputRenderer('out/genericTranscriptTest.out.tsv'))
    annotator.addDatasource(transcript_provider)
    annotator.addDatasource(transcript_tsv_ds)

    # Read back whichever file annotate() reports.
    rendered_path = annotator.annotate()

    header = GenericTsvReader(rendered_path).getFieldNames()
    for column in ("refseq_test_mRNA_Id", "refseq_test_prot_Id"):
        self.assertTrue(column in header, column + " not found in headers: " + str(header))
def testBlankAnnotatorInit(self):
    """Run the annotator with no datasources and check the line count of the rendered TSV.

    Expects one line per variant plus three extra lines: one header line and
    two comment lines.
    """
    self.logger.info("Starting Blank Annotator Init Test...")

    inputCreator = MafliteInputMutationCreator('testdata/maflite/tiny_maflite.maf.txt')
    outputRenderer = SimpleOutputRenderer("out/testBlankAnnotatorTestFile.tsv")

    # 1) Initialize the Annotator with the input creator and output renderer.
    annotator = Annotator()
    annotator.setInputCreator(inputCreator)
    annotator.setOutputRenderer(outputRenderer)
    testOutputFilename = annotator.annotate()

    # Expected number of lines in the rendered file.
    numSamples = 1
    numExtraLines = 3  # one for header, two for comment lines
    numDoubleLines = 0  # Number of lines with two alt alleles
    numVariants = 9
    gt = numSamples * numVariants + numDoubleLines * numSamples + numExtraLines

    # The Python-2-only file() builtin is replaced by open() in a context
    # manager, which also guarantees the handle is closed.
    ctr = 0
    with open(testOutputFilename, 'r') as fp:
        for _ in fp:
            ctr += 1

    self.assertEqual(
        ctr, gt,
        "Number of lines read was not correct: " + str(ctr) + " -- should have been: " + str(gt))
def testManualAnnotations(self):
    """Check that manual annotations set on the Annotator reach every output row.

    Each override key must be populated on every row with exactly the override value.
    """
    annotator = Annotator()
    overrides = {
        'source': 'Capture',
        'status': 'Somatic',
        'phase': 'Phase_I',
        'sequencer': 'Illumina GAIIx',
    }
    annotator.setManualAnnotations(overrides)

    input_creator = MafliteInputMutationCreator('testdata/maflite/Patient0.snp.maf.txt')
    output_renderer = SimpleOutputRenderer("out/testManualAnnotationsFile.tsv")
    annotator.setInputCreator(input_creator)
    annotator.setOutputRenderer(output_renderer)
    rendered_path = annotator.annotate()

    rendered_size = os.stat(rendered_path).st_size
    self.assertTrue(rendered_size > 0, "Generated TSV file (" + rendered_path + ") is empty.")

    for line_no, row in enumerate(GenericTsvReader(rendered_path), 1):
        where = str(line_no)
        for key in overrides:
            self.assertTrue(
                row[key] != "__UNKNOWN__",
                "__UNKNOWN__ value seen on line " + where + ", when it should be populated: " + key)
            self.assertTrue(
                row[key] != "",
                "Blank value seen on line " + where + ", when it should be populated: " + key)
            self.assertTrue(
                row[key] == overrides[key],
                "Value for " + key + " on line " + where + " did not match override: "
                + str(row[key]) + " <> " + str(overrides[key]))
def _annotateTest(self, inputFilename, outputFilename, datasource_dir,
                  inputFormat="MAFLITE", outputFormat="TCGAMAF",
                  default_annotations=TCGA_MAF_DEFAULTS, override_annotations=None,
                  is_skip_no_alts=False, other_opts=None):
    """Build a run specification from the arguments and run an Annotator on it.

    ``override_annotations`` and ``other_opts`` fall back to fresh empty
    dicts when left as None. ``override_annotations`` is handed to the run
    specification as its ``global_annotations``.

    Returns whatever ``Annotator.annotate()`` returns.
    """
    self.logger.info("Initializing Annotator...")

    # Substitute empty dicts for the None placeholders.
    global_annotations = dict() if override_annotations is None else override_annotations
    extra_opts = dict() if other_opts is None else other_opts

    annotator = Annotator()
    spec = RunSpecificationFactory.create_run_spec(
        inputFormat,
        outputFormat,
        inputFilename,
        outputFilename,
        default_annotations=default_annotations,
        datasource_dir=datasource_dir,
        global_annotations=global_annotations,
        is_skip_no_alts=is_skip_no_alts,
        other_opts=extra_opts)
    annotator.initialize(spec)

    self.logger.info("Annotation starting...")
    return annotator.annotate()
def testMissingFilter(self):
    """Tests that the missing FILTER fields are parsed correctly.

    Renders testdata/vcf/example.missing_filters.vcf with
    SimpleOutputRenderer and compares the result, column by column, with the
    expected TSV stored next to the input.
    """
    inputFilename = os.path.join("testdata", "vcf", "example.missing_filters.vcf")
    outputFilename = os.path.join("out", "example.missing_filters.out.tsv")
    expectedOutputFilename = os.path.join("testdata", "vcf", "example.expected.missing_filters.out.tsv")

    creator = VcfInputMutationCreator(inputFilename)
    creator.createMutations()
    renderer = SimpleOutputRenderer(outputFilename)
    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    annotator.annotate()

    # The number of comment lines reported by GenericTsvReader is used as the
    # header row index for pandas (assumes the comments all precede the
    # header -- TODO confirm against SimpleOutputRenderer).
    tsvReader = GenericTsvReader(outputFilename)
    current = pandas.read_csv(outputFilename, sep='\t', header=len(tsvReader.getCommentsAsList()))
    expected = pandas.read_csv(expectedOutputFilename, sep='\t')

    currentColNames = set(current.columns)
    expectedColNames = set(expected.columns)

    # Compare by value: "is 0" tested object identity against an int literal.
    self.assertEqual(len(currentColNames.symmetric_difference(expectedColNames)), 0,
                     "Should have the same columns")
    self.assertEqual(len(current.index), len(expected.index), "Should have the same number of rows")

    for colName in currentColNames:
        # A cell matches when the values are equal or both are null.
        matches = (current[colName] == expected[colName]) | (
            pandas.isnull(current[colName]) & pandas.isnull(expected[colName]))
        self.assertTrue(sum(matches) == len(current.index),
                        "Should have the same values in column " + colName)
def testAnnotationWithExampleVcf(self):
    """Tests whether parsed annotations match the actual annotations in a simple TSV.

    Missing format fields yield -->"" ".,." --> ","

    Renders testdata/vcf/example.vcf with SimpleOutputRenderer and compares
    the result, column by column, with the expected TSV stored next to the
    input.
    """
    inputFilename = os.path.join("testdata", "vcf", "example.vcf")
    outputFilename = os.path.join("out", "example.out.tsv")
    expectedOutputFilename = os.path.join("testdata", "vcf", "example.expected.out.tsv")

    creator = VcfInputMutationCreator(inputFilename)
    creator.createMutations()
    renderer = SimpleOutputRenderer(outputFilename)
    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    annotator.annotate()

    # The number of comment lines reported by GenericTsvReader is used as the
    # header row index for pandas (assumes the comments all precede the
    # header -- TODO confirm against SimpleOutputRenderer).
    tsvReader = GenericTsvReader(outputFilename)
    current = pandas.read_csv(outputFilename, sep='\t', header=len(tsvReader.getCommentsAsList()))
    expected = pandas.read_csv(expectedOutputFilename, sep='\t')

    currentColNames = set(current.columns)
    expectedColNames = set(expected.columns)

    # Compare by value: "is 0" tested object identity against an int literal.
    self.assertEqual(len(currentColNames.symmetric_difference(expectedColNames)), 0,
                     "Should have the same columns")
    self.assertEqual(len(current.index), len(expected.index), "Should have the same number of rows")

    for colName in currentColNames:
        # A cell matches when the values are equal or both are null.
        matches = (current[colName] == expected[colName]) | (
            pandas.isnull(current[colName]) & pandas.isnull(expected[colName]))
        self.assertTrue(sum(matches) == len(current.index),
                        "Should have the same values in column " + colName + ": \n" + str(
                            current[colName]) + "\nvs\n" + str(expected[colName]))
def testFullSnpVcf(self):
    """Run SNP call stats (maflite) all the way through TCGA VCF rendering.

    Only checks that the output file was created.
    """
    vcfPath = "out/TCGAVCFTest.snp.vcf"
    mutationSource = MafliteInputMutationCreator("testdata/Test.call_stats.trim.txt")
    vcfRenderer = TcgaVcfOutputRenderer(vcfPath)
    testDatasources = self._createDatasourcesForTesting()

    annotator = Annotator()
    annotator.setInputCreator(mutationSource)
    annotator.setOutputRenderer(vcfRenderer)
    annotator.setManualAnnotations(self._createManualAnnotations())
    for datasource in testDatasources:
        annotator.addDatasource(datasource)
    annotator.annotate()

    outputExists = os.path.exists(vcfPath)
    self.assertTrue(outputExists)
def test_proper_conversion_vcf_to_maf_with_collapse_filter_cols(self):
    """Check the FILTER column rendering when collapse-filter-cols is enabled.

    Converts the example VCF to a TCGA MAF with ``collapse_filter_cols`` set
    and requires every output row to have an ``i_filter`` value of either
    'PASS' or 'q10'.
    """
    vcf_path = 'testdata/vcf/example.vcf'
    maf_path = 'out/example.one_filter_col.maf.txt'

    annotator = Annotator()
    collapse_opts = {'collapse_filter_cols': True}
    spec = RunSpecificationFactory.create_run_spec(
        'VCF', 'TCGAMAF', vcf_path, maf_path, other_opts=collapse_opts)
    annotator.initialize(spec)
    annotator.annotate()

    rendered_rows = GenericTsvReader(maf_path)
    for row in rendered_rows:
        self.assertIn('i_filter', row)
        filter_is_known = row['i_filter'] in ['PASS', 'q10']
        self.assertTrue(filter_is_known)
def test_annotating_uniprot_test_file(self):
    """Test variants with known issues with older version of UniProt datasource.

    This test will fail if using older version of uniprot datasource (pre-2014).
    Each output row must have a UniProt_AApos other than "0" and, apart from
    the row at index 4, a SwissProt_entry_Id ending in "HUMAN".
    """
    datasource_root = TestUtils.createUnitTestConfig().get('DEFAULT',"dbDir")
    annotator = Annotator()
    annotated_path = "out/uniprot_recovery.maf.annotated"
    spec = RunSpecificationFactory.create_run_spec(
        "MAFLITE",
        "TCGAMAF",
        "testdata/maflite/uniprot_recovery.maflite",
        annotated_path,
        datasource_dir=datasource_root,
        tx_mode=TranscriptProvider.TX_MODE_BEST_EFFECT)
    annotator.initialize(spec)
    annotator.annotate()

    for row_index, row in enumerate(GenericTsvReader(annotated_path)):
        self.assertTrue(row['UniProt_AApos'] != "0")

        #TODO: The fourth entry is currently not picking up the uniprot entry for this. Remove the skip below once issue #253 is addressed
        if row_index == 4:
            continue
        self.assertTrue(row['SwissProt_entry_Id'].endswith("HUMAN"))
def test_proper_conversion_vcf_to_maf_with_collapse_filter_cols(self):
    """Test FILTER col is properly rendered when using the collapse-filter-cols option.

    Converts the example VCF to a TCGA MAF with ``collapse_filter_cols`` set
    and requires every output row to have an ``i_filter`` value of either
    'PASS' or 'q10'.
    """
    input_fname = 'testdata/vcf/example.vcf'
    output_fname = 'out/example.one_filter_col.maf.txt'

    annotator = Annotator()
    other_opts = {'collapse_filter_cols': True}
    # The function-local import of RunSpecification was dropped: nothing in
    # this test referenced the name.
    run_spec = RunSpecificationFactory.create_run_spec(
        'VCF', 'TCGAMAF', input_fname, output_fname, other_opts=other_opts)
    annotator.initialize(run_spec)
    annotator.annotate()

    tsv_reader = GenericTsvReader(output_fname)
    for line_dict in tsv_reader:
        self.assertIn('i_filter', line_dict)
        # assertIn reports the offending value on failure, unlike assertTrue(x in ...).
        self.assertIn(line_dict['i_filter'], ['PASS', 'q10'])
def testTCGAMAFAsInputAndQuickAnnotate(self):
    """Read a TCGA MAF through the MAFLITE input creator, annotate it, and render it again.

    One test datasource is attached. The rendered MAF must be non-empty,
    mention '#version' in its comments, expose the two new i_TJ_Data_*
    columns, and contain exactly two more lines (rows plus comments) than
    the input file.
    """
    sourcePath = "testdata/maf/Patient0.maf.annotated"
    mutationSource = MafliteInputMutationCreator(sourcePath, 'configs/maflite_input.config')
    renderedPath = "out/testTCGAMAFAsInputAndQuickAnnotate.tsv"
    mafRenderer = TcgaMafOutputRenderer(renderedPath, 'configs/tcgaMAF2.4_output.config')

    annotator = Annotator()
    annotator.setInputCreator(mutationSource)
    annotator.setOutputRenderer(mafRenderer)
    geneDatasource = DatasourceFactory.createDatasource(
        "testdata/thaga_janakari_gene_ds/hg19/tj_data.config",
        "testdata/thaga_janakari_gene_ds/hg19/")
    annotator.addDatasource(geneDatasource)
    annotator.annotate()

    renderedSize = os.stat(renderedPath).st_size
    self.assertTrue(renderedSize > 0, "Generated MAF file (" + renderedPath + ") is empty.")

    sourceReader = GenericTsvReader(sourcePath)
    renderedReader = GenericTsvReader(renderedPath)

    hasVersion = renderedReader.getComments().find('#version') != -1
    self.assertTrue(hasVersion, "First line did not specify a version number")

    for newField in ("i_TJ_Data_Why", "i_TJ_Data_Who"):
        self.assertTrue(newField in renderedReader.getFieldNames(),
                        "New field missing (" + newField + ") from header")

    # Count data rows first (output, then input), then add the comment lines.
    ctrOut = sum(1 for _ in renderedReader)
    ctrIn = sum(1 for _ in sourceReader)
    ctrIn += len(sourceReader.getCommentsAsList())
    ctrOut += len(renderedReader.getCommentsAsList())

    self.assertTrue(
        ctrOut == (ctrIn + 2),
        "Output file should have same number of lines plus two (for maf version and Oncotator version comments) as input file. (In,Out): " + str(ctrIn) + ", " + str(ctrOut))
def testAnotherFullSNP(self):
    """Render another SNP call stats file as a TCGA VCF.

    Passes when no exception is thrown and the output file exists and is
    non-empty.
    """
    callStatsPath = "testdata/maflite/Another.call_stats.txt"
    vcfPath = "out/Another.call_stats.out.vcf"
    mutationSource = MafliteInputMutationCreator(callStatsPath)
    vcfRenderer = TcgaVcfOutputRenderer(vcfPath)
    testDatasources = self._createDatasourcesForTesting()

    annotator = Annotator()
    annotator.setInputCreator(mutationSource)
    annotator.setOutputRenderer(vcfRenderer)
    annotator.setManualAnnotations(self._createManualAnnotations())
    for datasource in testDatasources:
        annotator.addDatasource(datasource)
    annotator.annotate()

    self.assertTrue(os.path.exists(vcfPath))
    vcfSize = os.stat(vcfPath).st_size
    self.assertTrue(vcfSize > 0, "Generated VCF file (" + vcfPath + ") is empty.")
def testEmptyInput(self):
    """Render an empty maflite as a TCGA VCF.

    The output file must exist and must not be zero bytes.
    """
    maflitePath = "testdata/maflite/empty.maflite"
    vcfPath = "out/empty.vcf"
    mutationSource = MafliteInputMutationCreator(maflitePath)
    vcfRenderer = TcgaVcfOutputRenderer(vcfPath)
    testDatasources = self._createDatasourcesForTesting()

    annotator = Annotator()
    annotator.setInputCreator(mutationSource)
    annotator.setOutputRenderer(vcfRenderer)
    annotator.setManualAnnotations(self._createManualAnnotations())
    for datasource in testDatasources:
        annotator.addDatasource(datasource)
    annotator.annotate()

    self.assertTrue(os.path.exists(vcfPath))
    vcfSize = os.stat(vcfPath).st_size
    self.assertTrue(vcfSize > 0, "Generated VCF file (" + vcfPath + ") is empty.")
def _annotateTest(self, inputFilename, outputFilename, datasource_dir,
                  inputFormat="MAFLITE", outputFormat="TCGAMAF",
                  default_annotations=TCGA_MAF_DEFAULTS, override_annotations=None,
                  is_skip_no_alts=False):
    """Build a run specification from the arguments and run an Annotator on it.

    ``override_annotations`` falls back to a fresh empty dict when left as
    None and is handed to the run specification as ``globalAnnotations``.

    Returns whatever ``Annotator.annotate()`` returns.
    """
    self.logger.info("Initializing Annotator...")

    # Substitute an empty dict for the None placeholder.
    global_annotations = dict() if override_annotations is None else override_annotations

    annotator = Annotator()
    spec = RunSpecificationFactory.create_run_spec(
        inputFormat,
        outputFormat,
        inputFilename,
        outputFilename,
        defaultAnnotations=default_annotations,
        datasourceDir=datasource_dir,
        globalAnnotations=global_annotations,
        is_skip_no_alts=is_skip_no_alts)
    annotator.initialize(spec)

    self.logger.info("Annotation starting...")
    return annotator.annotate()
def testAnnotationWithMafliteWithTrailingSpaces(self):
    """Tests the ability to annotate a VCF file that contains trailing spaces in ref and alt alleles.

    Runs a VCF-to-VCF annotation (genotype inference disabled) and checks
    that four expected records appear in the output with the allele columns
    free of trailing whitespace.
    """
    db_dir = self.config.get('DEFAULT',"dbDir")
    inputFilename = os.path.join("testdata", "vcf", "example.trailing_whitespace_in_alleles.vcf")
    outputFilename = os.path.join("out", "example.trailing_whitespace_in_alleles.vcf")

    annotator = Annotator()
    from oncotator.utils.RunSpecification import RunSpecification
    run_spec = RunSpecificationFactory.create_run_spec(
        "VCF", "VCF", inputFilename, outputFilename,
        datasource_dir=db_dir,
        annotating_type=RunSpecification.ANNOTATE_MUTATIONS,
        other_opts={'vcf_out_infer_genotypes': False})
    annotator.initialize(run_spec)
    annotator.annotate()

    # Check output. The with-block closes the handle, which the previous
    # open(...).read() one-liner never did.
    with open(outputFilename) as vcf_fp:
        vcf_data = vcf_fp.read()

    # Each pattern is anchored on the preceding newline, so it matches the
    # start of a record line: CHROM, POS, ID, REF, ALT.
    self.assertIn('\n1\t14907\t.\tA\tG\t', vcf_data)
    self.assertIn('\n1\t14930\trs150145850\tA\tG\t', vcf_data)
    self.assertIn('\n1\t14933\trs138566748\tG\tA\t', vcf_data)
    self.assertIn('\n1\t14948\trs148911281\tG\tA\t', vcf_data)
def testAnnotationWithExampleVcf(self):
    """Tests whether parsed annotations match the actual annotations in a simple TSV.

    Missing format fields yield -->"" ".,." --> ","

    Renders testdata/vcf/example.vcf with SimpleOutputRenderer and compares
    the result, column by column, with the expected TSV stored next to the
    input.
    """
    inputFilename = os.path.join("testdata", "vcf", "example.vcf")
    outputFilename = os.path.join("out", "example.out.tsv")
    expectedOutputFilename = os.path.join("testdata", "vcf", "example.expected.out.tsv")

    creator = VcfInputMutationCreator(inputFilename)
    creator.createMutations()
    renderer = SimpleOutputRenderer(outputFilename)
    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    annotator.annotate()

    # The number of comment lines reported by GenericTsvReader is used as the
    # header row index for pandas (assumes the comments all precede the
    # header -- TODO confirm against SimpleOutputRenderer).
    tsvReader = GenericTsvReader(outputFilename)
    current = pandas.read_csv(outputFilename, sep='\t', header=len(tsvReader.getCommentsAsList()))
    expected = pandas.read_csv(expectedOutputFilename, sep='\t')

    currentColNames = set(current.columns)
    expectedColNames = set(expected.columns)

    # Compare by value: "is 0" tested object identity against an int literal.
    self.assertEqual(len(currentColNames.symmetric_difference(expectedColNames)), 0,
                     "Should have the same columns")
    self.assertEqual(len(current.index), len(expected.index), "Should have the same number of rows")

    for colName in currentColNames:
        # A cell matches when the values are equal or both are null.
        matches = (current[colName] == expected[colName]) | (
            pandas.isnull(current[colName]) & pandas.isnull(expected[colName]))
        self.assertTrue(sum(matches) == len(current.index),
                        "Should have the same values in column " + colName + ": \n" + str(
                            current[colName]) + "\nvs\n" + str(expected[colName]))
def testMissingFilter(self):
    """Tests that the missing FILTER fields are parsed correctly.

    Renders testdata/vcf/example.missing_filters.vcf with
    SimpleOutputRenderer and compares the result, column by column, with the
    expected TSV stored next to the input.
    """
    inputFilename = os.path.join("testdata", "vcf", "example.missing_filters.vcf")
    outputFilename = os.path.join("out", "example.missing_filters.out.tsv")
    expectedOutputFilename = os.path.join("testdata", "vcf", "example.expected.missing_filters.out.tsv")

    creator = VcfInputMutationCreator(inputFilename)
    creator.createMutations()
    renderer = SimpleOutputRenderer(outputFilename)
    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    annotator.annotate()

    # The number of comment lines reported by GenericTsvReader is used as the
    # header row index for pandas (assumes the comments all precede the
    # header -- TODO confirm against SimpleOutputRenderer).
    tsvReader = GenericTsvReader(outputFilename)
    current = pandas.read_csv(outputFilename, sep='\t', header=len(tsvReader.getCommentsAsList()))
    expected = pandas.read_csv(expectedOutputFilename, sep='\t')

    currentColNames = set(current.columns)
    expectedColNames = set(expected.columns)

    # Compare by value: "is 0" tested object identity against an int literal.
    self.assertEqual(len(currentColNames.symmetric_difference(expectedColNames)), 0,
                     "Should have the same columns")
    self.assertEqual(len(current.index), len(expected.index), "Should have the same number of rows")

    for colName in currentColNames:
        # A cell matches when the values are equal or both are null.
        matches = (current[colName] == expected[colName]) | (
            pandas.isnull(current[colName]) & pandas.isnull(expected[colName]))
        self.assertTrue(sum(matches) == len(current.index),
                        "Should have the same values in column " + colName)