def testManualAnnotations(self):
    """Check that manual annotations set on the Annotator reach the output.

    A maflite input is rendered to a simple TSV with four manual
    annotations configured. The output must be non-empty, and every row
    must carry each manual annotation with exactly the configured value
    (neither blank nor "__UNKNOWN__").
    """
    overrides = {'source': 'Capture', 'status': 'Somatic', 'phase': 'Phase_I', 'sequencer': 'Illumina GAIIx'}

    annotator = Annotator()
    annotator.setManualAnnotations(overrides)
    maflite_input = MafliteInputMutationCreator('testdata/maflite/Patient0.snp.maf.txt')
    tsv_renderer = SimpleOutputRenderer("out/testManualAnnotationsFile.tsv")
    annotator.setInputCreator(maflite_input)
    annotator.setOutputRenderer(tsv_renderer)
    testOutputFilename = annotator.annotate()

    output_size = os.path.getsize(testOutputFilename)
    self.assertTrue(output_size > 0, "Generated TSV file (" + testOutputFilename + ") is empty.")

    # Line numbers in the failure messages are 1-based.
    for line_no, row in enumerate(GenericTsvReader(testOutputFilename), 1):
        where = str(line_no)
        for key, expected in overrides.items():
            self.assertTrue(row[key] != "__UNKNOWN__",
                            "__UNKNOWN__ value seen on line " + where + ", when it should be populated: " + key)
            self.assertTrue(row[key] != "",
                            "Blank value seen on line " + where + ", when it should be populated: " + key)
            self.assertTrue(row[key] == expected,
                            "Value for " + key + " on line " + where + " did not match override: " + str(
                                row[key]) + " <> " + str(expected))
def testAnnotationWithMafliteWithTrailingSpaces(self):
    """Annotate a VCF whose ref and alt alleles contain trailing spaces.

    The input is run VCF -> VCF through a run spec that uses the
    configured ``dbDir`` and disables genotype inference on output. The
    rendered file must contain the four expected records, each with its
    ref and alt allele immediately followed by a tab.
    """
    from oncotator.utils.RunSpecification import RunSpecification

    db_dir = self.config.get('DEFAULT', "dbDir")
    inputFilename = os.path.join("testdata", "vcf", "example.trailing_whitespace_in_alleles.vcf")
    outputFilename = os.path.join("out", "example.trailing_whitespace_in_alleles.vcf")

    annotator = Annotator()
    run_spec = RunSpecificationFactory.create_run_spec(
        "VCF", "VCF", inputFilename, outputFilename,
        datasource_dir=db_dir,
        annotating_type=RunSpecification.ANNOTATE_MUTATIONS,
        other_opts={'vcf_out_infer_genotypes': False})
    annotator.initialize(run_spec)
    annotator.annotate()

    # Read the rendered file inside a context manager so the handle is
    # closed even when one of the assertions below fails.
    with open(outputFilename) as vcf_fp:
        vcf_data = vcf_fp.read()

    expected_records = (
        '\n1\t14907\t.\tA\tG\t',
        '\n1\t14930\trs150145850\tA\tG\t',
        '\n1\t14933\trs138566748\tG\tA\t',
        '\n1\t14948\trs148911281\tG\tA\t',
    )
    for record_prefix in expected_records:
        self.assertIn(record_prefix, vcf_data)
def test_full_seg_file_annotations(self):
    """Read a seg file, annotate its segments and render them as SIMPLE_TSV.

    Checks that the output header carries the required seg columns and
    that every row has non-blank ``start``/``end`` values and a ``genes``
    column.
    """
    seg_input = "testdata/seg/Patient0.seg.txt"
    tsv_output = "out/test_full_seg_file_annotations.tsv"
    db_dir = self.config.get('DEFAULT', "dbDir")

    # Remove any output left behind by a previous run.
    if os.path.exists(tsv_output):
        os.remove(tsv_output)

    annotator = Annotator()
    run_spec = RunSpecificationFactory.create_run_spec(
        "SEG_FILE", "SIMPLE_TSV", seg_input, tsv_output,
        datasource_dir=db_dir,
        annotating_type=RunSpecification.ANNOTATE_SEGMENTS)
    annotator.initialize(run_spec)
    annotator.annotate()

    # Now check the output
    output_reader = GenericTsvReader(tsv_output)
    headers = output_reader.getFieldNames()
    for required in ("Sample", "Num_Probes", "Segment_Mean"):
        self.assertIn(required, headers)

    for row in output_reader:
        for position_field in ('start', 'end'):
            self.assertIsNotNone(row[position_field])
            self.assertNotEqual(row[position_field].strip(), "")
        self.assertIn("genes", row.keys())
        # NOTE(review): str.split always yields at least one element, so
        # this assertion cannot fail -- confirm whether a stricter check
        # (e.g. a non-blank gene list) was intended.
        gene_tokens = row["genes"].split(",")
        self.assertTrue(len(gene_tokens) > 0)
def testAnnotateListOfMutations(self):
    """Annotate an in-memory list of mutations without input or output files.

    A run spec is initialized with ``None`` for both input and output and
    with the datasources found under the configured ``dbDir``. A single
    SNP is fed through ``annotate_mutations`` and the first annotated
    mutation must expose a ``gene`` value.
    """
    # Locate the datasource directory and create a runspec
    dbDir = self.config.get("DEFAULT", "dbDir")
    ds = DatasourceFactory.createDatasources(dbDir)
    runSpec = RunSpecification()
    runSpec.initialize(None, None, datasources=ds)

    # Initialize the annotator with the runspec
    annotator = Annotator()
    annotator.initialize(runSpec)

    m = MutationData()
    m.chr = "1"
    m.start = "12941796"
    m.end = "12941796"
    m.alt_allele = "G"
    m.ref_allele = "T"

    annotated = annotator.annotate_mutations([m])

    # Use the builtin next() rather than the Python 2-only .next() method
    # so the iterator protocol is honoured on either interpreter line.
    m2 = next(annotated)
    self.assertIsNotNone(m2.get("gene", None))
def testFullSnpVcf(self):
    """Run SNP call stats (maflite) all the way through TCGA VCF creation.

    Checks that the output file was created and that each record read
    back from it has a QUAL value (i.e. not ".").
    """
    outputFilename = "out/TCGAVCFTest.snp.vcf"
    callStatsIn = MafliteInputMutationCreator("testdata/Test.call_stats.trim.txt")
    vcfOR = TcgaVcfOutputRenderer(outputFilename)
    datasources = self._createDatasourcesForTesting()

    annotator = Annotator()
    annotator.setInputCreator(callStatsIn)
    annotator.setOutputRenderer(vcfOR)
    annotator.setManualAnnotations(self._createManualAnnotations())
    for ds in datasources:
        annotator.addDatasource(ds)
    annotator.annotate()

    self.assertTrue(os.path.exists(outputFilename))

    # NOTE(review): the number of VCF records inspected is driven by the
    # *indel* maflite, not by the call-stats input rendered above --
    # confirm that this pairing is intentional.
    maflite_ic = MafliteInputMutationCreator("testdata/maflite/Patient0.indel.maf.txt")
    muts = maflite_ic.createMutations()

    # Keep the handle in a context manager so it is closed even when an
    # assertion fails or the reader runs out of records.
    with open(outputFilename, 'r') as vcf_fp:
        vcf_reader = vcf.Reader(vcf_fp)
        for _ in muts:
            rec = next(vcf_reader)
            # All records should have QUAL with a value (i.e. NOT ".")
            self.assertIsNotNone(rec.QUAL)
def test_overwriting_muts(self):
    """Annotate from a datasource that overwrites an annotation present in the input.

    Annotation overwriting is enabled in the run spec options. After the
    run, no output row may have "Tromokratis" in the ``TJ_Data_Who``
    column.
    """
    # We will have an input with a "Who" annotation that this datasource will try to write.
    ds_dir = "testdata/thaga_janakari_gene_ds/hg19/"
    gene_ds = DatasourceFactory.createDatasource(
        "testdata/thaga_janakari_gene_ds/hg19/tj_data.config", ds_dir)

    output_filename = "out/who_alt1_vs_alt2.maf.annotated"
    run_options = {
        OptionConstants.ALLOW_ANNOTATION_OVERWRITING: True,
        OptionConstants.NO_PREPEND: True,
    }
    run_spec = RunSpecificationFactory.create_run_spec_given_datasources(
        "MAFLITE", "TCGAMAF",
        "testdata/maflite/who_alt1_vs_alt2.maflite", output_filename,
        datasource_list=[gene_ds], other_opts=run_options)

    annotator = Annotator()
    annotator.initialize(run_spec)
    annotator.annotate()

    for row in GenericTsvReader(output_filename):
        self.assertNotEqual(row.get('TJ_Data_Who', ""), "Tromokratis")
def testCreationAndAnnotation(self):
    """Create the UniProt natural-variation datasource and run a simple annotation.

    A tiny maflite is annotated with a transcript datasource followed by
    the natural-variation datasource. The output must exist, contain
    exactly one row, and that row's natural-variation column must hold
    the two expected entries in any order.
    """
    natvar_column = "UniProt_NatVar_natural_variations"
    expected_entries = sorted("R -> RR (in EDMD2).|R -> Q (in EDMD2).".split("|"))

    outputFilename = 'out/genericGeneProteinPositionTest.out.tsv'
    gafDS = TestUtils.createTranscriptProviderDatasource(self.config)
    gppDS = DatasourceFactory.createDatasource(
        "testdata/simple_uniprot_natvar/simple_uniprot_natvar.config",
        "testdata/simple_uniprot_natvar/")

    annotator = Annotator()
    annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/tiny_maflite_natvar.maf.tsv'))
    annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
    for datasource in (gafDS, gppDS):
        annotator.addDatasource(datasource)
    testFilename = annotator.annotate()

    # Make sure that some values were populated
    self.assertTrue(os.path.exists(testFilename))

    rows_seen = 0
    for row in GenericTsvReader(testFilename):
        observed = row[natvar_column]
        self.assertTrue(sorted(observed.split("|")) == expected_entries,
                        "Annotation value did not match: " + observed)
        rows_seen += 1

    self.assertTrue(rows_seen == 1, "Number of mutations incorrect (1): " + str(rows_seen))
def testBasicAnnotation(self):
    """Annotate from a generic TSV datasource keyed on transcript.

    Only the output header is checked: it must contain the two
    ``refseq_test_*`` columns.
    """
    outputFilename = 'out/genericTranscriptTest.out.tsv'

    # We need a gaf data source to annotate gene
    gafDatasource = TestUtils.createTranscriptProviderDatasource(config=self.config)
    transcriptDS = DatasourceFactory.createDatasource(
        "testdata/small_transcript_tsv_ds/small_transcript_tsv_ds.config",
        "testdata/small_transcript_tsv_ds/")

    annotator = Annotator()
    annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/Patient0.snp.maf.txt'))
    annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
    annotator.addDatasource(gafDatasource)
    annotator.addDatasource(transcriptDS)

    # From here on, read whichever file the annotator reports it wrote.
    outputFilename = annotator.annotate()

    headers = GenericTsvReader(outputFilename).getFieldNames()
    for expected_header in ("refseq_test_mRNA_Id", "refseq_test_prot_Id"):
        self.assertTrue(expected_header in headers,
                        expected_header + " not found in headers: " + str(headers))
def testFullIndelVcf(self):
    """Run an indel maflite all the way through TCGA VCF creation.

    After rendering, the positions in the VCF are compared with the
    maflite input: a deletion must appear one base earlier than its
    maflite start, any other variant at the same position.
    """
    indel_maflite = "testdata/maflite/Patient0.indel.maf.txt"
    outputFilename = "out/TCGAVCFTest.indel.vcf"
    callStatsIn = MafliteInputMutationCreator(indel_maflite)
    vcfOR = TcgaVcfOutputRenderer(outputFilename)
    datasources = self._createDatasourcesForTesting()

    annotator = Annotator()
    annotator.setInputCreator(callStatsIn)
    annotator.setOutputRenderer(vcfOR)
    annotator.setManualAnnotations(self._createManualAnnotations())
    for ds in datasources:
        annotator.addDatasource(ds)
    annotator.annotate()

    self.assertTrue(os.path.exists(outputFilename))

    # Check that the deletions have position decremented by one from what is present in the maflite
    # Checking that 1 36643701 in the maflite (a deletion) becomes 1 36643700 in the vcf, but that the others are
    # the same.
    maflite_ic = MafliteInputMutationCreator(indel_maflite)
    muts = maflite_ic.createMutations()

    # Collect the positions while the file is open, then let the context
    # manager close the handle. Only membership is tested below, so a set
    # is sufficient.
    with open(outputFilename, 'r') as vcf_fp:
        vcf_pos = set(int(rec.POS) for rec in vcf.Reader(vcf_fp))

    for m in muts:
        # If the variant is a deletion, then the vcf position should be the same as maflite minus one. Otherwise, the same.
        is_variant_deletion = m.alt_allele in ("", "-", ".")
        if is_variant_deletion:
            self.assertTrue((int(m.start) - 1) in vcf_pos,
                            "Deletion was not correct for " + m.chr + ":" + m.start)
        else:
            self.assertTrue(int(m.start) in vcf_pos,
                            "Insertion was not correct for " + m.chr + ":" + m.start)
def test_rendering_with_exons(self):
    """Render a seg file whose segment end points fall inside exons as a gene list.

    Every output row must have a non-blank ``segment_start``; rows whose
    ``segment_end_gene`` is MAPK1 must report ``segment_end_exon`` "8+".
    """
    seg_input = "testdata/seg/Middle_of_exon.seg.txt"
    gene_list_output = "out/test_exon_seg2.gene_list.tsv"
    db_dir = self.config.get('DEFAULT', "dbDir")

    # Remove any output left behind by a previous run.
    if os.path.exists(gene_list_output):
        os.remove(gene_list_output)

    annotator = Annotator()
    # NOTE(review): the keyword is spelled ``datasourceDir`` here, while
    # some other tests pass ``datasource_dir`` -- confirm which spelling
    # RunSpecificationFactory.create_run_spec accepts.
    run_spec = RunSpecificationFactory.create_run_spec(
        "SEG_FILE", "GENE_LIST", seg_input, gene_list_output,
        datasourceDir=db_dir,
        annotating_type=RunSpecification.ANNOTATE_SEGMENTS)
    annotator.initialize(run_spec)
    annotator.annotate()

    # Now check the output
    output_reader = GenericTsvReader(gene_list_output)
    headers = output_reader.getFieldNames()
    for row in output_reader:
        self.assertIsNotNone(row['segment_start'])
        self.assertNotEqual(row['segment_start'].strip(), "")
        if row['segment_end_gene'] != "MAPK1":
            continue
        end_exon = row['segment_end_exon'].strip()
        self.assertTrue(end_exon == "8+", "Should have been 8+, but saw: %s" % end_exon)
def test_basic_rendering(self):
    """Render a basic seg file as a gene list and sanity-check every row.

    Each row must have non-blank segment start/end values, a non-empty
    ``gene``, a ``segment_num_probes`` that parses to a non-zero float,
    and ``sample`` equal to "Patient0".
    """
    seg_input = "testdata/seg/Patient0.seg.txt"
    gene_list_output = "out/test_basic_rendering.gene_list.tsv"
    db_dir = self.config.get('DEFAULT', "dbDir")

    # Remove any output left behind by a previous run.
    if os.path.exists(gene_list_output):
        os.remove(gene_list_output)

    annotator = Annotator()
    # NOTE(review): the keyword is spelled ``datasourceDir`` here, while
    # some other tests pass ``datasource_dir`` -- confirm which spelling
    # RunSpecificationFactory.create_run_spec accepts.
    run_spec = RunSpecificationFactory.create_run_spec(
        "SEG_FILE", "GENE_LIST", seg_input, gene_list_output,
        datasourceDir=db_dir,
        annotating_type=RunSpecification.ANNOTATE_SEGMENTS)
    annotator.initialize(run_spec)
    annotator.annotate()

    # Now check the output
    output_reader = GenericTsvReader(gene_list_output)
    headers = output_reader.getFieldNames()
    for row in output_reader:
        for boundary in ('segment_start', 'segment_end'):
            self.assertIsNotNone(row[boundary])
            self.assertNotEqual(row[boundary].strip(), "")
        self.assertIn("gene", row.keys())
        self.assertTrue(len(row["gene"]) > 0)
        # float() raises on unparseable text; a parsed value of 0 fails the assertion.
        probe_count = float(row["segment_num_probes"])
        self.assertTrue(probe_count)
        self.assertTrue(row['sample'] == "Patient0")
def test_no_overwriting_muts(self):
    """Expect a DuplicateAnnotationException when overwriting is disallowed.

    The input already carries a value that the datasource tries to write;
    with annotation overwriting switched off, ``annotate`` must raise.
    """
    # We will have an input with a "Who" annotation that this datasource will try to write.
    ds_dir = "testdata/thaga_janakari_gene_ds/hg19/"
    gene_ds = DatasourceFactory.createDatasource(
        "testdata/thaga_janakari_gene_ds/hg19/tj_data.config", ds_dir)

    run_options = {
        OptionConstants.ALLOW_ANNOTATION_OVERWRITING: False,
        OptionConstants.NO_PREPEND: True,
    }
    run_spec = RunSpecificationFactory.create_run_spec_given_datasources(
        "MAFLITE", "TCGAMAF",
        "testdata/maflite/who_alt1_vs_alt2.maflite", "out/who_alt1_vs_alt2.maf.annotated",
        datasource_list=[gene_ds], other_opts=run_options)

    annotator = Annotator()
    annotator.initialize(run_spec)

    with self.assertRaises(DuplicateAnnotationException):
        annotator.annotate()
def testAnnotationRoundTripEmpty(self):
    """Round-trip a VCF through the annotator using a datasource dir that does not exist.

    The annotated VCF is read back with a VcfInputMutationCreator using
    the same options, and must yield at least one mutation.
    """
    source_vcf = os.path.join("testdata", "m2_support", "NA12878.ob_filtered.vcf")
    annotated_vcf = os.path.join("out", "test_round_trip_empty_annotated.vcf")
    other_opts = {OptionConstants.COLLAPSE_NUMBER_ANNOTATIONS: True}

    run_spec = RunSpecificationFactory.create_run_spec(
        "VCF", "VCF", source_vcf, annotated_vcf,
        datasource_dir="THIS_DIR_DOES_NOT_EXIST__",
        genomeBuild="hg19",
        other_opts=other_opts)
    annotator = Annotator()
    annotator.initialize(run_spec)
    annotated_filename = annotator.annotate()

    # Second leg of the round trip: parse what was just written.
    reread_creator = VcfInputMutationCreator(
        annotated_filename,
        MutationDataFactory(allow_overwriting=True),
        other_options=other_opts)
    reread_muts = list(reread_creator.createMutations())
    self.assertTrue(len(reread_muts) > 0)
def testDuplicateAnnotation(self):
    """Check that duplicate annotations in a VCF are parsed correctly.

    The rendered TSV must expose both ``variant_status`` and
    ``sample_variant_status`` in the header and in the first row, with
    the values "2" and "0" respectively.
    """
    vcf_path = os.path.join("testdata", "vcf", "example.duplicate_annotation.vcf")
    tsv_path = os.path.join("out", "example.duplicate_annotation.out.tsv")

    creator = VcfInputMutationCreator(vcf_path)
    # Return value intentionally unused, as in the original test.
    creator.createMutations()
    renderer = SimpleOutputRenderer(tsv_path)

    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    annotator.annotate()

    tsvReader = GenericTsvReader(tsv_path)
    status_fields = ("variant_status", "sample_variant_status")

    fieldnames = tsvReader.getFieldNames()
    header_messages = ("variant_status field is missing in the header.",
                       "sample_variant_status is missing in the header.")
    for field, message in zip(status_fields, header_messages):
        self.assertTrue(field in fieldnames, message)

    row = tsvReader.next()
    row_messages = ("variant_status field is missing in the row.",
                    "sample_variant_status is missing in the row.")
    for field, message in zip(status_fields, row_messages):
        self.assertTrue(field in row, message)

    self.assertEqual("2", row["variant_status"], "Incorrect value of variant_status.")
    self.assertEqual("0", row["sample_variant_status"], "Incorrect value of sample_variant_status")
def testSNPsAndIndelStartAndEndPos(self):
    """Check start/end positions of SNPs and indels parsed from a VCF.

    Positions are expected to follow the NCI MAF specification
    (https://wiki.nci.nih.gov/display/TCGA/Mutation+Annotation+Format+(MAF)+Specification).
    Only rows whose start position is one of the three known values are
    checked.
    """
    vcf_path = os.path.join("testdata", "vcf", "example.snps.indels.vcf")
    tsv_path = os.path.join("out", "example.snps.indels.out.tsv")

    creator = VcfInputMutationCreator(vcf_path)
    # Return value intentionally unused, as in the original test.
    creator.createMutations()
    renderer = SimpleOutputRenderer(tsv_path)

    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    annotator.annotate()

    # Expected end position keyed by start position.
    expected_end_by_start = {
        "16890445": "16890445",
        "154524458": "154524459",
        "114189432": "114189433",
    }
    for row in GenericTsvReader(tsv_path):
        expected_end = expected_end_by_start.get(row['start'])
        if expected_end is None:
            continue
        self.assertEqual(row["end"], expected_end,
                         "The value should be %s but it was %s." % (expected_end, row["end"]))
def _simple_annotate(self, is_skip_no_alts):
    """Annotate two identical SNPs with no datasources and count the results.

    The first mutation additionally carries an ``alt_allele_seen``
    annotation set to "False"; the second does not.

    :param is_skip_no_alts: forwarded to ``RunSpecification.initialize``.
    :return: number of mutations yielded by ``annotate_mutations``.
    """
    def _build_snp():
        # Both mutations share the same locus and alleles.
        snp = MutationData()
        snp.chr = "1"
        snp.start = "12941796"
        snp.end = "12941796"
        snp.alt_allele = "G"
        snp.ref_allele = "T"
        return snp

    runSpec = RunSpecification()
    runSpec.initialize(None, None, datasources=[], is_skip_no_alts=is_skip_no_alts)

    # Initialize the annotator with the runspec
    annotator = Annotator()
    annotator.initialize(runSpec)

    flagged = _build_snp()
    flagged.createAnnotation("alt_allele_seen", "False")
    unflagged = _build_snp()

    annotated = annotator.annotate_mutations([flagged, unflagged])
    return sum(1 for _ in annotated)
def testBlankAnnotatorInit(self):
    """Annotate with no datasources and check the number of output lines.

    An extremely simple scenario where no additional annotations are
    needed. The rendered TSV must contain one line per variant plus the
    extra (header and comment) lines.
    """
    self.logger.info("Starting Blank Annotator Init Test...")

    inputCreator = MafliteInputMutationCreator('testdata/maflite/tiny_maflite.maf.txt')
    outputRenderer = SimpleOutputRenderer("out/testBlankAnnotatorTestFile.tsv")

    # Assumed myIC and myOC have been initialized as the proper Input and Output Creators, respectively.
    # 1) Initialize the Annotator
    annotator = Annotator()
    annotator.setInputCreator(inputCreator)
    annotator.setOutputRenderer(outputRenderer)
    testOutputFilename = annotator.annotate()

    # Test that file exists and that it has correct # of mutations (plus the extra lines counted below).
    numSamples = 1
    numExtraLines = 3  # one for header, two for comment lines
    numDoubleLines = 0  # Number of lines with two alt alleles
    numVariants = 9
    gt = numSamples * numVariants + numDoubleLines * numSamples + numExtraLines

    # open() replaces the Python 2-only file() builtin, and the context
    # manager guarantees the handle is closed even if reading fails.
    with open(testOutputFilename, 'r') as fp:
        ctr = sum(1 for _ in fp)

    self.assertEqual(
        ctr, gt,
        "Number of lines read was not correct: " + str(ctr) + " -- should have been: " + str(gt))
def _annotate_m2_vcf(self, input_vcf_file, output_tcgamaf_file):
    """Convert a VCF into a TCGA MAF using "." as the datasource directory.

    :param input_vcf_file: path of the VCF to annotate.
    :param output_tcgamaf_file: path of the TCGA MAF to write.
    """
    # For this conversion, you must specify the barcodes manually
    override_annotations = {
        'tumor_barcode': 'Patient0-Tumor',
        'normal_barcode': 'Patient0-Normal',
    }

    other_opts = {
        OptionConstants.COLLAPSE_FILTER_COLS: True,
        OptionConstants.NO_PREPEND: True,
        OptionConstants.SPLIT_ALLELIC_DEPTH: False,
        OptionConstants.INFER_ONPS: True,
    }

    # Use an empty datasource dir in order to speed this up.
    annotator = Annotator()
    runSpec = RunSpecificationFactory.create_run_spec(
        "VCF", "TCGAMAF", input_vcf_file, output_tcgamaf_file,
        datasource_dir=".",
        global_annotations=override_annotations,
        is_skip_no_alts=True,
        other_opts=other_opts)
    annotator.initialize(runSpec)
    annotator.annotate()
def _annotateTest(self, inputFilename, outputFilename, datasource_dir, inputFormat="MAFLITE",
                  outputFormat="TCGAMAF", default_annotations=TCGA_MAF_DEFAULTS,
                  override_annotations=None, is_skip_no_alts=False, other_opts=None):
    """Build a run spec from the arguments, run the annotator and return its result.

    :param inputFilename: path of the file to annotate.
    :param outputFilename: path of the file to write.
    :param datasource_dir: datasource directory handed to the run spec.
    :param inputFormat: input format name.
    :param outputFormat: output format name.
    :param default_annotations: default annotations handed to the run spec.
    :param override_annotations: passed as ``global_annotations``; ``None``
        is replaced by an empty dict.
    :param is_skip_no_alts: forwarded to the run spec unchanged.
    :param other_opts: extra options; ``None`` is replaced by an empty dict.
    :return: whatever ``Annotator.annotate()`` returns.
    """
    self.logger.info("Initializing Annotator...")

    # Substitute fresh dicts here instead of using mutable default arguments.
    override_annotations = dict() if override_annotations is None else override_annotations
    other_opts = dict() if other_opts is None else other_opts

    annotator = Annotator()
    runSpec = RunSpecificationFactory.create_run_spec(
        inputFormat, outputFormat, inputFilename, outputFilename,
        default_annotations=default_annotations,
        datasource_dir=datasource_dir,
        global_annotations=override_annotations,
        is_skip_no_alts=is_skip_no_alts,
        other_opts=other_opts)
    annotator.initialize(runSpec)

    self.logger.info("Annotation starting...")
    return annotator.annotate()
def test_rendering_combined_to_tsv(self):
    """Render a maflite to a simple TSV with ONP inference enabled.

    This is a smoke test: it passes as long as nothing raises.
    """
    maflite_path = os.path.join("testdata", "maflite", "onp_combination.maf.txt")
    tsv_path = os.path.join("out", "onp_combination.tsv")
    onp_options = {OptionConstants.INFER_ONPS: True}

    spec = RunSpecificationFactory.create_run_spec(
        "MAFLITE", "SIMPLE_TSV", maflite_path, tsv_path, other_opts=onp_options)

    annotator = Annotator()
    annotator.initialize(spec)
    annotator.annotate()
def testVersionHeader(self):
    """Check that building the annotator's header string works.

    Only a transcript datasource is added; no input or output is
    configured. The check is minimal: the header must mention either
    "Gaf " or "GENCODE", and must mention "Oncotator".
    """
    annotator = Annotator()
    transcript_ds = TestUtils.createTranscriptProviderDatasource(self.config)
    annotator.addDatasource(transcript_ds)

    header = annotator.createHeaderString()

    mentions_transcript_source = ("Gaf " in header) or ("GENCODE" in header)
    self.assertTrue(mentions_transcript_source,
                    "Could not find Gaf or GENCODE version in header string.")
    self.assertTrue("Oncotator" in header,
                    "Could not find the word Oncotator in header string.")
def testBasicAnnotation(self):
    """Annotate from a basic TSV gene datasource.

    The transcript (Gaf) datasource is added before the TSV datasource
    because the gene annotation must be populated first. A trimmed
    CancerGeneCensus table is the basis for this test.

    Checks that the expected ``CGC_Abridged_*`` columns appear in the
    header, that every row whose gene is in the known list has a
    ``CGC_Abridged_GeneID``, and that at least one such row exists.
    """
    # cut -f 1 oncotator/test/testdata/small_tsv_ds/CancerGeneCensus_Table_1_full_2012-03-15_trim.txt | egrep -v Symbol | sed -r "s/^/'/g" | sed ':a;N;$!ba;s/\n/,/g' | sed -r "s/,'/','/g"
    genesAvailable = [
        'ABL1', 'ABL2', 'ACSL3', 'AF15Q14', 'AF1Q', 'AF3p21', 'AF5q31',
        'AKAP9', 'AKT1', 'AKT2', 'ALDH2', 'ALK', 'ALO17', 'APC',
        'ARHGEF12', 'ARHH', 'ARID1A', 'ARID2', 'ARNT', 'ASPSCR1', 'ASXL1',
        'ATF1', 'ATIC', 'ATM', 'ATRX', 'BAP1', 'BCL10', 'BCL11A', 'BCL11B'
    ]

    # We need a gaf data source to annotate gene
    gafDatasource = TestUtils.createTranscriptProviderDatasource(config=self.config)
    geneDS = DatasourceFactory.createDatasource(
        "testdata/small_tsv_ds/small_tsv_ds.config", "testdata/small_tsv_ds/")
    outputFilename = 'out/genericGeneTest.out.tsv'

    annotator = Annotator()
    annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/Patient0.snp.maf.txt'))
    annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
    annotator.addDatasource(gafDatasource)
    annotator.addDatasource(geneDS)
    annotator.annotate()

    # Check that there were actual annotations performed.
    tsvReader = GenericTsvReader(outputFilename)
    fields = tsvReader.getFieldNames()
    # The failure message now names the column that is actually checked.
    self.assertTrue(
        'CGC_Abridged_Other Syndrome/Disease' in fields,
        "'CGC_Abridged_Other Syndrome/Disease' was not present in the header")
    self.assertTrue(
        'CGC_Abridged_Mutation Type' in fields,
        "'CGC_Abridged_Mutation Type' was not present in the header")

    ctr = 1
    linesThatShouldBeAnnotated = 0
    for lineDict in tsvReader:
        self.assertTrue('gene' in lineDict.keys())
        if lineDict['gene'] in genesAvailable:
            # The line break inside the message is written as an escape;
            # a raw newline inside a quoted literal is a syntax error.
            self.assertTrue(
                lineDict['CGC_Abridged_GeneID'] != '',
                "'CGC_Abridged_GeneID' was missing on a row that should have been populated. \nLine: " + str(ctr))
            linesThatShouldBeAnnotated += 1
        ctr += 1

    self.assertTrue(linesThatShouldBeAnnotated > 0, "Bad data -- cannot test missed detects.")
def testNumberGRenderingOfRandomVcf(self):
    """Render ``number_g.random.vcf`` to a simple TSV.

    This is a smoke test: it passes as long as nothing raises.
    """
    vcf_path = os.path.join("testdata", "vcf", "number_g.random.vcf")
    tsv_path = os.path.join("out", "number_g.random.out.tsv")

    creator = VcfInputMutationCreator(vcf_path)
    # Return value intentionally unused, as in the original test.
    creator.createMutations()
    renderer = SimpleOutputRenderer(tsv_path)

    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    annotator.annotate()
def test_querying_transcripts_by_genes(self):
    """Retrieve the transcripts for a set of genes (MAPK1 and PIK3CA).

    More than three transcripts are expected in total.
    """
    db_dir = self._determine_db_dir()
    annotator = Annotator()
    for datasource in DatasourceFactory.createDatasources(db_dir, "hg19", isMulticore=False):
        annotator.addDatasource(datasource)

    # Step 1 get all of the relevant transcripts
    genes_of_interest = ["MAPK1", "PIK3CA"]
    txs = annotator.retrieve_transcripts_by_genes(genes_of_interest)
    self.assertTrue(len(txs) > 3)
def test_simple_transcript_annotation(self):
    """Test the backend call behind the web API's /transcript/ endpoint.

    Looks up transcript ENST00000215832.6 and expects it to exist and to
    belong to gene MAPK1.
    """
    # http://www.broadinstitute.org/oncotator/transcript/ENST00000215832.6/
    db_dir = self._determine_db_dir()
    annotator = Annotator()
    for datasource in DatasourceFactory.createDatasources(db_dir, "hg19", isMulticore=False):
        annotator.addDatasource(datasource)

    transcript_id = "ENST00000215832.6"
    tx = annotator.retrieve_transcript_by_id(transcript_id)
    self.assertIsNotNone(tx)
    self.assertTrue(tx.get_gene() == "MAPK1")
def test_single_sample_onp_combiner(self):
    """Create an ONP-combined TCGA MAF from a single-sample maflite.

    This is a smoke test: it passes as long as nothing raises.
    """
    maflite_path = 'testdata/maflite/onp.singlesample.maf.txt'
    maf_path = 'out/testSingleSampleOnpCombiner.maf'

    unit_test_config = TestUtils.createUnitTestConfig()
    defaultdb = unit_test_config.get('DEFAULT', "dbDir")

    # NOTE(review): the keyword is spelled ``datasourceDir`` here, while
    # some other tests pass ``datasource_dir`` -- confirm which spelling
    # RunSpecificationFactory.create_run_spec accepts.
    spec = RunSpecificationFactory.create_run_spec(
        "MAFLITE", "TCGAMAF", maflite_path, maf_path,
        datasourceDir=defaultdb,
        other_opts={OptionConstants.INFER_ONPS: True})

    annotator = Annotator()
    annotator.initialize(spec)
    annotator.annotate()
def test_simple_genes_by_gene_annotation(self):
    """Test the backend call behind the web API's /gene/ endpoint.

    Retrieves the transcripts for MAPK1, annotates genes from them, and
    expects exactly one key in the resulting mapping.
    """
    # http://www.broadinstitute.org/oncotator/gene/MAPK1/
    db_dir = self._determine_db_dir()
    annotator = Annotator()
    for datasource in DatasourceFactory.createDatasources(db_dir, "hg19", isMulticore=False):
        annotator.addDatasource(datasource)

    txs = annotator.retrieve_transcripts_by_genes(["MAPK1"])
    self.assertTranscriptsFound(txs)

    mut_dict = annotator.annotate_genes_given_txs(txs)
    annotated_genes = mut_dict.keys()
    self.assertTrue(len(annotated_genes) == 1)
def testSwitchedFieldsWithExampleVcf(self):
    """Set up an annotator for a VCF whose tags are switched.

    The stated intent is to test that switched tags are ignored.
    """
    vcf_path = os.path.join("testdata", "vcf", "example.bad.switched.fields.vcf")
    tsv_path = os.path.join("out", "example.switched.out.tsv")

    creator = VcfInputMutationCreator(vcf_path)
    # Return value intentionally unused, as in the original test.
    creator.createMutations()
    renderer = SimpleOutputRenderer(tsv_path, [])

    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    # NOTE(review): annotate() is never called and nothing is asserted,
    # so this only exercises construction and wiring -- confirm whether
    # the remainder of the test was lost.
def testAnnotationWithNoSampleNameExampleVcf(self):
    """Annotate a VCF that has no samples and render it to a simple TSV.

    No output values are asserted here; the test passes as long as the
    run completes without raising.
    """
    vcf_path = os.path.join("testdata", "vcf", "example.sampleName.removed.vcf")
    tsv_path = os.path.join("out", "example.sampleName.removed.out.tsv")

    creator = VcfInputMutationCreator(vcf_path)
    renderer = SimpleOutputRenderer(tsv_path)

    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    annotator.annotate()
def testAnnotationWithDuplicateValuesInVcf(self):
    """Parse and render a VCF in which several fields share the same name.

    The original description lists the colliding fields as INFO, FILTER
    and INFO; the repeated INFO may be a typo -- TODO confirm. The test
    passes as long as the run completes without raising.
    """
    vcf_path = os.path.join("testdata", "vcf", "example.duplicate_fields.vcf")
    tsv_path = os.path.join("out", "example.duplicate_fields2.tsv")

    creator = VcfInputMutationCreator(vcf_path)
    # Return value intentionally unused, as in the original test.
    creator.createMutations()
    renderer = SimpleOutputRenderer(tsv_path, [])

    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    annotator.annotate()