def test_no_overwriting_muts(self):
    """Check that annotation is refused when overwriting is disabled.

    The run spec is created with ALLOW_ANNOTATION_OVERWRITING set to False,
    and annotate() is expected to raise DuplicateAnnotationException.
    """
    # The input is meant to already carry a "Who" annotation that this
    # datasource will also try to write.
    who_datasource = DatasourceFactory.createDatasource(
        "testdata/thaga_janakari_gene_ds/hg19/tj_data.config",
        "testdata/thaga_janakari_gene_ds/hg19/",
    )
    no_overwrite_opts = {
        OptionConstants.ALLOW_ANNOTATION_OVERWRITING: False,
        OptionConstants.NO_PREPEND: True,
    }

    # Positional order: input format, output format, input file, output file.
    spec = RunSpecificationFactory.create_run_spec_given_datasources(
        "MAFLITE",
        "TCGAMAF",
        "testdata/maflite/who_alt1_vs_alt2.maflite",
        "out/who_alt1_vs_alt2.maf.annotated",
        datasource_list=[who_datasource],
        other_opts=no_overwrite_opts,
    )

    annotator = Annotator()
    annotator.initialize(spec)
    with self.assertRaises(DuplicateAnnotationException):
        annotator.annotate()
def test_overwriting_muts(self):
    """Ensure that (given correct configuration) we can annotate from a datasource,
    even if the datasource will overwrite an existing annotation.

    The run spec is created with ALLOW_ANNOTATION_OVERWRITING set to True, so
    annotate() is expected to complete without raising.  Afterwards, no row of
    the rendered output may have "Tromokratis" as its TJ_Data_Who value.
    """
    # We will have an input with a "Who" annotation that this datasource will try to write.
    gene_ds = DatasourceFactory.createDatasource(
        "testdata/thaga_janakari_gene_ds/hg19/tj_data.config",
        "testdata/thaga_janakari_gene_ds/hg19/",
    )
    input_filename = "testdata/maflite/who_alt1_vs_alt2.maflite"
    output_filename = "out/who_alt1_vs_alt2.maf.annotated"
    input_format = "MAFLITE"
    output_format = "TCGAMAF"
    other_opts = {
        OptionConstants.ALLOW_ANNOTATION_OVERWRITING: True,
        OptionConstants.NO_PREPEND: True,
    }

    run_spec = RunSpecificationFactory.create_run_spec_given_datasources(
        input_format,
        output_format,
        input_filename,
        output_filename,
        datasource_list=[gene_ds],
        other_opts=other_opts,
    )
    annotator = Annotator()
    annotator.initialize(run_spec)
    annotator.annotate()

    tsv_reader = GenericTsvReader(output_filename)
    for line_dict in tsv_reader:
        # A row without the column is read as "", which passes this check.
        # assertNotEqual reports both values on failure, unlike assertTrue(a != b).
        self.assertNotEqual(line_dict.get('TJ_Data_Who', ""), "Tromokratis")
def test_no_overwriting_muts(self):
    """Verify that a disallowed overwrite makes the annotation run fail.

    ALLOW_ANNOTATION_OVERWRITING is False in the options handed to the run
    spec; annotate() must then raise DuplicateAnnotationException.
    """
    # The input is meant to already contain a "Who" annotation, which is the
    # same annotation this datasource will try to write.
    datasource = DatasourceFactory.createDatasource(
        "testdata/thaga_janakari_gene_ds/hg19/tj_data.config",
        "testdata/thaga_janakari_gene_ds/hg19/",
    )
    opts = {
        OptionConstants.ALLOW_ANNOTATION_OVERWRITING: False,
        OptionConstants.NO_PREPEND: True,
    }

    # Positional order: input format, output format, input file, output file.
    spec = RunSpecificationFactory.create_run_spec_given_datasources(
        "MAFLITE",
        "TCGAMAF",
        "testdata/maflite/who_alt1_vs_alt2.maflite",
        "out/who_alt1_vs_alt2.maf.annotated",
        datasource_list=[datasource],
        other_opts=opts,
    )

    annotator = Annotator()
    annotator.initialize(spec)
    with self.assertRaises(DuplicateAnnotationException):
        annotator.annotate()
def test_overwriting_muts(self):
    """Ensure that (given correct configuration) we can annotate from a datasource,
    even if the datasource will overwrite an existing annotation.

    The run spec is created with ALLOW_ANNOTATION_OVERWRITING set to True, so
    annotate() is expected to complete without raising.  Afterwards, no row of
    the rendered output may have "Tromokratis" as its TJ_Data_Who value.
    """
    # We will have an input with a "Who" annotation that this datasource will try to write.
    gene_ds = DatasourceFactory.createDatasource(
        "testdata/thaga_janakari_gene_ds/hg19/tj_data.config",
        "testdata/thaga_janakari_gene_ds/hg19/",
    )
    input_filename = "testdata/maflite/who_alt1_vs_alt2.maflite"
    output_filename = "out/who_alt1_vs_alt2.maf.annotated"
    input_format = "MAFLITE"
    output_format = "TCGAMAF"
    other_opts = {
        OptionConstants.ALLOW_ANNOTATION_OVERWRITING: True,
        OptionConstants.NO_PREPEND: True,
    }

    run_spec = RunSpecificationFactory.create_run_spec_given_datasources(
        input_format,
        output_format,
        input_filename,
        output_filename,
        datasource_list=[gene_ds],
        other_opts=other_opts,
    )
    annotator = Annotator()
    annotator.initialize(run_spec)
    annotator.annotate()

    tsv_reader = GenericTsvReader(output_filename)
    for line_dict in tsv_reader:
        # A row without the column is read as "", which passes this check.
        # assertNotEqual reports both values on failure, unlike assertTrue(a != b).
        self.assertNotEqual(line_dict.get("TJ_Data_Who", ""), "Tromokratis")
def test_run_spec_creation_no_datasources(self):
    """Test that we can create a run spec with no datasources.

    Builds a VCF -> TCGAMAF run spec with an empty datasource list and checks
    the types of its input creator and output renderer, and that annotation
    overwriting is off when no option requests it.
    """
    run_spec = RunSpecificationFactory.create_run_spec_given_datasources(
        input_format="VCF",
        input_filename="testdata/m2_support/phasingExample.vcf",
        output_format="TCGAMAF",
        output_filename="out/foo.maf.annotated",
        datasource_list=[],
    )
    self.assertIsInstance(run_spec.inputCreator, InputMutationCreator)
    self.assertIsInstance(run_spec.outputRenderer, OutputRenderer)
    # assertEqual keeps the original `== False` check (so None still fails,
    # which assertFalse would let through) and reports the actual value.
    self.assertEqual(run_spec.is_allow_annotation_overwriting, False)
def test_run_spec_creation_no_datasources(self):
    """Test that we can create a run spec with no datasources.

    Builds a VCF -> TCGAMAF run spec with an empty datasource list and checks
    the types of its input creator and output renderer, and that annotation
    overwriting is off when no option requests it.
    """
    run_spec = RunSpecificationFactory.create_run_spec_given_datasources(
        input_format="VCF",
        input_filename="testdata/m2_support/phasingExample.vcf",
        output_format="TCGAMAF",
        output_filename="out/foo.maf.annotated",
        datasource_list=[],
    )
    self.assertIsInstance(run_spec.inputCreator, InputMutationCreator)
    self.assertIsInstance(run_spec.outputRenderer, OutputRenderer)
    # assertEqual keeps the original `== False` check (so None still fails,
    # which assertFalse would let through) and reports the actual value.
    self.assertEqual(run_spec.is_allow_annotation_overwriting, False)
def test_tcgamaf_invalid_input_file(self):
    """Test a case where TCGAMAF is specified as input and we get an error
    (as we should) for a missing file.

    Creating the run spec for a non-existent input path is expected to raise
    IOError; any other exception type still propagates and errors the test.
    """
    # The context manager replaces the manual "exception seen" flag and the
    # unused `run_spec` / `ie` bindings of the try/except version.
    with self.assertRaises(IOError):
        RunSpecificationFactory.create_run_spec_given_datasources(
            input_format="TCGAMAF",
            input_filename="testdata/Idonotexist",
            output_format="TCGAMAF",
            output_filename="out/foo.maf.annotated",
            datasource_list=[],
        )
def test_tcgamaf_invalid_input_file(self):
    """Test a case where TCGAMAF is specified as input and we get an error
    (as we should) for a missing file.

    Creating the run spec for a non-existent input path is expected to raise
    IOError; any other exception type still propagates and errors the test.
    """
    # The context manager replaces the manual "exception seen" flag and the
    # unused `run_spec` / `ie` bindings of the try/except version.
    with self.assertRaises(IOError):
        RunSpecificationFactory.create_run_spec_given_datasources(
            input_format="TCGAMAF",
            input_filename="testdata/Idonotexist",
            output_format="TCGAMAF",
            output_filename="out/foo.maf.annotated",
            datasource_list=[],
        )
def test_reannotating_actual_file(self):
    """Test that we can take in a file, annotate, similar to M2 process (VCF to TCGA MAF
    no ONPs, then TCGA MAF to TCGA MAF with ONPs) and collapse values.

    Step 1 annotates the VCF into a midpoint TCGA MAF with INFER_ONPS off and
    expects three mutations, none with a '|' in i_QSS.  Step 2 re-annotates
    that midpoint file with INFER_ONPS and COLLAPSE_NUMBER_ANNOTATIONS on and
    expects two mutations whose counts and tumor fractions match the ground
    truth lists below.
    """
    # This test assumes that the numeric values are not being collapsed
    # (step 1 does not set COLLAPSE_NUMBER_ANNOTATIONS).
    input_filename = "testdata/m2_support/phasingExample.vcf"
    midpoint_output_filename = "out/m2_support/reannotating_tcga_maf_midpoint.maf.annotated"
    output_filename = "out/m2_support/reannotating_tcga_maf.maf.annotated"

    options_step1 = {
        OptionConstants.COLLAPSE_FILTER_COLS: True,
        OptionConstants.NO_PREPEND: False,
        OptionConstants.SPLIT_ALLELIC_DEPTH: True,
        OptionConstants.INFER_ONPS: False,
    }

    # Note that this will also test collapsing numeric values.
    options_step2 = {
        OptionConstants.REANNOTATE_TCGA_MAF_COLS: True,
        OptionConstants.INFER_ONPS: True,
        OptionConstants.ALLOW_ANNOTATION_OVERWRITING: True,
        OptionConstants.NO_PREPEND: False,
        OptionConstants.COLLAPSE_NUMBER_ANNOTATIONS: True,
    }

    run_spec_step1 = RunSpecificationFactory.create_run_spec(
        "VCF",
        "TCGAMAF",
        input_filename,
        midpoint_output_filename,
        is_skip_no_alts=True,
        other_opts=options_step1,
        datasource_dir=self._determine_db_dir(),
    )
    annotator = Annotator()
    annotator.initialize(run_spec_step1)
    annotator.annotate()

    # To speed up this test, use the same datasources from step 1
    ds_list = run_spec_step1.get_datasources()

    tsv_reader = GenericTsvReader(midpoint_output_filename)
    # Start at -1 so an empty midpoint file fails the count check below
    # instead of raising NameError on an unbound loop variable.
    i = -1
    for i, line in enumerate(tsv_reader):
        self.assertNotIn(
            "|", line["i_QSS"],
            "i_QSS annotation should not have a '|' in it in mutation: " + str(i + 1))
    # i is the zero-based index of the last row read, so 2 means three rows.
    self.assertEqual(
        i, 2,
        'Mutation count flawed... should have been three mutations: ' + str(i + 1))

    run_spec_step2 = RunSpecificationFactory.create_run_spec_given_datasources(
        "TCGAMAF",
        "TCGAMAF",
        midpoint_output_filename,
        output_filename,
        other_opts=options_step2,
        datasource_list=ds_list,
    )
    annotator.initialize(run_spec_step2)
    annotator.annotate()

    # Ground truth per output row, indexed by row position.
    gt_alt_count = [80, 7]
    gt_alt_count_full = ["82|80", "7"]
    gt_ref_count = [68, 151]
    # Please note that this is not "68|68" since these were collapsed by ONP combiner.
    gt_ref_count_full = ["68", "151"]
    gt_tumor_f = [.5375, .046]
    gt_tumor_f_full = ["0.538|0.537", "0.046"]

    tsv_reader = GenericTsvReader(output_filename)
    i = -1
    for i, line in enumerate(tsv_reader):
        # No column name may carry a doubled "i_" prefix.
        self.assertFalse(
            any(ks.startswith('i_i_') for ks in line.keys()),
            "i_i_ prefix found.")

        if i == 0:
            self.assertIn(
                "|", line["i_QSS"],
                "i_QSS tag should have a '|' in it for the first mutation")

        self.assertEqual(int(line['t_alt_count']), gt_alt_count[i])
        self.assertEqual(int(line['t_ref_count']), gt_ref_count[i])
        self.assertEqual(float(line['i_tumor_f']), gt_tumor_f[i])

        self.assertEqual(line['i_t_alt_count_full'], gt_alt_count_full[i])
        self.assertEqual(line['i_t_ref_count_full'], gt_ref_count_full[i])
        self.assertEqual(line['i_tumor_f_full'], gt_tumor_f_full[i])

    self.assertEqual(
        i, 1,
        'Mutation count flawed... should have been two mutations: ' + str(i + 1))
def test_reannotating_actual_file(self):
    """Test that we can take in a file, annotate, similar to M2 process (VCF to TCGA MAF
    no ONPs, then TCGA MAF to TCGA MAF with ONPs) and collapse values.

    Step 1 annotates the VCF into a midpoint TCGA MAF with INFER_ONPS off and
    expects three mutations, none with a '|' in i_QSS.  Step 2 re-annotates
    that midpoint file with INFER_ONPS and COLLAPSE_NUMBER_ANNOTATIONS on and
    expects two mutations whose counts and tumor fractions match the ground
    truth lists below.
    """
    # This test assumes that the numeric values are not being collapsed
    # (step 1 does not set COLLAPSE_NUMBER_ANNOTATIONS).
    input_filename = "testdata/m2_support/phasingExample.vcf"
    midpoint_output_filename = "out/m2_support/reannotating_tcga_maf_midpoint.maf.annotated"
    output_filename = "out/m2_support/reannotating_tcga_maf.maf.annotated"

    options_step1 = {
        OptionConstants.COLLAPSE_FILTER_COLS: True,
        OptionConstants.NO_PREPEND: False,
        OptionConstants.SPLIT_ALLELIC_DEPTH: True,
        OptionConstants.INFER_ONPS: False,
    }

    # Note that this will also test collapsing numeric values.
    options_step2 = {
        OptionConstants.REANNOTATE_TCGA_MAF_COLS: True,
        OptionConstants.INFER_ONPS: True,
        OptionConstants.ALLOW_ANNOTATION_OVERWRITING: True,
        OptionConstants.NO_PREPEND: False,
        OptionConstants.COLLAPSE_NUMBER_ANNOTATIONS: True,
    }

    run_spec_step1 = RunSpecificationFactory.create_run_spec(
        "VCF",
        "TCGAMAF",
        input_filename,
        midpoint_output_filename,
        is_skip_no_alts=True,
        other_opts=options_step1,
        datasource_dir=self._determine_db_dir(),
    )
    annotator = Annotator()
    annotator.initialize(run_spec_step1)
    annotator.annotate()

    # To speed up this test, use the same datasources from step 1
    ds_list = run_spec_step1.get_datasources()

    tsv_reader = GenericTsvReader(midpoint_output_filename)
    # Start at -1 so an empty midpoint file fails the count check below
    # instead of raising NameError on an unbound loop variable.
    i = -1
    for i, line in enumerate(tsv_reader):
        self.assertNotIn(
            "|", line["i_QSS"],
            "i_QSS annotation should not have a '|' in it in mutation: " + str(i + 1))
    # i is the zero-based index of the last row read, so 2 means three rows.
    self.assertEqual(
        i, 2,
        'Mutation count flawed... should have been three mutations: ' + str(i + 1))

    run_spec_step2 = RunSpecificationFactory.create_run_spec_given_datasources(
        "TCGAMAF",
        "TCGAMAF",
        midpoint_output_filename,
        output_filename,
        other_opts=options_step2,
        datasource_list=ds_list,
    )
    annotator.initialize(run_spec_step2)
    annotator.annotate()

    # Ground truth per output row, indexed by row position.
    gt_alt_count = [80, 7]
    gt_alt_count_full = ["82|80", "7"]
    gt_ref_count = [68, 151]
    # Please note that this is not "68|68" since these were collapsed by ONP combiner.
    gt_ref_count_full = ["68", "151"]
    gt_tumor_f = [.5375, .046]
    gt_tumor_f_full = ["0.538|0.537", "0.046"]

    tsv_reader = GenericTsvReader(output_filename)
    i = -1
    for i, line in enumerate(tsv_reader):
        # No column name may carry a doubled "i_" prefix.
        self.assertFalse(
            any(ks.startswith('i_i_') for ks in line.keys()),
            "i_i_ prefix found.")

        if i == 0:
            self.assertIn(
                "|", line["i_QSS"],
                "i_QSS tag should have a '|' in it for the first mutation")

        self.assertEqual(int(line['t_alt_count']), gt_alt_count[i])
        self.assertEqual(int(line['t_ref_count']), gt_ref_count[i])
        self.assertEqual(float(line['i_tumor_f']), gt_tumor_f[i])

        self.assertEqual(line['i_t_alt_count_full'], gt_alt_count_full[i])
        self.assertEqual(line['i_t_ref_count_full'], gt_ref_count_full[i])
        self.assertEqual(line['i_tumor_f_full'], gt_tumor_f_full[i])

    self.assertEqual(
        i, 1,
        'Mutation count flawed... should have been two mutations: ' + str(i + 1))