Example #1
0
    def test_no_overwriting_muts(self):
        """Annotating must raise when overwriting is disallowed and the input already has the value.

        The maflite input carries a "Who" annotation that the gene datasource
        also tries to write.  With ALLOW_ANNOTATION_OVERWRITING set to False,
        annotate() is expected to raise DuplicateAnnotationException.
        """
        ds_dir = "testdata/thaga_janakari_gene_ds/hg19/"
        who_datasource = DatasourceFactory.createDatasource(
            "testdata/thaga_janakari_gene_ds/hg19/tj_data.config", ds_dir
        )

        options = {
            OptionConstants.ALLOW_ANNOTATION_OVERWRITING: False,
            OptionConstants.NO_PREPEND: True,
        }
        spec = RunSpecificationFactory.create_run_spec_given_datasources(
            "MAFLITE",
            "TCGAMAF",
            "testdata/maflite/who_alt1_vs_alt2.maflite",
            "out/who_alt1_vs_alt2.maf.annotated",
            datasource_list=[who_datasource],
            other_opts=options,
        )

        annotator = Annotator()
        # initialize() sits outside the assertion, so the exception has to come from annotate().
        annotator.initialize(spec)

        with self.assertRaises(DuplicateAnnotationException):
            annotator.annotate()
    def _annotateTest(self,
                      inputFilename,
                      outputFilename,
                      datasource_dir,
                      inputFormat="MAFLITE",
                      outputFormat="TCGAMAF",
                      default_annotations=TCGA_MAF_DEFAULTS,
                      override_annotations=None,
                      is_skip_no_alts=False,
                      other_opts=None):
        """Run one annotation pass and return whatever annotator.annotate() returns.

        A run specification is created from the given formats, filenames and
        datasource directory.  ``override_annotations`` is handed over as the
        spec's ``global_annotations``; when it or ``other_opts`` is None, an
        empty dict is used instead.
        """
        self.logger.info("Initializing Annotator...")

        # None stands in for "no entries"; the empty dict is built per call.
        global_annotations = dict() if override_annotations is None else override_annotations
        extra_opts = dict() if other_opts is None else other_opts

        annotator = Annotator()
        spec_kwargs = dict(
            default_annotations=default_annotations,
            datasource_dir=datasource_dir,
            global_annotations=global_annotations,
            is_skip_no_alts=is_skip_no_alts,
            other_opts=extra_opts,
        )
        annotator.initialize(
            RunSpecificationFactory.create_run_spec(
                inputFormat, outputFormat, inputFilename, outputFilename, **spec_kwargs))
        self.logger.info("Annotation starting...")
        return annotator.annotate()
Example #3
0
    def test_overwriting_muts(self):
        """Annotating succeeds when overwriting is allowed, even if the datasource rewrites an input value.

        The maflite input already carries a "Who" annotation that the gene
        datasource also writes.  With ALLOW_ANNOTATION_OVERWRITING enabled the
        run must complete, and no output row may hold "Tromokratis" in the
        TJ_Data_Who column.
        """
        gene_ds = DatasourceFactory.createDatasource(
            "testdata/thaga_janakari_gene_ds/hg19/tj_data.config", "testdata/thaga_janakari_gene_ds/hg19/"
        )
        input_filename = "testdata/maflite/who_alt1_vs_alt2.maflite"
        output_filename = "out/who_alt1_vs_alt2.maf.annotated"
        input_format = "MAFLITE"
        output_format = "TCGAMAF"

        other_opts = {OptionConstants.ALLOW_ANNOTATION_OVERWRITING: True, OptionConstants.NO_PREPEND: True}

        run_spec = RunSpecificationFactory.create_run_spec_given_datasources(
            input_format,
            output_format,
            input_filename,
            output_filename,
            datasource_list=[gene_ds],
            other_opts=other_opts,
        )
        annotator = Annotator()
        annotator.initialize(run_spec)

        annotator.annotate()

        tsv_reader = GenericTsvReader(output_filename)

        # A row without the column falls back to "" and therefore passes.
        # assertNotEqual reports the offending value if a row fails.
        for line_dict in tsv_reader:
            self.assertNotEqual(line_dict.get("TJ_Data_Who", ""), "Tromokratis")
    def test_full_seg_file_annotations(self):
        """Annotate a seg file in segment mode and check the SIMPLE_TSV output.

        The output must expose the Sample, Num_Probes and Segment_Mean
        headers, and every row needs non-blank start/end values plus a
        "genes" column.
        """
        inputFilename = "testdata/seg/Patient0.seg.txt"
        output_filename = "out/test_full_seg_file_annotations.tsv"
        db_dir = self.config.get('DEFAULT', "dbDir")
        # Drop any existing output file before the run (presumably so a stale
        # file cannot satisfy the checks below).
        if os.path.exists(output_filename):
            os.remove(output_filename)

        annotator = Annotator()
        run_spec = RunSpecificationFactory.create_run_spec("SEG_FILE", "SIMPLE_TSV", inputFilename, output_filename,
                                                           datasourceDir=db_dir, annotating_type=RunSpecification.ANNOTATE_SEGMENTS)
        annotator.initialize(run_spec)
        annotator.annotate()

        # Now check the output
        output_reader = GenericTsvReader(output_filename)

        headers = output_reader.getFieldNames()
        for rcol in ("Sample", "Num_Probes", "Segment_Mean"):
            self.assertIn(rcol, headers)

        for line_dict in output_reader:
            for col in ('start', 'end'):
                self.assertIsNotNone(line_dict[col])
                self.assertNotEqual(line_dict[col].strip(), "")
            self.assertIn("genes", line_dict.keys())
            # NOTE(review): str.split(",") always returns at least one element,
            # so this only fails if the value is not a string. Confirm whether
            # a non-empty gene list was the intended check.
            self.assertGreater(len(line_dict["genes"].split(",")), 0)
Example #5
0
 def _annotate_m2_vcf(self, input_vcf_file, output_tcgamaf_file):
     """Convert the given VCF into a TCGA MAF with ONP inference switched on.

     Tumor and normal barcodes are passed as global annotations, "." is used
     as the datasource directory, and variants are run with
     is_skip_no_alts=True.
     """
     # The barcodes have to be given by hand for this conversion.
     barcodes = {
         'tumor_barcode': 'Patient0-Tumor',
         'normal_barcode': 'Patient0-Normal',
     }
     render_opts = {
         OptionConstants.COLLAPSE_FILTER_COLS: True,
         OptionConstants.NO_PREPEND: True,
         OptionConstants.SPLIT_ALLELIC_DEPTH: False,
         OptionConstants.INFER_ONPS: True,
     }
     annotator = Annotator()
     # "." is meant as an empty datasource dir, to keep the run fast.
     spec = RunSpecificationFactory.create_run_spec(
         "VCF", "TCGAMAF", input_vcf_file, output_tcgamaf_file,
         datasource_dir=".",
         global_annotations=barcodes,
         is_skip_no_alts=True,
         other_opts=render_opts,
     )
     annotator.initialize(spec)
     annotator.annotate()
Example #6
0
    def test_overwriting_muts(self):
        """With overwriting enabled, a datasource may rewrite an annotation already present in the input.

        The input maflite has a "Who" annotation that the gene datasource
        writes as well.  The run must finish without raising, and no output
        row may contain "Tromokratis" in TJ_Data_Who.
        """
        gene_ds = DatasourceFactory.createDatasource(
            "testdata/thaga_janakari_gene_ds/hg19/tj_data.config",
            "testdata/thaga_janakari_gene_ds/hg19/")
        input_filename = "testdata/maflite/who_alt1_vs_alt2.maflite"
        output_filename = "out/who_alt1_vs_alt2.maf.annotated"
        input_format = "MAFLITE"
        output_format = "TCGAMAF"

        other_opts = {
            OptionConstants.ALLOW_ANNOTATION_OVERWRITING: True,
            OptionConstants.NO_PREPEND: True
        }

        run_spec = RunSpecificationFactory.create_run_spec_given_datasources(
            input_format,
            output_format,
            input_filename,
            output_filename,
            datasource_list=[gene_ds],
            other_opts=other_opts)
        annotator = Annotator()
        annotator.initialize(run_spec)

        annotator.annotate()

        tsv_reader = GenericTsvReader(output_filename)

        # Missing column defaults to "", which passes; assertNotEqual shows
        # the offending value when a row fails.
        for line_dict in tsv_reader:
            self.assertNotEqual(line_dict.get('TJ_Data_Who', ""), "Tromokratis")
    def testAnnotationWithMafliteWithTrailingSpaces(self):
        """
        Tests the ability to annotate a VCF file that contains trailing spaces in ref and alt alleles.

        Despite the method name, both input and output here are VCF.  The
        rendered file must contain the four expected records with clean
        (whitespace-free) ref/alt columns.
        """
        db_dir = self.config.get('DEFAULT', "dbDir")
        inputFilename = os.path.join(
            "testdata", "vcf", "example.trailing_whitespace_in_alleles.vcf")
        outputFilename = os.path.join(
            "out", "example.trailing_whitespace_in_alleles.vcf")

        annotator = Annotator()
        from oncotator.utils.RunSpecification import RunSpecification
        run_spec = RunSpecificationFactory.create_run_spec(
            "VCF",
            "VCF",
            inputFilename,
            outputFilename,
            datasource_dir=db_dir,
            annotating_type=RunSpecification.ANNOTATE_MUTATIONS,
            other_opts={'vcf_out_infer_genotypes': False})
        annotator.initialize(run_spec)
        annotator.annotate()

        # Check output.  The context manager closes the handle, which the
        # previous bare open(...).read() left to the garbage collector.
        with open(outputFilename) as vcf_file:
            vcf_data = vcf_file.read()
        expected_records = (
            '\n1\t14907\t.\tA\tG\t',
            '\n1\t14930\trs150145850\tA\tG\t',
            '\n1\t14933\trs138566748\tG\tA\t',
            '\n1\t14948\trs148911281\tG\tA\t',
        )
        for record in expected_records:
            self.assertIn(record, vcf_data)
    def test_basic_rendering(self):
        """Render a basic seg file as a gene list and sanity-check every output row.

        Each row needs non-blank segment_start/segment_end values, a non-empty
        "gene" value, a segment_num_probes value that parses as a float, and
        sample "Patient0".
        """
        inputFilename = "testdata/seg/Patient0.seg.txt"
        output_filename = "out/test_basic_rendering.gene_list.tsv"
        db_dir = self.config.get('DEFAULT', "dbDir")
        # Delete any existing output file before the run.
        if os.path.exists(output_filename):
            os.remove(output_filename)

        annotator = Annotator()
        run_spec = RunSpecificationFactory.create_run_spec("SEG_FILE", "GENE_LIST", inputFilename, output_filename,
                                                           datasourceDir=db_dir, annotating_type=RunSpecification.ANNOTATE_SEGMENTS)
        annotator.initialize(run_spec)
        annotator.annotate()

        # Now check the output
        output_reader = GenericTsvReader(output_filename)

        # Not asserted on below; kept in case getFieldNames() affects the
        # reader's state -- TODO confirm and remove.
        headers = output_reader.getFieldNames()

        for line_dict in output_reader:
            for col in ('segment_start', 'segment_end'):
                self.assertIsNotNone(line_dict[col])
                self.assertNotEqual(line_dict[col].strip(), "")
            self.assertIn("gene", line_dict.keys())
            self.assertGreater(len(line_dict["gene"]), 0)
            # NOTE(review): besides requiring a parseable float, this also
            # fails when the probe count is exactly 0 -- confirm that is intended.
            self.assertTrue(float(line_dict["segment_num_probes"]))
            self.assertEqual(line_dict['sample'], "Patient0")
    def test_rendering_with_exons(self):
        """Render a seg file whose end points fall inside exons, as a gene list.

        Every row needs a non-blank segment_start, and any row whose
        segment_end_gene is MAPK1 must report segment_end_exon "8+".
        """
        inputFilename = "testdata/seg/Middle_of_exon.seg.txt"
        output_filename = "out/test_exon_seg2.gene_list.tsv"
        db_dir = self.config.get('DEFAULT', "dbDir")
        # Delete any existing output file before the run.
        if os.path.exists(output_filename):
            os.remove(output_filename)

        annotator = Annotator()
        run_spec = RunSpecificationFactory.create_run_spec(
            "SEG_FILE",
            "GENE_LIST",
            inputFilename,
            output_filename,
            datasourceDir=db_dir,
            annotating_type=RunSpecification.ANNOTATE_SEGMENTS)
        annotator.initialize(run_spec)
        annotator.annotate()

        # Now check the output
        output_reader = GenericTsvReader(output_filename)

        # Not asserted on below; kept in case getFieldNames() affects the
        # reader's state -- TODO confirm and remove.
        headers = output_reader.getFieldNames()

        for line_dict in output_reader:
            self.assertIsNotNone(line_dict['segment_start'])
            self.assertNotEqual(line_dict['segment_start'].strip(), "")
            if line_dict['segment_end_gene'] == "MAPK1":
                # Strip once and reuse for both the comparison and the message.
                end_exon = line_dict['segment_end_exon'].strip()
                self.assertEqual(
                    end_exon, "8+",
                    "Should have been 8+, but saw: %s" % end_exon)
    def test_full_seg_file_annotations(self):
        """Read a seg file, run a full segment annotation and verify the SIMPLE_TSV output.

        The Sample, Num_Probes and Segment_Mean headers must be present, and
        each row needs non-blank start/end values and a "genes" column.
        """
        inputFilename = "testdata/seg/Patient0.seg.txt"
        output_filename = "out/test_full_seg_file_annotations.tsv"
        db_dir = self.config.get('DEFAULT', "dbDir")
        # Delete any existing output file before the run.
        if os.path.exists(output_filename):
            os.remove(output_filename)

        annotator = Annotator()
        run_spec = RunSpecificationFactory.create_run_spec(
            "SEG_FILE",
            "SIMPLE_TSV",
            inputFilename,
            output_filename,
            datasource_dir=db_dir,
            annotating_type=RunSpecification.ANNOTATE_SEGMENTS)
        annotator.initialize(run_spec)
        annotator.annotate()

        # Now check the output
        output_reader = GenericTsvReader(output_filename)

        headers = output_reader.getFieldNames()
        for rcol in ("Sample", "Num_Probes", "Segment_Mean"):
            self.assertIn(rcol, headers)

        for line_dict in output_reader:
            for col in ('start', 'end'):
                self.assertIsNotNone(line_dict[col])
                self.assertNotEqual(line_dict[col].strip(), "")
            self.assertIn("genes", line_dict.keys())
            # NOTE(review): str.split(",") never returns an empty list, so
            # this cannot fail for any string value. Confirm whether a
            # non-empty gene list was the intended check.
            self.assertGreater(len(line_dict["genes"].split(",")), 0)
    def test_basic_rendering(self):
        """Render a basic seg file as a gene list and validate each output row.

        Rows must have non-blank segment_start/segment_end, a non-empty
        "gene", a segment_num_probes value that parses as a float, and sample
        "Patient0".
        """
        inputFilename = "testdata/seg/Patient0.seg.txt"
        output_filename = "out/test_basic_rendering.gene_list.tsv"
        db_dir = self.config.get('DEFAULT', "dbDir")
        # Delete any existing output file before the run.
        if os.path.exists(output_filename):
            os.remove(output_filename)

        annotator = Annotator()
        run_spec = RunSpecificationFactory.create_run_spec(
            "SEG_FILE",
            "GENE_LIST",
            inputFilename,
            output_filename,
            datasourceDir=db_dir,
            annotating_type=RunSpecification.ANNOTATE_SEGMENTS)
        annotator.initialize(run_spec)
        annotator.annotate()

        # Now check the output
        output_reader = GenericTsvReader(output_filename)

        # Not asserted on below; kept in case getFieldNames() affects the
        # reader's state -- TODO confirm and remove.
        headers = output_reader.getFieldNames()

        for line_dict in output_reader:
            for col in ('segment_start', 'segment_end'):
                self.assertIsNotNone(line_dict[col])
                self.assertNotEqual(line_dict[col].strip(), "")
            self.assertIn("gene", line_dict.keys())
            self.assertGreater(len(line_dict["gene"]), 0)
            # NOTE(review): this also fails when the probe count is exactly 0,
            # not only when the value is unparseable -- confirm intent.
            self.assertTrue(float(line_dict["segment_num_probes"]))
            self.assertEqual(line_dict['sample'], "Patient0")
Example #12
0
    def test_no_overwriting_muts(self):
        """With overwriting disallowed, annotating over an input-supplied value must raise.

        The input maflite already holds a "Who" annotation and the gene
        datasource tries to write it again, so annotate() is expected to raise
        DuplicateAnnotationException.
        """
        tj_dir = "testdata/thaga_janakari_gene_ds/hg19/"
        tj_datasource = DatasourceFactory.createDatasource(
            "testdata/thaga_janakari_gene_ds/hg19/tj_data.config", tj_dir)

        strict_opts = {
            OptionConstants.ALLOW_ANNOTATION_OVERWRITING: False,
            OptionConstants.NO_PREPEND: True,
        }
        spec = RunSpecificationFactory.create_run_spec_given_datasources(
            "MAFLITE", "TCGAMAF",
            "testdata/maflite/who_alt1_vs_alt2.maflite",
            "out/who_alt1_vs_alt2.maf.annotated",
            datasource_list=[tj_datasource],
            other_opts=strict_opts)

        annotator = Annotator()
        # Initialization is outside the assertion; only annotate() may raise.
        annotator.initialize(spec)

        with self.assertRaises(DuplicateAnnotationException):
            annotator.annotate()
Example #13
0
    def testAnnotationRoundTripEmpty(self):
        """Read a VCF, annotate it with no datasources, write it, and read it again without changes.

        The datasource directory deliberately names a path that is not meant
        to exist.  The written VCF is parsed a second time and must yield at
        least one mutation.
        """
        inputFilename = os.path.join(
            "testdata", "m2_support", "NA12878.ob_filtered.vcf")
        outputFilename = os.path.join("out",
                                      "test_round_trip_empty_annotated.vcf")

        other_opts = {OptionConstants.COLLAPSE_NUMBER_ANNOTATIONS: True}

        run_spec = RunSpecificationFactory.create_run_spec(
            "VCF",
            "VCF",
            inputFilename,
            outputFilename,
            datasource_dir="THIS_DIR_DOES_NOT_EXIST__",
            genomeBuild="hg19",
            other_opts=other_opts)
        annotator = Annotator()
        annotator.initialize(run_spec)
        annotated_filename = annotator.annotate()

        # Second pass: re-read the file that annotate() reported writing.
        vcf_input2 = VcfInputMutationCreator(
            annotated_filename,
            MutationDataFactory(allow_overwriting=True),
            other_options=other_opts)
        muts2 = list(vcf_input2.createMutations())
        self.assertGreater(len(muts2), 0)
Example #14
0
 def test_rendering_combined_to_tsv(self):
     """Smoke test: a maflite with ONP inference enabled renders to SIMPLE_TSV without raising."""
     onp_maflite = os.path.join("testdata", "maflite", "onp_combination.maf.txt")
     tsv_out = os.path.join("out", "onp_combination.tsv")
     onp_opts = {OptionConstants.INFER_ONPS: True}
     run_spec = RunSpecificationFactory.create_run_spec(
         "MAFLITE", "SIMPLE_TSV", onp_maflite, tsv_out, other_opts=onp_opts)
     # No assertions follow; completing without an exception is the whole check.
     annotator = Annotator()
     annotator.initialize(run_spec)
     annotator.annotate()
Example #15
0
 def test_rendering_combined_to_tsv(self):
     """Produce a merged-ONP simple TSV from a maflite; passing means nothing raised."""
     source_path = os.path.join("testdata", "maflite", "onp_combination.maf.txt")
     target_path = os.path.join("out", "onp_combination.tsv")
     run_spec = RunSpecificationFactory.create_run_spec(
         "MAFLITE",
         "SIMPLE_TSV",
         source_path,
         target_path,
         other_opts={OptionConstants.INFER_ONPS: True},
     )
     # The test makes no assertions on the rendered file.
     annotator = Annotator()
     annotator.initialize(run_spec)
     annotator.annotate()
 def test_run_spec_creation_no_datasources(self):
     """A run spec can be created from an empty datasource list.

     The resulting spec must expose an InputMutationCreator, an
     OutputRenderer, and annotation overwriting equal to False.
     """
     run_spec = RunSpecificationFactory.create_run_spec_given_datasources(
         input_format="VCF",
         input_filename="testdata/m2_support/phasingExample.vcf",
         output_format="TCGAMAF",
         output_filename="out/foo.maf.annotated",
         datasource_list=[])
     self.assertIsInstance(run_spec.inputCreator, InputMutationCreator)
     self.assertIsInstance(run_spec.outputRenderer, OutputRenderer)
     # assertEqual (not assertFalse) keeps the original `== False` semantics:
     # None or other falsy non-equal values still fail.
     self.assertEqual(run_spec.is_allow_annotation_overwriting, False)
    def _annotateTest(self, inputFilename, outputFilename, datasource_dir,
                      inputFormat="MAFLITE", outputFormat="TCGAMAF",
                      default_annotations=TCGA_MAF_DEFAULTS,
                      override_annotations=None, is_skip_no_alts=False):
        """Create a run spec from the arguments, annotate, and return annotate()'s result.

        ``override_annotations`` is passed on as ``globalAnnotations``; None is
        replaced by an empty dict.
        """
        self.logger.info("Initializing Annotator...")

        # Build the empty dict per call rather than sharing a default instance.
        global_annotations = dict() if override_annotations is None else override_annotations

        annotator = Annotator()
        spec = RunSpecificationFactory.create_run_spec(
            inputFormat,
            outputFormat,
            inputFilename,
            outputFilename,
            defaultAnnotations=default_annotations,
            datasourceDir=datasource_dir,
            globalAnnotations=global_annotations,
            is_skip_no_alts=is_skip_no_alts,
        )
        annotator.initialize(spec)
        self.logger.info("Annotation starting...")
        return annotator.annotate()
Example #18
0
 def test_single_sample_onp_combiner(self):
     """Smoke test: an ONP-combined TCGA MAF is produced from a single-sample maflite without raising."""
     onp_input = 'testdata/maflite/onp.singlesample.maf.txt'
     maf_output = 'out/testSingleSampleOnpCombiner.maf'
     # The datasource directory is read from the unit-test config's DEFAULT section.
     db_dir = TestUtils.createUnitTestConfig().get('DEFAULT', "dbDir")
     run_spec = RunSpecificationFactory.create_run_spec(
         "MAFLITE",
         "TCGAMAF",
         onp_input,
         maf_output,
         datasourceDir=db_dir,
         other_opts={OptionConstants.INFER_ONPS: True},
     )
     annotator = Annotator()
     annotator.initialize(run_spec)
     annotator.annotate()
Example #19
0
 def test_single_sample_onp_combiner(self):
     """Create an ONP-combined TCGA MAF from a single-sample maflite; passing means nothing raised."""
     maflite_path = 'testdata/maflite/onp.singlesample.maf.txt'
     maf_path = 'out/testSingleSampleOnpCombiner.maf'
     # dbDir comes from the DEFAULT section of the unit-test config.
     unit_test_config = TestUtils.createUnitTestConfig()
     onp_opts = {OptionConstants.INFER_ONPS: True}
     run_spec = RunSpecificationFactory.create_run_spec(
         "MAFLITE", "TCGAMAF", maflite_path, maf_path,
         datasource_dir=unit_test_config.get('DEFAULT', "dbDir"),
         other_opts=onp_opts)
     annotator = Annotator()
     annotator.initialize(run_spec)
     annotator.annotate()
Example #20
0
 def test_run_spec_creation_no_datasources(self):
     """Creating a run spec with an empty datasource list must work.

     Checks the types of the spec's input creator and output renderer and
     that annotation overwriting equals False.
     """
     run_spec = RunSpecificationFactory.create_run_spec_given_datasources(
         input_format="VCF",
         input_filename="testdata/m2_support/phasingExample.vcf",
         output_format="TCGAMAF",
         output_filename="out/foo.maf.annotated",
         datasource_list=[])
     self.assertIsInstance(run_spec.inputCreator, InputMutationCreator)
     self.assertIsInstance(run_spec.outputRenderer, OutputRenderer)
     # assertEqual preserves the original `== False` comparison; assertFalse
     # would also accept None and other falsy values.
     self.assertEqual(run_spec.is_allow_annotation_overwriting, False)
    def testAnnotationWithMafliteWithTrailingSpaces(self):
        """
        Annotate a maflite file whose ref and alt alleles carry trailing spaces.

        There are no assertions; the test passes when the MAFLITE -> TCGAMAF
        run completes without raising.
        """
        datasource_root = self.config.get('DEFAULT', "dbDir")
        maflite_in = os.path.join(
            "testdata", "maflite", "example.trailing_whitespace_in_alleles.maflite")
        maf_out = os.path.join("out", "example.trailing_whitespace_in_alleles.maf.txt")

        annotator = Annotator()
        spec = RunSpecificationFactory.create_run_spec(
            "MAFLITE",
            "TCGAMAF",
            maflite_in,
            maf_out,
            datasource_dir=datasource_root,
            annotating_type=RunSpecification.ANNOTATE_MUTATIONS,
        )
        annotator.initialize(spec)
        annotator.annotate()
    def test_tcgamaf_invalid_input_file(self):
        """Creating a run spec for a TCGAMAF input file that does not exist must raise IOError."""
        # assertRaises replaces the hand-rolled try/except flag and the unused
        # run_spec / exception bindings, and reports clearly when nothing is raised.
        with self.assertRaises(IOError):
            RunSpecificationFactory.create_run_spec_given_datasources(input_format="TCGAMAF",
                                                                      input_filename="testdata/Idonotexist",
                                                                      output_format="TCGAMAF",
                                                                      output_filename="out/foo.maf.annotated",
                                                                      datasource_list=[])
Example #23
0
 def _annotate_m2_vcf(self, input_vcf_file, output_tcgamaf_file):
     """Render the given VCF as a TCGA MAF, inferring ONPs and skipping records without alts.

     Barcodes are supplied through the global annotations, and "." is
     passed as the datasource directory.
     """
     # Barcodes must be specified manually for this conversion.
     manual_barcodes = {
         'tumor_barcode': 'Patient0-Tumor',
         'normal_barcode': 'Patient0-Normal',
     }
     conversion_opts = {
         OptionConstants.COLLAPSE_FILTER_COLS: True,
         OptionConstants.NO_PREPEND: True,
         OptionConstants.SPLIT_ALLELIC_DEPTH: False,
         OptionConstants.INFER_ONPS: True,
     }
     annotator = Annotator()
     # "." is intended as an empty datasource dir, to speed the run up.
     spec = RunSpecificationFactory.create_run_spec(
         "VCF",
         "TCGAMAF",
         input_vcf_file,
         output_tcgamaf_file,
         datasource_dir=".",
         global_annotations=manual_barcodes,
         is_skip_no_alts=True,
         other_opts=conversion_opts,
     )
     annotator.initialize(spec)
     annotator.annotate()
Example #24
0
    def test_tcgamaf_invalid_input_file(self):
        """A missing TCGAMAF input file must make run-spec creation raise IOError."""
        # The context manager fails the test if no IOError is raised, replacing
        # the manual flag and the unused run_spec / exception variables.
        with self.assertRaises(IOError):
            RunSpecificationFactory.create_run_spec_given_datasources(
                input_format="TCGAMAF",
                input_filename="testdata/Idonotexist",
                output_format="TCGAMAF",
                output_filename="out/foo.maf.annotated",
                datasource_list=[])
    def test_proper_conversion_vcf_to_maf_with_collapse_filter_cols(self):
        """Test FILTER col is properly rendered when using the collapse-filter-cols option.

        Every row of the rendered MAF must have an 'i_filter' entry whose
        value is either 'PASS' or 'q10'.
        """

        input_fname = 'testdata/vcf/example.vcf'
        output_fname = 'out/example.one_filter_col.maf.txt'
        annotator = Annotator()
        other_opts = {'collapse_filter_cols': True}

        run_spec = RunSpecificationFactory.create_run_spec(
            'VCF', 'TCGAMAF', input_fname, output_fname, other_opts=other_opts)
        annotator.initialize(run_spec)
        annotator.annotate()

        tsv_reader = GenericTsvReader(output_fname)
        for line_dict in tsv_reader:
            self.assertIn('i_filter', line_dict)
            # assertIn names the unexpected filter value on failure.
            self.assertIn(line_dict['i_filter'], ['PASS', 'q10'])
    def test_proper_conversion_vcf_to_maf_with_collapse_filter_cols(self):
        """Test FILTER col is properly rendered when using the collapse-filter-cols option.

        Each rendered MAF row must contain 'i_filter', and its value has to be
        'PASS' or 'q10'.
        """

        input_fname = 'testdata/vcf/example.vcf'
        output_fname = 'out/example.one_filter_col.maf.txt'
        annotator = Annotator()
        other_opts = {'collapse_filter_cols': True}

        # The function-local RunSpecification import was dropped: nothing in
        # this test referenced the name.
        run_spec = RunSpecificationFactory.create_run_spec('VCF', 'TCGAMAF', input_fname, output_fname, other_opts=other_opts)
        annotator.initialize(run_spec)
        annotator.annotate()

        tsv_reader = GenericTsvReader(output_fname)
        for line_dict in tsv_reader:
            self.assertIn('i_filter', line_dict)
            # assertIn reports the unexpected filter value when it fails.
            self.assertIn(line_dict['i_filter'], ['PASS', 'q10'])
    def test_annotating_uniprot_test_file(self):
        """Test variants with known issues with older version of UniProt datasource. This test will fail if using older version of uniprot datasource (pre-2014).

        Every output row must have a UniProt_AApos other than "0", and all
        rows except index 4 must have a SwissProt_entry_Id ending in "HUMAN".
        """
        db_dir = TestUtils.createUnitTestConfig().get('DEFAULT', "dbDir")
        annotator = Annotator()
        out_file_name = "out/uniprot_recovery.maf.annotated"
        runSpec = RunSpecificationFactory.create_run_spec("MAFLITE", "TCGAMAF", "testdata/maflite/uniprot_recovery.maflite",
                                                          out_file_name, datasource_dir=db_dir, tx_mode=TranscriptProvider.TX_MODE_BEST_EFFECT)
        annotator.initialize(runSpec)
        annotator.annotate()

        out_file_reader = GenericTsvReader(out_file_name)
        for i, line_dict in enumerate(out_file_reader):
            self.assertNotEqual(line_dict['UniProt_AApos'], "0")

            #TODO: The fourth entry is currently not picking up the uniprot entry for this.  Remove the "if" statement once issue #253 is addressed
            # NOTE(review): i is 0-based, so `i != 4` skips the fifth row, not
            # the fourth as the TODO says -- confirm which row is meant.
            if i != 4:
                self.assertTrue(line_dict['SwissProt_entry_Id'].endswith("HUMAN"))
Example #28
0
 def test_onp_combiner_snp_then_multiallelic(self):
     """Smoke test: a VCF with a SNP followed by a multiallelic record is read and rendered without raising."""
     vcf_in = 'testdata/vcf/infer_onp_fail_snp_then_multiallelic.vcf'
     maf_out = 'out/testSNPThenMultiallelic.maf.annotated'
     # dbDir is taken from the DEFAULT section of the unit-test config.
     db_dir = TestUtils.createUnitTestConfig().get('DEFAULT', "dbDir")
     onp_opts = {
         OptionConstants.INFER_ONPS: True,
         OptionConstants.COLLAPSE_NUMBER_ANNOTATIONS: True,
     }
     run_spec = RunSpecificationFactory.create_run_spec(
         "VCF", "TCGAMAF", vcf_in, maf_out,
         datasource_dir=db_dir,
         is_skip_no_alts=True,
         other_opts=onp_opts)
     annotator = Annotator()
     annotator.initialize(run_spec)
     annotator.annotate()
    def testAnnotationWithMafliteWithTrailingSpaces(self):
        """
        Run a MAFLITE -> TCGAMAF annotation on input whose ref and alt alleles have trailing spaces.

        Nothing is asserted on the output; the run merely has to finish
        without raising.
        """
        datasource_root = self.config.get('DEFAULT', "dbDir")
        source_maflite = os.path.join(
            "testdata", "maflite", "example.trailing_whitespace_in_alleles.maflite")
        target_maf = os.path.join("out", "example.trailing_whitespace_in_alleles.maf.txt")

        annotator = Annotator()
        spec_kwargs = dict(
            datasource_dir=datasource_root,
            annotating_type=RunSpecification.ANNOTATE_MUTATIONS,
        )
        spec = RunSpecificationFactory.create_run_spec(
            "MAFLITE", "TCGAMAF", source_maflite, target_maf, **spec_kwargs)
        annotator.initialize(spec)
        annotator.annotate()
    def testAnnotationWithMafliteWithTrailingSpaces(self):
        """
        Tests the ability to annotate a VCF file that contains trailing spaces in ref and alt alleles.

        Despite the method name, input and output are both VCF.  The output
        file must contain the four expected records with whitespace-free
        ref/alt columns.
        """
        db_dir = self.config.get('DEFAULT', "dbDir")
        inputFilename = os.path.join("testdata", "vcf", "example.trailing_whitespace_in_alleles.vcf")
        outputFilename = os.path.join("out", "example.trailing_whitespace_in_alleles.vcf")

        annotator = Annotator()
        from oncotator.utils.RunSpecification import RunSpecification
        run_spec = RunSpecificationFactory.create_run_spec("VCF", "VCF", inputFilename, outputFilename,
                                                           datasource_dir=db_dir, annotating_type=RunSpecification.ANNOTATE_MUTATIONS,
                                                           other_opts={'vcf_out_infer_genotypes': False})
        annotator.initialize(run_spec)
        annotator.annotate()

        # Check output.  Reading inside `with` closes the file handle, which
        # the earlier bare open(...).read() never did explicitly.
        with open(outputFilename) as vcf_file:
            vcf_data = vcf_file.read()
        for expected_record in ('\n1\t14907\t.\tA\tG\t',
                                '\n1\t14930\trs150145850\tA\tG\t',
                                '\n1\t14933\trs138566748\tG\tA\t',
                                '\n1\t14948\trs148911281\tG\tA\t'):
            self.assertIn(expected_record, vcf_data)
    def test_annotating_uniprot_test_file(self):
        """Test variants with known issues with older version of UniProt datasource. This test will fail if using older version of uniprot datasource (pre-2014).

        All output rows need a UniProt_AApos different from "0"; every row
        except index 4 also needs a SwissProt_entry_Id ending in "HUMAN".
        """
        db_dir = TestUtils.createUnitTestConfig().get('DEFAULT', "dbDir")
        annotator = Annotator()
        out_file_name = "out/uniprot_recovery.maf.annotated"
        runSpec = RunSpecificationFactory.create_run_spec(
            "MAFLITE",
            "TCGAMAF",
            "testdata/maflite/uniprot_recovery.maflite",
            out_file_name,
            datasourceDir=db_dir,
            tx_mode=TranscriptProvider.TX_MODE_BEST_EFFECT)
        annotator.initialize(runSpec)
        annotator.annotate()

        out_file_reader = GenericTsvReader(out_file_name)
        for i, line_dict in enumerate(out_file_reader):
            self.assertNotEqual(line_dict['UniProt_AApos'], "0")

            #TODO: The fourth entry is currently not picking up the uniprot entry for this.  Remove the "if" statement once issue #253 is addressed
            # NOTE(review): with 0-based i, `i != 4` exempts the fifth row,
            # while the TODO speaks of the fourth -- confirm which is meant.
            if i != 4:
                self.assertTrue(
                    line_dict['SwissProt_entry_Id'].endswith("HUMAN"))
    def test_rendering_with_exons(self):
        """Test that we can render a seg file that includes exons at end points.

        Each row must have a non-blank segment_start; rows whose
        segment_end_gene is MAPK1 must report segment_end_exon "8+".
        """
        inputFilename = "testdata/seg/Middle_of_exon.seg.txt"
        output_filename = "out/test_exon_seg2.gene_list.tsv"
        db_dir = self.config.get('DEFAULT', "dbDir")
        # Delete any existing output file before the run.
        if os.path.exists(output_filename):
            os.remove(output_filename)

        annotator = Annotator()
        run_spec = RunSpecificationFactory.create_run_spec("SEG_FILE", "GENE_LIST", inputFilename, output_filename,
                                                           datasourceDir=db_dir, annotating_type=RunSpecification.ANNOTATE_SEGMENTS)
        annotator.initialize(run_spec)
        annotator.annotate()

        # Now check the output
        output_reader = GenericTsvReader(output_filename)

        # Not asserted on below; kept in case getFieldNames() affects the
        # reader's state -- TODO confirm and remove.
        headers = output_reader.getFieldNames()

        for line_dict in output_reader:
            self.assertIsNotNone(line_dict['segment_start'])
            self.assertNotEqual(line_dict['segment_start'].strip(), "")
            if line_dict['segment_end_gene'] == "MAPK1":
                # Strip once; the value feeds both the check and the message.
                end_exon = line_dict['segment_end_exon'].strip()
                self.assertEqual(end_exon, "8+", "Should have been 8+, but saw: %s" % end_exon)
    def test_reannotating_actual_file(self):
        """Test that we can take in a file, annotate, similar to M2 process, and collapse values.

        Step 1 annotates a VCF into a TCGA MAF with ONP inference off.  Step 2
        re-annotates that TCGA MAF into another TCGA MAF with ONP inference and
        collapsing of numeric annotations on, reusing the datasources of step 1.
        """
        input_filename = "testdata/m2_support/phasingExample.vcf"
        midpoint_output_filename = "out/m2_support/reannotating_tcga_maf_midpoint.maf.annotated"
        output_filename = "out/m2_support/reannotating_tcga_maf.maf.annotated"

        # Step 1 does not infer ONPs; the midpoint checks below rely on
        # i_QSS not being '|'-joined yet.
        options_step1 = {
            OptionConstants.COLLAPSE_FILTER_COLS: True,
            OptionConstants.NO_PREPEND: False,
            OptionConstants.SPLIT_ALLELIC_DEPTH: True,
            OptionConstants.INFER_ONPS: False
        }

        # Step 2 infers ONPs and also exercises collapsing of numeric values.
        options_step2 = {
            OptionConstants.REANNOTATE_TCGA_MAF_COLS: True,
            OptionConstants.INFER_ONPS: True,
            OptionConstants.ALLOW_ANNOTATION_OVERWRITING: True,
            OptionConstants.NO_PREPEND: False,
            OptionConstants.COLLAPSE_NUMBER_ANNOTATIONS: True
        }

        run_spec_step1 = RunSpecificationFactory.create_run_spec(
            "VCF",
            "TCGAMAF",
            input_filename,
            midpoint_output_filename,
            is_skip_no_alts=True,
            other_opts=options_step1,
            datasource_dir=self._determine_db_dir())

        annotator = Annotator()
        annotator.initialize(run_spec_step1)
        annotator.annotate()

        # To speed up this test, use the same datasources from step 1
        ds_list = run_spec_step1.get_datasources()

        tsv_reader = GenericTsvReader(midpoint_output_filename)
        i = -1  # stays -1 when the file holds no mutations, so the count check reports 0
        for i, line in enumerate(tsv_reader):
            self.assertNotIn(
                "|", line["i_QSS"],
                "i_QSS annotation should not have a '|' in it in mutation: " + str(i + 1))
        self.assertEqual(
            i + 1, 3,
            'Mutation count flawed... should have been three mutations: ' + str(i + 1))

        run_spec_step2 = RunSpecificationFactory.create_run_spec_given_datasources(
            "TCGAMAF",
            "TCGAMAF",
            midpoint_output_filename,
            output_filename,
            other_opts=options_step2,
            datasource_list=ds_list)

        annotator.initialize(run_spec_step2)
        annotator.annotate()

        # Ground truth, one entry per expected output mutation.
        gt_alt_count = [80, 7]
        gt_alt_count_full = ["82|80", "7"]
        gt_ref_count = [68, 151]

        # Please note that this is not "68|68" since these were collapsed by ONP combiner.
        gt_ref_count_full = ["68", "151"]

        gt_tumor_f = [.5375, .046]
        gt_tumor_f_full = ["0.538|0.537", "0.046"]

        tsv_reader = GenericTsvReader(output_filename)
        i = -1
        for i, line in enumerate(tsv_reader):
            # Fail with a readable message rather than an IndexError on the
            # ground-truth lists if extra mutations show up.
            self.assertLess(
                i, len(gt_alt_count),
                'Mutation count flawed... should have been two mutations, but found at least: ' + str(i + 1))

            doubly_prefixed = [ks for ks in line.keys() if ks.startswith('i_i_')]
            self.assertEqual(doubly_prefixed, [], "i_i_ prefix found.")
            if i == 0:
                self.assertIn(
                    "|", line["i_QSS"],
                    "i_QSS tag should have a '|' in it for the first mutation")
            self.assertEqual(int(line['t_alt_count']), gt_alt_count[i])
            self.assertEqual(int(line['t_ref_count']), gt_ref_count[i])
            # NOTE(review): exact float comparison -- relies on the rendered text
            # parsing to exactly the literal in gt_tumor_f.
            self.assertEqual(float(line['i_tumor_f']), gt_tumor_f[i])

            self.assertEqual(line['i_t_alt_count_full'], gt_alt_count_full[i])
            self.assertEqual(line['i_t_ref_count_full'], gt_ref_count_full[i])
            self.assertEqual(line['i_tumor_f_full'], gt_tumor_f_full[i])

        self.assertEqual(
            i + 1, 2,
            'Mutation count flawed... should have been two mutations: ' + str(i + 1))
Example #34
0
def main(argv=None):  # IGNORE:C0111
    """Command line entry point: parse options, set up logging and run the annotator.

    If ``argv`` is given, its items are appended to ``sys.argv`` before the
    options are parsed.  Returns 0 after a completed run and also 0 when the
    run is interrupted by ``KeyboardInterrupt``.
    """
    from oncotator.utils.OncotatorCLIUtils import OncotatorCLIUtils
    from oncotator.Annotator import Annotator

    if argv is not None:
        sys.argv.extend(argv)

    program_version = "%s" % __version__
    program_version_message = '%%(prog)s %s' % program_version

    try:
        args = parseOptions(program_version_message)
        verbose = args.verbose
        if verbose > 0:
            print("Verbose mode on")

        logFilename = args.log_name  # 'oncotator.log'

        # Create a basic logger to a file
        loggingFormat = '%(asctime)s %(levelname)s [%(name)s:%(lineno)d] %(message)s'
        logging.basicConfig(filename=logFilename, level=logging.INFO, format=loggingFormat)

        # Add a console handler to the root logger, which means that all loggers generated
        #    will also write to the console, in the same format as the log file.
        #    The console only shows warnings and above unless verbose mode is on.
        ch = logging.StreamHandler()
        ch.setLevel(logging.INFO if verbose else logging.WARNING)
        ch.setFormatter(logging.Formatter(loggingFormat))

        if verbose:
            print("Path:")
            print(sys.path)
            print(" ")

        logging.getLogger('').addHandler(ch)

        logger = logging.getLogger(__name__)
        logger.info("Oncotator %s", program_version)
        logger.info("Args: %s", args)
        logger.info("Log file: %s", os.path.abspath(logFilename))

        if DEBUG:
            logger.setLevel(logging.DEBUG)

        # Initiate an Oncotator session.
        inputFilename = os.path.expanduser(args.input_file)
        outputFilename = os.path.expanduser(args.output_file)
        inputFormat = args.input_format.upper()
        outputFormat = args.output_format.upper()

        datasourceDir = os.path.expanduser(args.dbDir)
        cache_url = args.cache_url
        read_only_cache = args.read_only_cache
        tx_mode = args.tx_mode
        is_skip_no_alts = args.skip_no_alt
        genome_build = args.genome_build

        # Parse annotation overrides.  A missing config file is dropped with a warning.
        commandLineManualOverrides = args.override_cli
        overrideConfigFile = args.override_config
        if overrideConfigFile is not None and not os.path.exists(overrideConfigFile):
            logger.warning("Could not find %s   ... proceeding anyway.", overrideConfigFile)
            overrideConfigFile = None
        manualOverrides = OncotatorCLIUtils.determineAllAnnotationValues(commandLineManualOverrides, overrideConfigFile)

        # Parse default overrides.  A missing file only rates a warning when it is
        # not the built-in default path.
        commandLineDefaultValues = args.default_cli
        defaultConfigFile = args.default_config
        if defaultConfigFile is not None and not os.path.exists(defaultConfigFile):
            if defaultConfigFile != DEFAULT_DEFAULT_ANNOTATIONS:
                logger.warning("Could not find %s   ... proceeding anyway.", defaultConfigFile)
            else:
                logger.info(
                    "Could not find Broad-specific %s   ... proceeding without any default annotations.  __UNKNOWN__ may appear in TCGA MAF outputs.",
                    defaultConfigFile)
            defaultConfigFile = None
        defaultValues = OncotatorCLIUtils.determineAllAnnotationValues(commandLineDefaultValues, defaultConfigFile)

        # Create a run configuration to pass to the Annotator class.
        annotating_type = None
        if inputFormat == "SEG_FILE":
            annotating_type = RunSpecification.ANNOTATE_SEGMENTS
        runConfig = RunSpecificationFactory.create_run_spec(inputFormat, outputFormat, inputFilename, outputFilename,
                                                      globalAnnotations=manualOverrides, datasourceDir=datasourceDir,
                                                      isMulticore=(not args.noMulticore),
                                                      defaultAnnotations=defaultValues, cacheUrl=cache_url,
                                                      read_only_cache=read_only_cache, tx_mode=tx_mode,
                                                      is_skip_no_alts=is_skip_no_alts, genomeBuild=genome_build,
                                                      other_opts=determineOtherOptions(args), annotating_type=annotating_type)

        annotator = Annotator()
        annotator.initialize(runConfig)
        annotator.annotate()

        return 0
    except KeyboardInterrupt:
        ### handle keyboard interrupt ###
        return 0
Example #35
0
def main(argv=None):  # IGNORE:C0111
    """Command line entry point: parse options, set up logging and run the annotator.

    If ``argv`` is given, its items are appended to ``sys.argv`` before the
    options are parsed.  Returns 0 after a completed run and also 0 when the
    run is interrupted by ``KeyboardInterrupt``.
    """
    from oncotator.utils.OncotatorCLIUtils import OncotatorCLIUtils
    from oncotator.Annotator import Annotator

    if argv is not None:
        sys.argv.extend(argv)

    program_version = "%s" % __version__
    program_version_message = '%%(prog)s %s' % program_version

    try:
        args = parseOptions(program_version_message)
        verbose = args.verbose
        if verbose > 0:
            print("Verbose mode on")

        logFilename = args.log_name  # 'oncotator.log'

        # Create a basic logger to a file
        loggingFormat = '%(asctime)s %(levelname)s [%(name)s:%(lineno)d] %(message)s'
        logging.basicConfig(filename=logFilename,
                            level=logging.INFO,
                            format=loggingFormat)

        # Add a console handler to the root logger, which means that all loggers generated
        #    will also write to the console, in the same format as the log file.
        #    The console only shows warnings and above unless verbose mode is on.
        ch = logging.StreamHandler()
        ch.setLevel(logging.INFO if verbose else logging.WARNING)
        ch.setFormatter(logging.Formatter(loggingFormat))

        if verbose:
            print("Path:")
            print(sys.path)
            print(" ")

        logging.getLogger('').addHandler(ch)

        logger = logging.getLogger(__name__)
        logger.info("Oncotator %s", program_version)
        logger.info("Args: %s", args)
        logger.info("Log file: %s", os.path.abspath(logFilename))

        if DEBUG:
            logger.setLevel(logging.DEBUG)

        if not NGSLIB_INSTALLED:
            logger.warning(
                "ngslib module not installed.  Will be unable to annotate with BigWig datasources."
            )

        # Initiate an Oncotator session.
        inputFilename = os.path.expanduser(args.input_file)
        outputFilename = os.path.expanduser(args.output_file)
        inputFormat = args.input_format.upper()
        outputFormat = args.output_format.upper()

        datasourceDir = os.path.expanduser(args.dbDir)
        cache_url = args.cache_url
        read_only_cache = args.read_only_cache
        tx_mode = args.tx_mode
        is_skip_no_alts = args.skip_no_alt
        genome_build = args.genome_build

        # Parse annotation overrides.  A missing config file is dropped with a warning.
        commandLineManualOverrides = args.override_cli
        overrideConfigFile = args.override_config
        if overrideConfigFile is not None and not os.path.exists(
                overrideConfigFile):
            logger.warning("Could not find %s   ... proceeding anyway.",
                           overrideConfigFile)
            overrideConfigFile = None
        manualOverrides = OncotatorCLIUtils.determineAllAnnotationValues(
            commandLineManualOverrides, overrideConfigFile)

        # Parse default overrides.  A missing file only rates a warning when it is
        # not the built-in default path.
        commandLineDefaultValues = args.default_cli
        defaultConfigFile = args.default_config
        if defaultConfigFile is not None and not os.path.exists(
                defaultConfigFile):
            if defaultConfigFile != DEFAULT_DEFAULT_ANNOTATIONS:
                logger.warning("Could not find %s   ... proceeding anyway.",
                               defaultConfigFile)
            else:
                logger.info(
                    "Could not find Broad-specific %s   ... proceeding without any default annotations.  __UNKNOWN__ may appear in TCGA MAF outputs.",
                    defaultConfigFile)
            defaultConfigFile = None
        defaultValues = OncotatorCLIUtils.determineAllAnnotationValues(
            commandLineDefaultValues, defaultConfigFile)

        # Create a run configuration to pass to the Annotator class.
        annotating_type = None
        if inputFormat == "SEG_FILE":
            annotating_type = RunSpecification.ANNOTATE_SEGMENTS
        runConfig = RunSpecificationFactory.create_run_spec(
            inputFormat,
            outputFormat,
            inputFilename,
            outputFilename,
            globalAnnotations=manualOverrides,
            datasourceDir=datasourceDir,
            isMulticore=(not args.noMulticore),
            defaultAnnotations=defaultValues,
            cacheUrl=cache_url,
            read_only_cache=read_only_cache,
            tx_mode=tx_mode,
            is_skip_no_alts=is_skip_no_alts,
            genomeBuild=genome_build,
            other_opts=determineOtherOptions(args),
            annotating_type=annotating_type)

        annotator = Annotator()
        annotator.initialize(runConfig)
        annotator.annotate()

        return 0
    except KeyboardInterrupt:
        ### handle keyboard interrupt ###
        return 0
    def test_reannotating_actual_file(self):
        """Test that we can take in a file, annotate, similar to M2 process, and collapse values.

        Step 1 annotates a VCF into a TCGA MAF with ONP inference off.  Step 2
        re-annotates that TCGA MAF into another TCGA MAF with ONP inference and
        collapsing of numeric annotations on, reusing the datasources of step 1.
        """
        input_filename = "testdata/m2_support/phasingExample.vcf"
        midpoint_output_filename = "out/m2_support/reannotating_tcga_maf_midpoint.maf.annotated"
        output_filename = "out/m2_support/reannotating_tcga_maf.maf.annotated"

        # Step 1 does not infer ONPs; the midpoint checks below rely on
        # i_QSS not being '|'-joined yet.
        options_step1 = {OptionConstants.COLLAPSE_FILTER_COLS: True, OptionConstants.NO_PREPEND: False,
                         OptionConstants.SPLIT_ALLELIC_DEPTH: True, OptionConstants.INFER_ONPS: False}

        # Step 2 infers ONPs and also exercises collapsing of numeric values.
        options_step2 = {OptionConstants.REANNOTATE_TCGA_MAF_COLS: True, OptionConstants.INFER_ONPS: True,
                         OptionConstants.ALLOW_ANNOTATION_OVERWRITING: True, OptionConstants.NO_PREPEND: False,
                         OptionConstants.COLLAPSE_NUMBER_ANNOTATIONS: True}

        run_spec_step1 = RunSpecificationFactory.create_run_spec("VCF", "TCGAMAF", input_filename, midpoint_output_filename,
                                                                 is_skip_no_alts=True, other_opts=options_step1,
                                                                 datasource_dir=self._determine_db_dir())

        annotator = Annotator()
        annotator.initialize(run_spec_step1)
        annotator.annotate()

        # To speed up this test, use the same datasources from step 1
        ds_list = run_spec_step1.get_datasources()

        tsv_reader = GenericTsvReader(midpoint_output_filename)
        i = -1  # stays -1 when the file holds no mutations, so the count check reports 0
        for i, line in enumerate(tsv_reader):
            self.assertNotIn("|", line["i_QSS"],
                             "i_QSS annotation should not have a '|' in it in mutation: " + str(i+1))
        self.assertEqual(i + 1, 3, 'Mutation count flawed... should have been three mutations: ' + str(i+1))

        run_spec_step2 = RunSpecificationFactory.create_run_spec_given_datasources("TCGAMAF", "TCGAMAF", midpoint_output_filename, output_filename,
                                                                 other_opts=options_step2, datasource_list=ds_list)

        annotator.initialize(run_spec_step2)
        annotator.annotate()

        # Ground truth, one entry per expected output mutation.
        gt_alt_count = [80, 7]
        gt_alt_count_full = ["82|80", "7"]
        gt_ref_count = [68, 151]

        # Please note that this is not "68|68" since these were collapsed by ONP combiner.
        gt_ref_count_full = ["68", "151"]

        gt_tumor_f = [.5375, .046]
        gt_tumor_f_full = ["0.538|0.537", "0.046"]

        tsv_reader = GenericTsvReader(output_filename)
        i = -1
        for i, line in enumerate(tsv_reader):
            # Fail with a readable message rather than an IndexError on the
            # ground-truth lists if extra mutations show up.
            self.assertLess(i, len(gt_alt_count),
                            'Mutation count flawed... should have been two mutations, but found at least: ' + str(i+1))

            doubly_prefixed = [ks for ks in line.keys() if ks.startswith('i_i_')]
            self.assertEqual(doubly_prefixed, [], "i_i_ prefix found.")
            if i == 0:
                self.assertIn("|", line["i_QSS"], "i_QSS tag should have a '|' in it for the first mutation")
            self.assertEqual(int(line['t_alt_count']), gt_alt_count[i])
            self.assertEqual(int(line['t_ref_count']), gt_ref_count[i])
            # NOTE(review): exact float comparison -- relies on the rendered text
            # parsing to exactly the literal in gt_tumor_f.
            self.assertEqual(float(line['i_tumor_f']), gt_tumor_f[i])

            self.assertEqual(line['i_t_alt_count_full'], gt_alt_count_full[i])
            self.assertEqual(line['i_t_ref_count_full'], gt_ref_count_full[i])
            self.assertEqual(line['i_tumor_f_full'], gt_tumor_f_full[i])

        self.assertEqual(i + 1, 2, 'Mutation count flawed... should have been two mutations: ' + str(i+1))