def testInitializingDatasources(self):
    """Create the datasources of the configured db dir in multicore and single-core mode.

    Each run must produce a non-empty list holding only Datasource instances, and
    both runs must produce the same number of datasources.  This test is RAM
    intensive and requires the default data corpus.
    """
    db_dir = self.config.get("DEFAULT", "dbDir")

    # Multicore pass.
    multiDS = DatasourceFactory.createDatasources(db_dir, "hg19", isMulticore=True)
    self.assertTrue(multiDS is not None, "Datasource list was None")
    numMultiDS = len(multiDS)
    self.assertTrue(numMultiDS != 0, "Datasource list was empty")
    for idx in range(numMultiDS):
        self.assertTrue(multiDS[idx] is not None,
                        "multi core datasource was None:  " + str(idx))
        self.assertTrue(isinstance(multiDS[idx], Datasource))

    # Only the count is needed from here on; drop the list to keep memory usage down.
    del multiDS

    # Single-core pass.
    singleCoreDS = DatasourceFactory.createDatasources(db_dir, "hg19", isMulticore=False)
    self.assertTrue(singleCoreDS is not None, "Datasource list was None")
    num_single_ds = len(singleCoreDS)
    self.assertTrue(num_single_ds != 0, "Datasource list was empty")
    for idx in range(num_single_ds):
        self.assertTrue(singleCoreDS[idx] is not None,
                        "single core datasource was None:  " + str(idx))
        self.assertTrue(isinstance(singleCoreDS[idx], Datasource))

    self.assertTrue(
        numMultiDS == num_single_ds,
        "Length of single core datasource list was not the same as multicore")
    del singleCoreDS
 def testMulticoreNoDatasources(self):
     """Multicore creation on a db dir without datasources must return (not hang) with an empty result."""
     empty_db_dir = 'testdata/maflite/'
     multiDS = DatasourceFactory.createDatasources(empty_db_dir, "hg19", True)
     num_found = len(multiDS)
     self.assertTrue(
         num_found == 0,
         "Length of multiDS when there were no datasources was not zero.")
Exemple #3
0
    def testAnnotateListOfMutations(self):
        """Test that an Annotator can be initialized from a runspec without input or output.

        A single hand-built mutation is fed through annotate_mutations and the
        first annotated result must carry a "gene" annotation.
        """

        # Locate the datasource directory and create a runspec
        dbDir = self.config.get("DEFAULT", "dbDir")
        ds = DatasourceFactory.createDatasources(dbDir)
        runSpec = RunSpecification()
        runSpec.initialize(None, None, datasources=ds)

        # Initialize the annotator with the runspec
        annotator = Annotator()
        annotator.initialize(runSpec)

        m = MutationData()
        m.chr = "1"
        m.start = "12941796"
        m.end = "12941796"
        m.alt_allele = "G"
        m.ref_allele = "T"

        annotated_muts = annotator.annotate_mutations([m])

        # The result is consumed as an iterator.  The builtin next() works on both
        # Python 2.6+ and Python 3, unlike the Python 2-only .next() method.
        m2 = next(annotated_muts)
        self.assertTrue(m2.get("gene", None) is not None)
Exemple #4
0
    def testAnnotateListOfMutations(self):
        """Test that an Annotator can be initialized from a runspec without input or output.

        A single hand-built mutation is fed through annotate_mutations and the
        first annotated result must carry a "gene" annotation.
        """

        # Locate the datasource directory and create a runspec
        dbDir = self.config.get("DEFAULT", "dbDir")
        ds = DatasourceFactory.createDatasources(dbDir)
        runSpec = RunSpecification()
        runSpec.initialize(None, None, datasources=ds)

        # Initialize the annotator with the runspec
        annotator = Annotator()
        annotator.initialize(runSpec)

        m = MutationData()
        m.chr = "1"
        m.start = "12941796"
        m.end = "12941796"
        m.alt_allele = "G"
        m.ref_allele = "T"

        annotated_muts = annotator.annotate_mutations([m])

        # The result is consumed as an iterator.  The builtin next() works on both
        # Python 2.6+ and Python 3, unlike the Python 2-only .next() method.
        m2 = next(annotated_muts)
        self.assertTrue(m2.get("gene", None) is not None)
Exemple #5
0
    def test_simple_transcript_annotation(self):
        """Test web api backend call /transcript/ : look up one transcript by id and check its gene."""
        # http://www.broadinstitute.org/oncotator/transcript/ENST00000215832.6/
        db_dir = self._determine_db_dir()
        datasources = DatasourceFactory.createDatasources(db_dir, "hg19", isMulticore=False)
        annotator = Annotator()
        for datasource in datasources:
            annotator.addDatasource(datasource)

        transcript = annotator.retrieve_transcript_by_id("ENST00000215832.6")
        self.assertTrue(transcript is not None)
        gene_symbol = transcript.get_gene()
        self.assertTrue(gene_symbol == "MAPK1")
Exemple #6
0
    def test_querying_transcripts_by_genes(self):
        """Test that more than three transcripts come back for the gene set MAPK1 + PIK3CA."""

        db_dir = self._determine_db_dir()
        datasources = DatasourceFactory.createDatasources(db_dir, "hg19", isMulticore=False)
        annotator = Annotator()
        for datasource in datasources:
            annotator.addDatasource(datasource)

        # Step 1 get all of the relevant transcripts
        genes = ["MAPK1", "PIK3CA"]
        txs = annotator.retrieve_transcripts_by_genes(genes)
        num_txs = len(txs)
        self.assertTrue(num_txs > 3)
Exemple #7
0
    def test_simple_transcript_annotation(self):
        """Test web api backend call /transcript/ : look up one transcript by id and check its gene."""
        # http://www.broadinstitute.org/oncotator/transcript/ENST00000215832.6/
        db_dir = self._determine_db_dir()
        datasources = DatasourceFactory.createDatasources(db_dir, "hg19", isMulticore=False)
        annotator = Annotator()
        for datasource in datasources:
            annotator.addDatasource(datasource)

        transcript = annotator.retrieve_transcript_by_id("ENST00000215832.6")
        self.assertTrue(transcript is not None)
        gene_symbol = transcript.get_gene()
        self.assertTrue(gene_symbol == "MAPK1")
Exemple #8
0
    def test_querying_transcripts_by_genes(self):
        """Test that more than three transcripts come back for the gene set MAPK1 + PIK3CA."""

        db_dir = self._determine_db_dir()
        datasources = DatasourceFactory.createDatasources(db_dir, "hg19", isMulticore=False)
        annotator = Annotator()
        for datasource in datasources:
            annotator.addDatasource(datasource)

        # Step 1 get all of the relevant transcripts
        genes = ["MAPK1", "PIK3CA"]
        txs = annotator.retrieve_transcripts_by_genes(genes)
        num_txs = len(txs)
        self.assertTrue(num_txs > 3)
Exemple #9
0
    def test_simple_genes_by_gene_annotation(self):
        """Test web api backend call /gene/ : annotating the MAPK1 transcripts yields exactly one gene entry."""
        # http://www.broadinstitute.org/oncotator/gene/MAPK1/
        db_dir = self._determine_db_dir()
        datasources = DatasourceFactory.createDatasources(db_dir, "hg19", isMulticore=False)
        annotator = Annotator()
        for datasource in datasources:
            annotator.addDatasource(datasource)

        gene_txs = annotator.retrieve_transcripts_by_genes(["MAPK1"])
        self.assertTranscriptsFound(gene_txs)

        mut_dict = annotator.annotate_genes_given_txs(gene_txs)
        num_genes = len(mut_dict.keys())
        self.assertTrue(num_genes == 1)
Exemple #10
0
    def test_simple_genes_by_gene_annotation(self):
        """Test web api backend call /gene/ : annotating the MAPK1 transcripts yields exactly one gene entry."""
        # http://www.broadinstitute.org/oncotator/gene/MAPK1/
        db_dir = self._determine_db_dir()
        datasources = DatasourceFactory.createDatasources(db_dir, "hg19", isMulticore=False)
        annotator = Annotator()
        for datasource in datasources:
            annotator.addDatasource(datasource)

        gene_txs = annotator.retrieve_transcripts_by_genes(["MAPK1"])
        self.assertTranscriptsFound(gene_txs)

        mut_dict = annotator.annotate_genes_given_txs(gene_txs)
        num_genes = len(mut_dict.keys())
        self.assertTrue(num_genes == 1)
    def testInitializingDatasources(self):
        """Create the datasources of the configured db dir, first multicore and then single core.

        Each run must give a non-empty list of Datasource instances and both runs
        must give the same number of entries.  This test is RAM intensive and
        requires the default data corpus.
        """
        db_dir = self.config.get("DEFAULT", "dbDir")

        # Number of datasources per run: index 0 is multicore, index 1 is single core.
        counts = []
        for is_multicore, label in ((True, "multi core"), (False, "single core")):
            ds_list = DatasourceFactory.createDatasources(db_dir, "hg19", isMulticore=is_multicore)
            self.assertTrue(ds_list is not None, "Datasource list was None")
            self.assertTrue(len(ds_list) != 0, "Datasource list was empty")
            for i in range(0, len(ds_list)):
                self.assertTrue(ds_list[i] is not None, label + " datasource was None:  " + str(i))
                self.assertTrue(isinstance(ds_list[i], Datasource))

            # This test can be memory intensive, so keep only the count and drop the list
            # before the next set of datasources is created.
            counts.append(len(ds_list))
            del ds_list

        self.assertTrue(counts[0] == counts[1], "Length of single core datasource list was not the same as multicore")
Exemple #12
0
    def test_simple_genes_by_region_annotation(self):
        """Test web api backend call /genes/ : annotate every gene found in a chr22 region.

        For each gene-level mutation the annotations that the API would expose are
        read (a missing annotation fails the test on lookup); only a few of them
        are asserted on.
        """
        # http://www.broadinstitute.org/oncotator/genes/chr22_22112223_22312558/
        # Two genes: chr22:22,112,223-22,312,558
        datasource_list = DatasourceFactory.createDatasources(self._determine_db_dir(), "hg19", isMulticore=False)
        annotator = Annotator()
        for ds in datasource_list:
            annotator.addDatasource(ds)

        # Here is what the API would call....
        txs = annotator.retrieve_transcripts_by_region("22", 22112223, 22312558)
        self.assertTranscriptsFound(txs)

        mut_dict = annotator.annotate_genes_given_txs(txs)

        # Each mut will be for a separate gene
        for gene in mut_dict.keys():
            mut = mut_dict[gene]
            alt_accessions = mut["UniProt_alt_uniprot_accessions"].split("|")
            tcgascape_amp_peaks = mut["TCGAScape_Amplification_Peaks"].split("|")
            tcgascape_del_peaks = mut["TCGAScape_Deletion_Peaks"].split("|")
            tumorscape_amp_peaks = mut["TUMORScape_Amplification_Peaks"].split("|")
            tumorscape_del_peaks = mut["TUMORScape_Deletion_Peaks"].split("|")
            full_name = mut["HGNC_Approved Name"]
            # NOTE(review): both entries read the same annotation; "total_alterations_in_gene"
            # looks like a copy-paste slip and presumably wants a different COSMIC key -- confirm.
            cosmic = {
                "tissue_types_affected": mut["COSMIC_Tissue_tissue_types_affected"],
                "total_alterations_in_gene": mut["COSMIC_Tissue_tissue_types_affected"],
            }
            # Pass the two lists as separate arguments so chain() flattens them into a
            # single list of aliases (chain([a, b]) would just yield the two lists).
            alt_aliases = list(
                itertools.chain(mut["HGNC_Previous Symbols"].split(", "), mut["HGNC_Synonyms"].split(", "))
            )
            location = mut["HGNC_Chromosome"]
            uniprot_accession = mut["UniProt_uniprot_accession"]
            transcripts = mut["transcripts"]
            self.assertTrue(transcripts is not None)
            self.assertTrue(len(transcripts) > 0)
            self.assertTrue(transcripts.startswith("ENST"))
            strand = mut["strand"]
            klass = mut["class"]
            uniprot_experimentals = mut["UniProt_AA_experimental_info"].split("|")
            self.assertTrue(uniprot_experimentals is not None)
            uniprot_natural_variations = mut["UniProt_AA_natural_variation"].split("|")
            uniprot_regions = mut["UniProt_AA_region"].split("|")
            uniprot_sites = mut["UniProt_AA_site"].split("|")
            uniprot_go_biological_processes = mut["UniProt_GO_Biological_Process"].split("|")
            uniprot_go_cellular_components = mut["UniProt_GO_Cellular_Component"].split("|")
            self.assertTrue(uniprot_go_cellular_components is not None)
            uniprot_go_molecular_functions = mut["UniProt_GO_Molecular_Function"].split("|")
Exemple #13
0
    def test_querying_transcripts_by_region(self):
        """Test web api backend call /transcripts/ : query a chr4 region and walk every transcript found."""
        db_dir = self._determine_db_dir()
        datasources = DatasourceFactory.createDatasources(db_dir, "hg19", isMulticore=False)
        annotator = Annotator()
        for datasource in datasources:
            annotator.addDatasource(datasource)
        txs = annotator.retrieve_transcripts_by_region("4", 50164411, 60164411)
        self.assertTranscriptsFound(txs)

        ## Here is an example of getting enough data to populate the json in doc/transcript_json_commented.json.txt
        # None of these values are validated.
        for tx in txs:
            transcript_id = tx.get_transcript_id()
            tx_start = tx.determine_transcript_start()
            tx_end = tx.determine_transcript_stop()
            gene = tx.get_gene()
            contig = tx.get_contig()
            n_exons = len(tx.get_exons())
            strand = tx.get_strand()
            footprint_start, footprint_end = tx.determine_cds_footprint()
            klass = tx.get_gene_type()
            cds_start = tx.determine_cds_start()
            cds_end = tx.determine_cds_stop()
            gene_id = tx.get_gene_id()

            # One [start, end] pair per exon.
            genomic_coords = []
            for exon in tx.get_exons():
                genomic_coords.append([exon[0], exon[1]])

            # One single-element list per exon holding the converted coordinates.
            transcript_coords = []
            for exon in tx.get_exons():
                exon_space = TranscriptProviderUtils.convert_genomic_space_to_exon_space(
                    exon[0] + 1, exon[1], tx)
                transcript_coords.append([exon_space])
            code_len = int(cds_end) - int(cds_start) + 1

            # If refseq datasources are not available, this will fail.
            # Step 2 annotate the transcript, which produces a dummy mutation with the refseq annotations.
            dummy_mut = annotator.annotate_transcript(tx)
            refseq_mRNA_id = dummy_mut["gencode_xref_refseq_mRNA_id"]
            refseq_prot_id = dummy_mut["gencode_xref_refseq_prot_acc"]

            # Description is unavailable right now
            description = ""

            self.assertTrue(refseq_mRNA_id is not None)
            self.assertTrue(refseq_prot_id is not None)
            self.assertTrue(len(transcript_coords) == n_exons)
Exemple #14
0
    def test_simple_genes_by_region_annotation(self):
        """Test web api backend call /genes/ : annotate every gene found in a chr22 region.

        For each gene-level mutation the annotations that the API would expose are
        read (a missing annotation fails the test on lookup); only a few of them
        are asserted on.
        """
        # http://www.broadinstitute.org/oncotator/genes/chr22_22112223_22312558/
        # Two genes: chr22:22,112,223-22,312,558
        datasource_list = DatasourceFactory.createDatasources(self._determine_db_dir(), "hg19", isMulticore=False)
        annotator = Annotator()
        for ds in datasource_list:
            annotator.addDatasource(ds)

        # Here is what the API would call....
        txs = annotator.retrieve_transcripts_by_region("22", 22112223, 22312558)
        self.assertTranscriptsFound(txs)

        mut_dict = annotator.annotate_genes_given_txs(txs)

        # Each mut will be for a separate gene
        for gene in mut_dict.keys():
            mut = mut_dict[gene]
            alt_accessions = mut['UniProt_alt_uniprot_accessions'].split("|")
            tcgascape_amp_peaks = mut['TCGAScape_Amplification_Peaks'].split("|")
            tcgascape_del_peaks = mut['TCGAScape_Deletion_Peaks'].split("|")
            tumorscape_amp_peaks = mut['TUMORScape_Amplification_Peaks'].split("|")
            tumorscape_del_peaks = mut['TUMORScape_Deletion_Peaks'].split("|")
            full_name = mut['HGNC_Approved Name']
            # NOTE(review): both entries read the same annotation; "total_alterations_in_gene"
            # looks like a copy-paste slip and presumably wants a different COSMIC key -- confirm.
            cosmic = {"tissue_types_affected": mut['COSMIC_Tissue_tissue_types_affected'], "total_alterations_in_gene": mut["COSMIC_Tissue_tissue_types_affected"]}
            # Pass the two lists as separate arguments so chain() flattens them into a
            # single list of aliases (chain([a, b]) would just yield the two lists).
            alt_aliases = list(itertools.chain(mut["HGNC_Previous Symbols"].split(", "), mut["HGNC_Synonyms"].split(", ")))
            location = mut["HGNC_Chromosome"]
            uniprot_accession = mut["UniProt_uniprot_accession"]
            transcripts = mut['transcripts']
            self.assertTrue(transcripts is not None)
            self.assertTrue(len(transcripts) > 0)
            self.assertTrue(transcripts.startswith('ENST'))
            strand = mut['strand']
            klass = mut['class']
            uniprot_experimentals = mut['UniProt_AA_experimental_info'].split("|")
            self.assertTrue(uniprot_experimentals is not None)
            uniprot_natural_variations = mut['UniProt_AA_natural_variation'].split("|")
            uniprot_regions = mut['UniProt_AA_region'].split("|")
            uniprot_sites = mut['UniProt_AA_site'].split("|")
            uniprot_go_biological_processes = mut["UniProt_GO_Biological_Process"].split("|")
            uniprot_go_cellular_components = mut["UniProt_GO_Cellular_Component"].split("|")
            self.assertTrue(uniprot_go_cellular_components is not None)
            uniprot_go_molecular_functions = mut["UniProt_GO_Molecular_Function"].split("|")
Exemple #15
0
    def test_querying_transcripts_by_region(self):
        """Test web api backend call /transcripts/ : query a chr4 region and walk every transcript found."""
        db_dir = self._determine_db_dir()
        datasources = DatasourceFactory.createDatasources(db_dir, "hg19", isMulticore=False)
        annotator = Annotator()
        for datasource in datasources:
            annotator.addDatasource(datasource)
        txs = annotator.retrieve_transcripts_by_region("4", 50164411, 60164411)
        self.assertTranscriptsFound(txs)

        ## Here is an example of getting enough data to populate the json in doc/transcript_json_commented.json.txt
        # None of these values are validated.
        for tx in txs:
            transcript_id = tx.get_transcript_id()
            tx_start = tx.determine_transcript_start()
            tx_end = tx.determine_transcript_stop()
            gene = tx.get_gene()
            contig = tx.get_contig()
            n_exons = len(tx.get_exons())
            strand = tx.get_strand()
            footprint_start, footprint_end = tx.determine_cds_footprint()
            klass = tx.get_gene_type()
            cds_start = tx.determine_cds_start()
            cds_end = tx.determine_cds_stop()
            gene_id = tx.get_gene_id()

            # One [start, end] pair per exon.
            genomic_coords = []
            for exon in tx.get_exons():
                genomic_coords.append([exon[0], exon[1]])

            # One single-element list per exon holding the converted coordinates.
            transcript_coords = []
            for exon in tx.get_exons():
                exon_space = TranscriptProviderUtils.convert_genomic_space_to_exon_space(exon[0] + 1, exon[1], tx)
                transcript_coords.append([exon_space])
            code_len = int(cds_end) - int(cds_start) + 1

            # If refseq datasources are not available, this will fail.
            # Step 2 annotate the transcript, which produces a dummy mutation with the refseq annotations.
            dummy_mut = annotator.annotate_transcript(tx)
            refseq_mRNA_id = dummy_mut["gencode_xref_refseq_mRNA_id"]
            refseq_prot_id = dummy_mut["gencode_xref_refseq_prot_acc"]

            # Description is unavailable right now
            description = ""

            self.assertTrue(refseq_mRNA_id is not None)
            self.assertTrue(refseq_prot_id is not None)
            self.assertTrue(len(transcript_coords) == n_exons)
Exemple #16
0
    def create_run_spec(inputFormat, outputFormat, inputFilename, outputFilename, globalAnnotations=None,
                        datasourceDir=None, genomeBuild="hg19", isMulticore=False, numCores=4,
                        defaultAnnotations=None, cacheUrl=None, read_only_cache=True,
                        tx_mode=TranscriptProvider.TX_MODE_CANONICAL, is_skip_no_alts=False, other_opts=None):
        """ This is a very simple interface to start an Oncotator session.  As a warning, this interface may not be supported in future versions.

        Builds the input creator, the output renderer and the datasources, sets the tx-mode on every
        TranscriptProvider datasource and returns an initialized RunSpecification.

        datasourceDir is handed unchanged to DatasourceFactory.createDatasources.  TODO: Define default location.

        A caller-supplied other_opts dict is updated in place with the is_skip_no_alts flag.

        IMPORTANT: Current implementation attempts to annotate using a default set of datasources.

        TODO: Make sure that this note above is no longer the case.  Current implementation attempts to annotate using a default set of datasources
        TODO: This method may get refactored into a separate class that handles RunConfiguration objects.
        """
        # TODO: Use dependency injection for list of name value pairs?  Otherwise, set it up as an attribute on this class.
        # TODO: Use dependency injection to return instance of the input/output classes
        # TODO: Support more than the default configs.
        # TODO: On error, list the supported formats (both input and output)
        # TODO: Make sure that we can pass in both a class and a config file, not just a class.

        if globalAnnotations is None:
            globalAnnotations = dict()
        if defaultAnnotations is None:
            defaultAnnotations = dict()
        if other_opts is None:
            other_opts = dict()

        other_opts[InputMutationCreatorOptions.IS_SKIP_ALTS] = is_skip_no_alts

        # Step 1 Initialize input and output
        inputCreator = OncotatorCLIUtils.create_input_creator(inputFilename, inputFormat, genomeBuild, other_opts)
        outputRenderer = OncotatorCLIUtils.create_output_renderer(outputFilename, outputFormat, other_opts)

        # Step 2 Datasources
        datasourceList = DatasourceFactory.createDatasources(datasourceDir, genomeBuild, isMulticore=isMulticore,
                                                             numCores=numCores, tx_mode=tx_mode)

        #TODO: Refactoring needed here to specify tx-mode (or any option not in a config file) in a cleaner way.
        logger = logging.getLogger(__name__)
        for ds in datasourceList:
            if not isinstance(ds, TranscriptProvider):
                continue
            logger.info("Setting %s %s to tx-mode of %s..." % (ds.title, ds.version, tx_mode))
            ds.set_tx_mode(tx_mode)

        run_spec = RunSpecification()
        run_spec.initialize(inputCreator, outputRenderer, manualAnnotations=globalAnnotations,
                            datasources=datasourceList, isMulticore=isMulticore, numCores=numCores,
                            defaultAnnotations=defaultAnnotations, cacheUrl=cacheUrl,
                            read_only_cache=read_only_cache, is_skip_no_alts=is_skip_no_alts)
        return run_spec
    def create_run_spec(input_format, output_format, input_filename, output_filename, global_annotations=None,
                        datasource_dir=None, genomeBuild="hg19", is_multicore=False, num_cores=4,
                        default_annotations=None, cache_url=None, read_only_cache=True,
                        tx_mode=TranscriptProvider.TX_MODE_CANONICAL, is_skip_no_alts=False, other_opts=None, annotating_type=None):
        """ This is a very simple interface to start an Oncotator session.  As a warning, this interface may not be supported in future versions.

        If datasource_dir is None (or otherwise falsy), then no datasources are used.

        Every TranscriptProvider datasource gets the requested tx-mode and, when the matching keys
        are present in other_opts, the longer-other-transcripts setting and the custom canonical
        transcript list.  The custom canonical transcript file is read as one transcript id per
        line; surrounding whitespace and a trailing ".<version>" suffix are removed and blank
        lines are skipped.

        The remaining work is delegated to RunSpecificationFactory.create_run_spec_given_datasources,
        whose result is returned.
        """
        if datasource_dir:
            datasource_list = DatasourceFactory.createDatasources(datasource_dir, genomeBuild, isMulticore=is_multicore, numCores=num_cores, tx_mode=tx_mode)
        else:
            datasource_list = []

        global_annotations = dict() if global_annotations is None else global_annotations
        default_annotations = dict() if default_annotations is None else default_annotations
        other_opts = dict() if other_opts is None else other_opts

        logger = logging.getLogger(__name__)
        longer_other_tx = other_opts.get(OptionConstants.LONGER_OTHER_TX_FIELD, None)
        cc_txs_filename = other_opts.get(OptionConstants.CUSTOM_CANONICAL_TX_LIST_FILE, None)

        # Loaded on first use so the file is only touched when a TranscriptProvider needs it,
        # and only read once no matter how many TranscriptProviders there are.
        cc_txs = None

        #TODO: Refactoring needed here to specify tx-mode (or any option not in a config file) in a cleaner way.
        for ds in datasource_list:
            if not isinstance(ds, TranscriptProvider):
                continue

            logger.info("Setting %s %s to tx-mode of %s..." % (ds.title, ds.version, tx_mode))
            ds.set_tx_mode(tx_mode)

            if longer_other_tx is not None:
                ds.set_longer_other_transcripts(longer_other_tx)

            if cc_txs_filename is None:
                logger.info("No custom canonical transcripts specified.")
                continue

            if cc_txs is None:
                # The context manager closes the file even if reading fails.  Stripping first
                # keeps the newline out of ids that carry no version suffix.
                with open(cc_txs_filename, 'r') as cc_txs_fp:
                    cc_txs = [line.strip().rsplit(".", 1)[0] for line in cc_txs_fp if line.strip()]

            # Hand every datasource its own copy of the list.
            ds.set_custom_canonical_txs(list(cc_txs))
            logger.info(str(len(cc_txs)) + " custom canonical transcripts specified.")

        return RunSpecificationFactory.create_run_spec_given_datasources(input_format, output_format, input_filename, output_filename, global_annotations,
                        datasource_list, genomeBuild, is_multicore, num_cores,
                        default_annotations, cache_url, read_only_cache,
                        tx_mode, is_skip_no_alts, other_opts, annotating_type)
    def create_run_spec(input_format, output_format, input_filename, output_filename, global_annotations=None,
                        datasource_dir=None, genomeBuild="hg19", is_multicore=False, num_cores=4,
                        default_annotations=None, cache_url=None, read_only_cache=True,
                        tx_mode=TranscriptProvider.TX_MODE_CANONICAL, is_skip_no_alts=False, other_opts=None, annotating_type=None):
        """ This is a very simple interface to start an Oncotator session.  As a warning, this interface may not be supported in future versions.

        If datasource_dir is None (or otherwise falsy), then no datasources are used.

        Every TranscriptProvider datasource gets the requested tx-mode and, when other_opts names
        a custom canonical transcript file, the list read from it.  The file is read as one
        transcript id per line; surrounding whitespace and a trailing ".<version>" suffix are
        removed and blank lines are skipped.

        The remaining work is delegated to RunSpecificationFactory.create_run_spec_given_datasources,
        whose result is returned.
        """
        if datasource_dir:
            datasource_list = DatasourceFactory.createDatasources(datasource_dir, genomeBuild, isMulticore=is_multicore, numCores=num_cores, tx_mode=tx_mode)
        else:
            datasource_list = []

        global_annotations = dict() if global_annotations is None else global_annotations
        default_annotations = dict() if default_annotations is None else default_annotations
        other_opts = dict() if other_opts is None else other_opts

        logger = logging.getLogger(__name__)
        cc_txs_filename = other_opts.get(OptionConstants.CUSTOM_CANONICAL_TX_LIST_FILE, None)

        # Loaded on first use so the file is only touched when a TranscriptProvider needs it,
        # and only read once no matter how many TranscriptProviders there are.
        cc_txs = None

        #TODO: Refactoring needed here to specify tx-mode (or any option not in a config file) in a cleaner way.
        for ds in datasource_list:
            if not isinstance(ds, TranscriptProvider):
                continue

            logger.info("Setting %s %s to tx-mode of %s..." % (ds.title, ds.version, tx_mode))
            ds.set_tx_mode(tx_mode)

            if cc_txs_filename is None:
                logger.info("No custom canonical transcripts specified.")
                continue

            if cc_txs is None:
                # The context manager closes the file even if reading fails.  Stripping first
                # keeps the newline out of ids that carry no version suffix.
                with open(cc_txs_filename, 'r') as cc_txs_fp:
                    cc_txs = [line.strip().rsplit(".", 1)[0] for line in cc_txs_fp if line.strip()]

            # Hand every datasource its own copy of the list.
            ds.set_custom_canonical_txs(list(cc_txs))
            logger.info(str(len(cc_txs)) + " custom canonical transcripts specified.")

        return RunSpecificationFactory.create_run_spec_given_datasources(input_format, output_format, input_filename, output_filename, global_annotations,
                        datasource_list, genomeBuild, is_multicore, num_cores,
                        default_annotations, cache_url, read_only_cache,
                        tx_mode, is_skip_no_alts, other_opts, annotating_type)
Exemple #19
0
 def _createDatasourceCorpus(self):
     """Return the hg19 datasources created (single core) from the configured DEFAULT dbDir."""
     return DatasourceFactory.createDatasources(self.config.get('DEFAULT', "dbDir"), "hg19", isMulticore=False)
Exemple #20
0
    def create_run_spec(inputFormat,
                        outputFormat,
                        inputFilename,
                        outputFilename,
                        globalAnnotations=None,
                        datasourceDir=None,
                        genomeBuild="hg19",
                        isMulticore=False,
                        numCores=4,
                        defaultAnnotations=None,
                        cacheUrl=None,
                        read_only_cache=True,
                        tx_mode=TranscriptProvider.TX_MODE_CANONICAL,
                        is_skip_no_alts=False,
                        other_opts=None,
                        annotating_type=None):
        """ This is a very simple interface to start an Oncotator session.  As a warning, this interface may not be supported in future versions.

        Validates the parameters, builds the input creator and output renderer, creates and
        configures the datasources and returns an initialized RunSpecification.

        If datasourceDir is None (or otherwise falsy), no datasources are created.

        A caller-supplied other_opts dict is updated in place with the is_skip_no_alts flag.

        When other_opts names a custom canonical transcript file, it is read as one transcript id
        per line; surrounding whitespace and a trailing ".<version>" suffix are removed and blank
        lines are skipped.

        Raises RunSpecificationException when validation reports a message of level ERROR or CRITICAL.

        TODO: This method may get refactored into a separate class that handles RunConfiguration objects.
        """
        # TODO: Use dependency injection for list of name value pairs?  Otherwise, set it up as an attribute on this class.
        # TODO: Use dependency injection to return instance of the input/output classes

        globalAnnotations = dict() if globalAnnotations is None else globalAnnotations
        defaultAnnotations = dict() if defaultAnnotations is None else defaultAnnotations
        other_opts = dict() if other_opts is None else other_opts

        other_opts[InputMutationCreatorOptions.IS_SKIP_ALTS] = is_skip_no_alts

        logger = logging.getLogger(__name__)

        # Step 0 Validate given parameters and log messages.  If an error or critical is found, throw an exception.
        validation_messages = RunSpecificationFactory._validate_run_spec_parameters(
            inputFormat, outputFormat, inputFilename, outputFilename,
            globalAnnotations, datasourceDir, genomeBuild, isMulticore,
            numCores, defaultAnnotations, cacheUrl, read_only_cache, tx_mode,
            is_skip_no_alts, other_opts, annotating_type)
        for msg in validation_messages:
            logger.log(msg.level, msg.message)
            if (msg.level == logging.ERROR) or (msg.level == logging.CRITICAL):
                raise RunSpecificationException(msg.message)

        # Step 1 Initialize input and output
        inputCreator = OncotatorCLIUtils.create_input_creator(
            inputFilename, inputFormat, genomeBuild, other_opts)
        outputRenderer = OncotatorCLIUtils.create_output_renderer(
            outputFilename, outputFormat, other_opts)

        # Step 2 Datasources
        if datasourceDir:
            datasource_list = DatasourceFactory.createDatasources(
                datasourceDir,
                genomeBuild,
                isMulticore=isMulticore,
                numCores=numCores,
                tx_mode=tx_mode)
        else:
            datasource_list = []

        cc_txs_filename = other_opts.get(
            OptionConstants.CUSTOM_CANONICAL_TX_LIST_FILE, None)

        # Loaded on first use so the file is only touched when a TranscriptProvider needs it,
        # and only read once no matter how many TranscriptProviders there are.
        cc_txs = None

        #TODO: Refactoring needed here to specify tx-mode (or any option not in a config file) in a cleaner way.
        for ds in datasource_list:
            if not isinstance(ds, TranscriptProvider):
                continue

            logger.info(
                "Setting %s %s to tx-mode of %s..." %
                (ds.title, ds.version, tx_mode))
            ds.set_tx_mode(tx_mode)

            if cc_txs_filename is None:
                logger.info("No custom canonical transcripts specified.")
                continue

            if cc_txs is None:
                # The context manager closes the file even if reading fails.  Stripping first
                # keeps the newline out of ids that carry no version suffix.
                with open(cc_txs_filename, 'r') as cc_txs_fp:
                    cc_txs = [
                        line.strip().rsplit(".", 1)[0]
                        for line in cc_txs_fp if line.strip()
                    ]

            # Hand every datasource its own copy of the list.
            ds.set_custom_canonical_txs(list(cc_txs))
            logger.info(
                str(len(cc_txs)) +
                " custom canonical transcripts specified.")

        result = RunSpecification()
        result.initialize(inputCreator,
                          outputRenderer,
                          manualAnnotations=globalAnnotations,
                          datasources=datasource_list,
                          isMulticore=isMulticore,
                          numCores=numCores,
                          defaultAnnotations=defaultAnnotations,
                          cacheUrl=cacheUrl,
                          read_only_cache=read_only_cache,
                          is_skip_no_alts=is_skip_no_alts,
                          annotating_type=annotating_type)
        return result
 def testMulticoreNoDatasources(self):
     """Multicore creation on a db dir without datasources must return (not hang) with an empty result."""
     empty_db_dir = 'testdata/maflite/'
     multiDS = DatasourceFactory.createDatasources(empty_db_dir, "hg19", True)
     num_found = len(multiDS)
     self.assertTrue(num_found == 0, "Length of multiDS when there were no datasources was not zero.")