Example #1
0
 def __init__(self):
     """
     Create a new, unconfigured instance of Annotator.

     The input creator, output renderer and datasources all start out empty.  In order to specify
     them, use the set and addDatasource methods.
     """
     self.logger = logging.getLogger(__name__)

     # Collaborators and configuration that the caller supplies later.
     self._inputCreator = None
     self._outputRenderer = None
     self._datasources = []
     self._manualAnnotations = {}
     self._defaultAnnotations = {}
     self._isMulticore = None
     self._numCores = None
     self._is_skip_no_alts = False

     # Start with a cache manager initialized without a cache location (None).
     cache_manager = CacheManager()
     cache_manager.initialize(None, "not_used")
     self._cacheManager = cache_manager
     self._cache_stats = {"miss": 0, "hit": 0}

     # No annotating type chosen yet, so the lookup falls back to _annotate_mut.
     self._annotating_type = None
     self._annotate_func_ptr = Annotator.ANNOTATING_FUNC_DICT.get(
         self._annotating_type, _annotate_mut)
Example #2
0
 def initialize_cache_manager(self, runSpec):
     """Create the cache manager described by runSpec, or disable caching.

     When runSpec.get_cache_url() is None or an empty string, self._cacheManager is set to None.
     Otherwise a new CacheManager is initialized with the cache URL, the key returned by
     self.create_db_dir_key() and the read-only flag from runSpec.get_is_read_only_cache().

     :param runSpec: object providing get_cache_url() and get_is_read_only_cache()
     """
     cache_url = runSpec.get_cache_url()
     if cache_url is None or cache_url == "":
         # Do not bother calculating the db_dir_key if the cache is not being used.
         self._cacheManager = None
         return

     db_dir_key = self.create_db_dir_key()
     self._cacheManager = CacheManager()
     self._cacheManager.initialize(cache_url, db_dir_key, is_read_only=runSpec.get_is_read_only_cache())
Example #3
0
 def test_cached_annots_dummy_cache(self):
     """Test dummy cache.  Also, tests a simple store and retrieve, which should be None."""
     cm = CacheManager()
     fake_db_dir_key = "blah"
     # None is passed in place of a cache URL -- presumably what selects the dummy cache.
     cm.initialize(None, fake_db_dir_key, is_read_only=False)

     m = MutationDataFactory.default_create()
     m.createAnnotation("blah1", "val1", annotationSource="INPUT")
     m.createAnnotation("blah2", "val5", annotationSource="some_datasource")
     cm.store_annotations_in_cache(m)

     # assertIsNone reports the unexpected value on failure, unlike assertTrue(x is None).
     self.assertIsNone(cm.retrieve_cached_annotations(m))
Example #4
0
 def test_cached_annots_dummy_cache(self):
     """Test dummy cache.  Also, tests a simple store and retrieve, which should be None."""
     cm = CacheManager()
     fake_db_dir_key = "blah"
     # None is passed in place of a cache URL -- presumably what selects the dummy cache.
     cm.initialize(None, fake_db_dir_key, is_read_only=False)

     m = MutationData()
     m.createAnnotation("blah1", "val1", annotationSource="INPUT")
     m.createAnnotation("blah2", "val5", annotationSource="some_datasource")
     cm.store_annotations_in_cache(m)

     # assertIsNone reports the unexpected value on failure, unlike assertTrue(x is None).
     self.assertIsNone(cm.retrieve_cached_annotations(m))
Example #5
0
    def test_cached_annots(self):
        """Test to make sure that we are not storing annotations that should not be cached.  Also, tests a simple store and retrieve."""
        cache_file = "out/shove.managertest.annots.cache"
        cm = CacheManager()
        fake_db_dir_key = "blah"
        cm.initialize("file://" + cache_file,
                      fake_db_dir_key,
                      is_read_only=False)
        # Release the file-backed cache even when an assertion below fails.
        self.addCleanup(cm.close_cache)

        m = MutationDataFactory.default_create()
        m.createAnnotation("blah1", "val1", annotationSource="INPUT")
        m.createAnnotation("blah2", "val5", annotationSource="some_datasource")
        cm.store_annotations_in_cache(m)
        annots = cm.retrieve_cached_annotations(m)

        # Only the "some_datasource" annotation is expected back; the "INPUT" one must not be cached.
        # assertEqual shows both operands on failure, unlike assertTrue(a == b).
        self.assertEqual(len(annots.keys()), 1)
        self.assertEqual(annots["blah2"].getValue(), "val5")
Example #6
0
    def test_cached_annots(self):
        """Test to make sure that we are not storing annotations that should not be cached.  Also, tests a simple store and retrieve."""
        cache_file = "out/shove.managertest.annots.cache"
        cm = CacheManager()
        fake_db_dir_key = "blah"
        cm.initialize("file://" + cache_file, fake_db_dir_key, is_read_only=False)
        # Release the file-backed cache even when an assertion below fails.
        self.addCleanup(cm.close_cache)

        m = MutationData()
        m.createAnnotation("blah1", "val1", annotationSource="INPUT")
        m.createAnnotation("blah2", "val5", annotationSource="some_datasource")
        cm.store_annotations_in_cache(m)
        annots = cm.retrieve_cached_annotations(m)

        # Only the "some_datasource" annotation is expected back; the "INPUT" one must not be cached.
        # assertEqual shows both operands on failure, unlike assertTrue(a == b).
        self.assertEqual(len(annots.keys()), 1)
        self.assertEqual(annots["blah2"].getValue(), "val5")
Example #7
0
 def __init__(self):
     """
     Create a new, unconfigured instance of Annotator.

     The input creator, output renderer and datasources all start out empty.  In order to specify
     them, use the set and addDatasource methods.
     """
     self.logger = logging.getLogger(__name__)

     # Collaborators and configuration that the caller supplies later.
     self._inputCreator = None
     self._outputRenderer = None
     self._datasources = []
     self._manualAnnotations = {}
     self._defaultAnnotations = {}
     self._isMulticore = None
     self._numCores = None
     self._is_skip_no_alts = False

     # Start with a cache manager initialized without a cache location (None).
     cache_manager = CacheManager()
     cache_manager.initialize(None, "not_used")
     self._cacheManager = cache_manager
     self._cache_stats = {"miss": 0, "hit": 0}
Example #8
0
class Annotator(object):
    """
    The Annotator is the entry point to actually perform the annotating of mutations.  The Annotator contains one input creator (IC), one output creator (OC), and a list of datasources.

    This class is responsible for the coordination of the annotating process, not the annotations themselves (this is handled by the datasources).

    For information on how to initialize the input and output creator, please see the documentation of those classes.

    See the RunSpecification class, which allows for more control
        of an annotator.

    Example usage (with RunSpec and no multicore usage):
    # Create a run configuration to pass to the Annotator class.  See OncotatorCLIUtils.getSupportedOutputFormats()
    #   and OncotatorCLIUtils.getSupportedInputFormats() for allowed inputFormat and outputFormat strings.
    manualOverrides = {'fake_annotation':'picard', 'fake_annotation2':'worf'}
    runConfig = OncotatorCLIUtils.createRunConfig(inputFormat, outputFormat, inputFilename, outputFilename, globalAnnotations=manualOverrides, datasourceDir="/home/onco/dbs", isMulticore=False)

    annotator = Annotator()
    annotator.initialize(runConfig)
    annotator.annotate()

    Example usage (used in testing, without RunSpec):
        # Assumed myIC and myOC have been initialized as the proper Input and Output Creators, respectively.
        # 1) Initialize the Annotator
        annotator = Annotator()
        annotator.setInputCreator(myIC)
        annotator.setOutputRenderer(myOC)
        # 1a) For each datasource (instance of a datasource class), add it to the annotator.
        for datasource in myDataSources:
            annotator.addDatasource(datasource)
        # 2)  Produce the output
        filePointer = annotator.annotate()

    NOTE:  While multicore information is passed into the Annotator, currently, nothing is implemented that uses multicore.

    NOTE: If we ever change attributes on a TranscriptProvider in the middle of an annotation run, we will need to re-generate the md5
    """

    # Annotating type -> function applied as func(mutation, datasource).
    ANNOTATING_FUNC_DICT = {
        RunSpecification.ANNOTATE_MUTATIONS: _annotate_mut,
        RunSpecification.ANNOTATE_SEGMENTS: _annotate_seg
    }
    # Annotating type -> datasource base class that supports that type.
    ANNOTATING_DS_DICT = {
        RunSpecification.ANNOTATE_MUTATIONS: Datasource,
        RunSpecification.ANNOTATE_SEGMENTS: SegmentDatasource
    }

    def __init__(self):
        """
        Create a new instance of Annotator.

        In order to specify the input and output creators and datasources, use the set and addDatasource methods
        (or call initialize with a RunSpecification).
        """
        self._inputCreator = None
        self._outputRenderer = None
        self._datasources = []
        self.logger = logging.getLogger(__name__)
        self._manualAnnotations = dict()
        self._defaultAnnotations = dict()
        self._isMulticore = None
        self._numCores = None
        # Start with a cache manager initialized without a cache location (None).
        self._cacheManager = CacheManager()
        self._cacheManager.initialize(None, "not_used")
        self._cache_stats = {"miss": 0, "hit": 0}
        self._is_skip_no_alts = False
        self._annotating_type = None
        self._annotate_func_ptr = Annotator.ANNOTATING_FUNC_DICT.get(
            self._annotating_type, _annotate_mut)

    # The accessors below use the single-underscore attributes created in __init__.  The previous
    # double-underscore names were mangled to _Annotator__*, so a getter called before its setter
    # raised AttributeError.
    def getIsMulticore(self):
        """Return the multicore flag (None until it has been set)."""
        return self._isMulticore

    def getNumCores(self):
        """Return the configured number of cores (None until it has been set)."""
        return self._numCores

    def setIsMulticore(self, value):
        """Store the multicore flag."""
        self._isMulticore = value

    def setNumCores(self, value):
        """Store the number of cores."""
        self._numCores = value

    def setInputCreator(self, inputCreator):
        """Store the input creator that annotate() reads mutations, metadata and comments from."""
        self._inputCreator = inputCreator

    def setOutputRenderer(self, outputCreator):
        """Store the output renderer that annotate() uses to render the annotated mutations."""
        self._outputRenderer = outputCreator

    def setManualAnnotations(self, value):
        """Store the manual annotations (name -> value) applied to every mutation with source "MANUAL"."""
        self._manualAnnotations = value

    def setDefaultAnnotations(self, value):
        """Store the default annotations (name -> value) used for missing or empty annotations."""
        self._defaultAnnotations = value

    def set_annotating_type(self, value):
        """Set the annotating type and keep the per-datasource annotating function in sync with it.

        Unknown types (including None) fall back to _annotate_mut.
        """
        self._annotating_type = value
        self._annotate_func_ptr = Annotator.ANNOTATING_FUNC_DICT.get(
            value, _annotate_mut)

    def create_db_dir_key(self):
        """Create the db_dir_key for this annotation configuration.  Requires the datasources.

        :return: md5 hash built from the hashcodes of all registered datasources
        """
        self.logger.info("Generating db-dir key from datasources...")
        hasher = Hasher()
        for ds in self._datasources:
            # Fetch the hashcode once and use it for both the log line and the digest.
            ds_hashcode = ds.get_hashcode()
            self.logger.info(ds.title + " " + ds.version + " md5: " +
                             ds_hashcode)
            hasher.update(ds_hashcode)
        db_dir_key = Hasher.md5_hash(hasher.hexdigest())
        self.logger.info("Final db-dir md5: " + db_dir_key)
        return db_dir_key

    def create_db_dir_key_simple(self):
        """Create a db_dir_key from the header string (without the Oncotator version).  Requires the datasources."""
        return Hasher.md5_hash(self.createHeaderString(False))

    def initialize_cache_manager(self, runSpec):
        """Create the cache manager described by runSpec, or disable caching.

        When runSpec.get_cache_url() is None or empty, self._cacheManager is set to None.

        :param runSpec: object providing get_cache_url() and get_is_read_only_cache()
        """
        cache_url = runSpec.get_cache_url()
        if cache_url is None or cache_url == "":
            # Do not bother calculating the db_dir_key if the cache is not being used.
            self._cacheManager = None
            return

        db_dir_key = self.create_db_dir_key()
        self._cacheManager = CacheManager()
        self._cacheManager.initialize(
            cache_url,
            db_dir_key,
            is_read_only=runSpec.get_is_read_only_cache())

    def initialize(self, run_spec):
        """ Given a RunSpecification instance, initialize self properly.  Do not start annotation.

        :param run_spec: RunSpecification providing creators, annotations, datasources and cache settings
        """
        self.setInputCreator(run_spec.inputCreator)
        self.setOutputRenderer(run_spec.outputRenderer)
        self.setManualAnnotations(run_spec.manualAnnotations)
        self.setDefaultAnnotations(run_spec.defaultAnnotations)
        self._datasources = run_spec.datasources
        self.setIsMulticore(run_spec.get_is_multicore())
        self.setNumCores(run_spec.get_num_cores())
        self._cache_stats = {"miss": 0, "hit": 0}
        self._is_skip_no_alts = run_spec.get_is_skip_no_alts()
        # The db-dir key is derived from self._datasources, so they must already be assigned here.
        self.initialize_cache_manager(run_spec)
        # Also refreshes self._annotate_func_ptr.
        self.set_annotating_type(run_spec.annotating_type)

    def addDatasource(self, datasource):
        """Append a datasource; datasources are applied in the order they were added."""
        self._datasources.append(datasource)

    def _createMetadata(self):
        """Return the input creator's metadata updated with the manual annotations."""
        metadata = self._inputCreator.getMetadata()
        metadata.update(
            self._createManualAnnotationsForMetadata(self._manualAnnotations))
        return metadata

    def _createComments(self):
        """Return the input creator's comments with the header string appended."""
        comments = self._inputCreator.getComments()
        comments.append(self.createHeaderString())
        return comments

    def retrieve_transcript_by_id(self, transcript_id):
        """
        Look up a transcript in the first TranscriptProvider datasource.

        :param transcript_id: id handed to the provider's get_transcript
        :returns: result of get_transcript from the first TranscriptProvider, or None when there is no such
            datasource (or no datasources at all)
        """
        if not self._datasources:
            logging.getLogger(__name__).warning(
                "Attempting to retrieve transcripts, but no datasources are initialized."
            )
            # Nothing to search; also avoids iterating over None.
            return None

        for ds in self._datasources:
            if isinstance(ds, TranscriptProvider):
                return ds.get_transcript(transcript_id)
        return None

    def retrieve_transcripts_by_genes(self, genes):
        """
        Given names of genes, return all transcripts

        Datasources, particularly a TranscriptDatasource should be initialized before calling this method.

        :param list genes:List of str gene names
        :returns list: List of Transcripts
        """
        txs = []
        if not self._datasources:
            logging.getLogger(__name__).warning(
                "Attempting to retrieve transcripts by gene, but no datasources are initialized."
            )
            # Nothing to search; also avoids iterating over None.
            return txs

        for ds in self._datasources:
            if isinstance(ds, TranscriptProvider):
                for gene in genes:
                    txs.extend(ds.retrieve_transcripts_by_gene(gene))
        return txs

    def retrieve_transcripts_by_region(self, chrom, start, end):
        """
        Finds all TranscriptProviders and gets the transcripts in the given genomic region (in genomic coords)

        :rtype : list
        :param chrom:
        :param start:
        :param end:
        """
        txs = []
        for ds in self._datasources:
            if isinstance(ds, TranscriptProvider):
                txs.extend(ds.get_transcripts_by_pos(chrom, start, end))
        return txs

    def annotate_transcript(self, tx):
        """
        Given a transcript, get all transcript annotations on a mutation.

        HACK: Looks only for GenericTranscriptDatasources
        HACK: Not actually annotating a transcript.  Creates a dummy mutation and then only looks to annotate with
            GenericTranscriptDatasources

        :param Transcript tx: transcript to annotate
        :returns MutationData: mutation with annotations generated from the given transcript
        """
        m = MutationData()
        m['transcript_id'] = tx.get_transcript_id()

        for ds in self._datasources:
            if isinstance(ds, GenericTranscriptDatasource):
                m = ds.annotate_mutation(m)

        return m

    def _annotate_genes(self, muts):
        """
        Given a set of mutations (with the gene annotation), annotate with values from relevant datasources.

        :param muts: iterable of MutationData
        :rtype : None
         HACK: "relevant" is simply GenericGeneDatasources (and GenericGeneProteinPositionDatasources)

         mutations are annotated in place.

        """
        for m in muts:
            for ds in self._datasources:
                if isinstance(ds, (GenericGeneDatasource,
                                   GenericGeneProteinPositionDatasource)):
                    m = ds.annotate_mutation(m)

    def annotate_genes_given_txs(self, txs):
        """
        Given a list of Transcripts, create and annotate one dummy mutation per gene.

        :param txs: list of Transcripts
        :returns dict: gene name -> dummy MutationData for that gene
        """
        gene_to_tx_dict = {}
        for tx in txs:
            gene_to_tx_dict.setdefault(tx.get_gene(), []).append(tx)

        muts_dict = {}
        for gene in sorted(gene_to_tx_dict):
            gene_txs = gene_to_tx_dict[gene]
            # Strand, class and contig are taken from the first transcript seen for the gene.
            first_tx = gene_txs[0]
            m = MutationData()
            m.createAnnotation("gene", gene)
            m.createAnnotation(
                "transcripts",
                ",".join(sorted(tx.get_transcript_id() for tx in gene_txs)))
            m.createAnnotation("strand", first_tx.get_strand())
            m.createAnnotation("class", first_tx.get_gene_type())
            # Dummy protein change spanning position 1 to the longest protein among the gene's transcripts.
            endAA = str(max(len(tx.get_protein_seq()) for tx in gene_txs))
            m.createAnnotation("protein_change", "p.DUMMY1_" + endAA)
            m.createAnnotation("chr", first_tx.get_contig())
            muts_dict[gene] = m

        self._annotate_genes(muts_dict.values())
        return muts_dict

    def annotate_mutations(self, mutations):
        """
        Given a list of mutations (or any iterable of mutations), return annotated mutations.

        Datasource annotations are applied first, then default annotations, then manual annotations.
        NOTE: each stage is a generator function, so the result is lazy and work happens on iteration.

        :rtype : generator
        :param mutations: iterator of MutationData
        """
        mutations = self._annotate_mutations_using_datasources(mutations)
        if mutations is None:
            self.logger.warning("Mutation list points to None after annotation.")

        mutations = self._applyDefaultAnnotations(mutations,
                                                  self._defaultAnnotations)
        if mutations is None:
            self.logger.warning(
                "Mutation list points to None after default annotations.")

        mutations = self._applyManualAnnotations(mutations,
                                                 self._manualAnnotations)
        if mutations is None:
            self.logger.warning(
                "Mutation list points to None after manual annotations.")

        return mutations

    def _prune_datasources_by_annotating_type(self):
        """Return the datasources that match the annotation type (segment or mutation).

        Datasources of any other class are dropped and logged at info level.
        """
        # Fall back to the mutation datasource class (not the type constant) so isinstance gets a class.
        datasource_class = Annotator.ANNOTATING_DS_DICT.get(
            self._annotating_type, Datasource)
        pruned_ds = []
        for ds in self._datasources:
            if not isinstance(ds, datasource_class):
                logging.getLogger(__name__).info(
                    "Removing %s, since it does not support annotating %s" %
                    (ds.title, str(self._annotating_type)))
            else:
                pruned_ds.append(ds)
        return pruned_ds

    def annotate(self):
        """
        Annotate the given mutations specified in the input.

        Call this after the input, output, and datasources have been set.

        :return: outputFilename
        """

        if self._annotating_type is None:
            self._annotating_type = RunSpecification.ANNOTATE_MUTATIONS

        self._datasources = self._prune_datasources_by_annotating_type()

        self.logger.info("Annotating with " + str(len(self._datasources)) +
                         " datasources: " + self.createHeaderString())

        mutations = self._inputCreator.createMutations()
        if mutations is None:
            self.logger.warning("Mutation list points to None after creation.")

        mutations = self.annotate_mutations(mutations)

        comments = self._createComments()
        metadata = self._createMetadata()

        filename = self._outputRenderer.renderMutations(mutations,
                                                        metadata=metadata,
                                                        comments=comments)

        if self._cacheManager is not None:
            self.logger.info("Closing cache: (misses: " +
                             str(self._cache_stats['miss']) + "  hits: " +
                             str(self._cache_stats['hit']) + ")")
            self._cacheManager.close_cache()

        return filename

    def _applyManualAnnotations(self, mutations, manualAnnotations):
        """Yield each mutation after setting every manual annotation on it (source "MANUAL")."""
        manualAnnotationKeys = manualAnnotations.keys()

        for m in mutations:
            for k in manualAnnotationKeys:
                # newRequired = False allows this call to overwrite the previous value.
                m.createAnnotation(k,
                                   manualAnnotations[k],
                                   annotationSource="MANUAL",
                                   newRequired=False)
            yield m

    def _applyDefaultAnnotations(self, mutations, defaultAnnotations):
        """Yield each mutation after filling in missing or empty-string annotations with the defaults."""
        defaultAnnotationsKeys = defaultAnnotations.keys()
        for m in mutations:
            mKeys = m.keys()
            for k in defaultAnnotationsKeys:
                if k not in mKeys:
                    m.createAnnotation(k,
                                       defaultAnnotations[k],
                                       annotationSource="DEFAULT")
                # An existing but empty value is also replaced by the default.
                if m[k] == "":
                    m.getAnnotation(k).setDatasource("DEFAULT")
                    m.getAnnotation(k).setValue(defaultAnnotations[k])
            yield m

    def _createManualAnnotationsForMetadata(self, manualAnnotations):
        """Wrap each manual annotation value in an Annotation whose datasource name is "MANUAL"."""
        result = {}
        manualAnnotationKeys = manualAnnotations.keys()
        for k in manualAnnotationKeys:
            result[k] = Annotation(manualAnnotations[k],
                                   datasourceName="MANUAL")
        return result

    def createHeaderString(self, is_giving_oncotator_version=True):
        """
        Create a default header string that lists version of Oncotator and datasource information.

        :param bool is_giving_oncotator_version: whether to prefix the string with the Oncotator version
        :return str: header string with the "|" delimiter
        """
        onco_string = ""
        if is_giving_oncotator_version:
            onco_string = "Oncotator " + VERSION + " |"

        datasourceStrings = []
        for ds in self._datasources:
            tx_mode_str = ""
            if isinstance(ds, TranscriptProvider):
                tx_mode_str = ds.get_tx_mode() + " "
            datasourceStrings.append(" " + ds.title + " " + ds.version + " " +
                                     tx_mode_str)

        return onco_string + "|".join(datasourceStrings)

    def _annotate_mutations_using_datasources(self, mutations):
        """
        Perform the actual annotating of mutations with the datasources.  Make sure to check the cache as well.

        :param MutationData mutations: iterable of MutationData
        :return generator : MutationData generator
        """
        if len(self._datasources) == 0:
            self.logger.warning("THERE ARE NO DATASOURCES REGISTERED")

        is_cache_being_used = (self._cacheManager is not None)

        for m in mutations:

            # If the alt_allele_seen annotation is present and False, skip this mutation
            if self._is_skip_no_alts and m.get("alt_allele_seen",
                                               "True") == "False":
                continue

            cache_annot_dict = None
            if is_cache_being_used:
                cache_annot_dict = self._cacheManager.retrieve_cached_annotations(
                    m)

            # If no cache results were found, annotate normally.
            if cache_annot_dict is None:
                for datasource in self._datasources:

                    # This will evaluate to datasource.annotate_mutation(m) or datasource.annotate_segment(m)
                    m = self._annotate_func_ptr(m, datasource)

                if is_cache_being_used:
                    self._cache_stats['miss'] += 1
                    self._cacheManager.store_annotations_in_cache(m)
            else:
                self._cache_stats['hit'] += 1
                m.addAnnotations(cache_annot_dict)
            yield m
Example #9
0
class Annotator(object):
    """
    The Annotator is the entry point to actually perform the annotating of mutations.  The Annotator contains one input creator (IC), one output creator (OC), and a list of datasources.   
    
    This class is responsible for the coordination of the annotating process, not the annotations themselves (this is handled by the datasources).
    
    For information on how to initialize the input and output creator, please see the documentation of those classes.

    See the RunSpecification class, which allows for more control
        of an annotator.

    Example usage (with RunSpec and no multicore usage):
    # Create a run configuration to pass to the Annotator class.  See OncotatorCLIUtils.getSupportedOutputFormats()
    #   and OncotatorCLIUtils.getSupportedInputFormats() for allowed inputFormat and outputFormat strings.
    manualOverrides = {'fake_annotation':'picard', 'fake_annotation2':'worf'}
    runConfig = OncotatorCLIUtils.createRunConfig(inputFormat, outputFormat, inputFilename, outputFilename, globalAnnotations=manualOverrides, datasourceDir="/home/onco/dbs", isMulticore=False)

    annotator = Annotator()
    annotator.initialize(runConfig)
    annotator.annotate()

    Example usage (used in testing, without RunSpec):
        # Assumed myIC and myOC have been initialized as the proper Input and Output Creators, respectively.
        # 1) Initialize the Annotator
        annotator = Annotator()
        annotator.setInputCreator(myIC)
        annotator.setOutputRenderer(myOC)
        # 1a) For each datasource (instance of a datasource class), add it to the annotator.
        for datasource in myDataSources:
            annotator.addDatasource(datasource)
        # 2)  Produce the output
        filePointer = annotator.annotate()
    
    NOTE:  While multicore information is passed into the Annotator, currently, nothing is implemented that uses multicore.

    NOTE: If we ever change attributes on a TranscriptProvider in the middle of an annotation run, we will need to re-generate the md5
    """

    ANNOTATING_FUNC_DICT = {RunSpecification.ANNOTATE_MUTATIONS: _annotate_mut, RunSpecification.ANNOTATE_SEGMENTS: _annotate_seg}
    ANNOTATING_DS_DICT = {RunSpecification.ANNOTATE_MUTATIONS: Datasource, RunSpecification.ANNOTATE_SEGMENTS: SegmentDatasource}

    def __init__(self):
        """
        Create a new, unconfigured instance of Annotator.

        The input creator, output renderer and datasources all start out empty.  In order to specify
        them, use the set and addDatasource methods.
        """
        self.logger = logging.getLogger(__name__)

        # Collaborators and configuration that the caller supplies later.
        self._inputCreator = None
        self._outputRenderer = None
        self._datasources = []
        self._manualAnnotations = {}
        self._defaultAnnotations = {}
        self._isMulticore = None
        self._numCores = None
        self._is_skip_no_alts = False
        self._is_allow_annotation_overwriting = None

        # Start with a cache manager initialized without a cache location (None).
        cache_manager = CacheManager()
        cache_manager.initialize(None, "not_used")
        self._cacheManager = cache_manager
        self._cache_stats = {"miss": 0, "hit": 0}

        # No annotating type chosen yet, so the lookup falls back to _annotate_mut.
        self._annotating_type = None
        self._annotate_func_ptr = Annotator.ANNOTATING_FUNC_DICT.get(self._annotating_type, _annotate_mut)

    # The four accessors below use the single-underscore attributes that __init__ creates.  The
    # previous double-underscore names were mangled to _Annotator__*, so a getter called before
    # its setter raised AttributeError and the values set in __init__ were never read.
    def getIsMulticore(self):
        """Return the multicore flag (None until it has been set)."""
        return self._isMulticore

    def getNumCores(self):
        """Return the configured number of cores (None until it has been set)."""
        return self._numCores

    def setIsMulticore(self, value):
        """Store the multicore flag."""
        self._isMulticore = value

    def setNumCores(self, value):
        """Store the number of cores."""
        self._numCores = value

    def setInputCreator(self, inputCreator):
        """Store the input creator; _createMetadata and _createComments read metadata and comments from it."""
        self._inputCreator = inputCreator
        
    def setOutputRenderer(self, outputCreator):
        """Store the output renderer (the parameter is named outputCreator, the attribute _outputRenderer)."""
        self._outputRenderer = outputCreator
    
    def setManualAnnotations(self, value):
        """Store the manual annotations; _createMetadata passes them to _createManualAnnotationsForMetadata."""
        self._manualAnnotations = value

    def setDefaultAnnotations(self, value):
        """Store the default annotations (presumably a dict, matching the empty dict set in __init__ -- confirm)."""
        self._defaultAnnotations = value

    def set_annotating_type(self, value):
        """Set the annotating type and keep the per-datasource annotating function in sync with it.

        Previously only initialize() refreshed self._annotate_func_ptr, so changing the type through
        this setter left the function chosen in __init__ in place.

        :param value: key into Annotator.ANNOTATING_FUNC_DICT; unknown values (including None) fall
            back to _annotate_mut
        """
        self._annotating_type = value
        self._annotate_func_ptr = Annotator.ANNOTATING_FUNC_DICT.get(value, _annotate_mut)

    def create_db_dir_key(self):
        """Create the db_dir_key for this annotation configuration.  Requires the datasources.

        Every datasource's hashcode is logged and fed into a Hasher; the key is the md5 hash of the
        resulting hex digest.

        :return: the db-dir key
        """
        self.logger.info("Generating db-dir key from datasources...")
        hasher = Hasher()
        for ds in self._datasources:
            # Fetch the hashcode once and use it for both the log line and the digest.
            ds_hashcode = ds.get_hashcode()
            self.logger.info(ds.title + " " + ds.version + " md5: " + ds_hashcode)
            hasher.update(ds_hashcode)
        db_dir_key = Hasher.md5_hash(hasher.hexdigest())
        self.logger.info("Final db-dir md5: " + db_dir_key)
        return db_dir_key

    def create_db_dir_key_simple(self):
        """Create a db_dir_key by md5-hashing the header string built without the Oncotator version.  Requires the datasources."""
        header_without_version = self.createHeaderString(False)
        return Hasher.md5_hash(header_without_version)

    def initialize_cache_manager(self, runSpec):
        """Create the cache manager described by runSpec, or disable caching.

        When runSpec.get_cache_url() is None or an empty string, self._cacheManager is set to None.
        Otherwise a new CacheManager is initialized with the cache URL, the key returned by
        self.create_db_dir_key() and the read-only flag from runSpec.get_is_read_only_cache().

        :param runSpec: object providing get_cache_url() and get_is_read_only_cache()
        """
        cache_url = runSpec.get_cache_url()
        if cache_url is None or cache_url == "":
            # Do not bother calculating the db_dir_key if the cache is not being used.
            self._cacheManager = None
            return

        db_dir_key = self.create_db_dir_key()
        self._cacheManager = CacheManager()
        self._cacheManager.initialize(cache_url, db_dir_key, is_read_only=runSpec.get_is_read_only_cache())

    def initialize(self, run_spec):
        """ Given a RunSpecification instance, initialize self properly.  Do not start annotation.

        Copies the creators, annotations, datasources, multicore settings and flags from run_spec,
        resets the cache statistics, sets up the cache manager and builds the mutation data factory.

        :param run_spec: RunSpecification to read the configuration from
        """
        self.setInputCreator(run_spec.inputCreator)
        self.setOutputRenderer(run_spec.outputRenderer)
        self.setManualAnnotations(run_spec.manualAnnotations)
        self.setDefaultAnnotations(run_spec.defaultAnnotations)
        self._datasources = run_spec.datasources
        self.setIsMulticore(run_spec.get_is_multicore())
        self.setNumCores(run_spec.get_num_cores())
        self._cache_stats = {"miss": 0, "hit":0}
        self._is_skip_no_alts = run_spec.get_is_skip_no_alts()
        # create_db_dir_key reads self._datasources, so the datasources must already be assigned here.
        self.initialize_cache_manager(run_spec)
        self.set_annotating_type(run_spec.annotating_type)
        # Unknown annotating types (including None) fall back to _annotate_mut.
        self._annotate_func_ptr = Annotator.ANNOTATING_FUNC_DICT.get(self._annotating_type, _annotate_mut)
        self._is_allow_annotation_overwriting = run_spec.is_allow_annotation_overwriting
        self._mutation_data_factory = MutationDataFactory(allow_overwriting=self._is_allow_annotation_overwriting)

    def addDatasource(self, datasource):
        """Append a datasource to the end of this annotator's datasource list."""
        self._datasources.append(datasource)

    def _createMetadata(self):
        """Return the input creator's metadata, updated in place with the manual annotations."""
        metadata = self._inputCreator.getMetadata()
        manual_entries = self._createManualAnnotationsForMetadata(self._manualAnnotations)
        # Manual entries overwrite metadata entries that share a key.
        metadata.update(manual_entries)
        return metadata

    def _createComments(self):
        """Return the input creator's comments with the header string appended as the last entry."""
        header_line = None
        comments = self._inputCreator.getComments()
        header_line = self.createHeaderString()
        comments.append(header_line)
        return comments

    def retrieve_transcript_by_id(self, transcript_id):
        """
        Look up a transcript in the first TranscriptProvider datasource.

        :param transcript_id: id handed to the provider's get_transcript
        :returns: result of get_transcript from the first TranscriptProvider, or None when there is no such
            datasource (or no datasources at all)
        """
        if not self._datasources:
            logging.getLogger(__name__).warning("Attempting to retrieve transcripts, but no datasources are initialized.")
            # Nothing to search; previously a None datasource list crashed in the loop below.
            return None

        for ds in self._datasources:
            if isinstance(ds, TranscriptProvider):
                return ds.get_transcript(transcript_id)
        return None

    def retrieve_transcripts_by_genes(self, genes):
        """
        Given names of genes, return all transcripts

        Datasources, particularly a TranscriptDatasource should be initialized before calling this method.

        :param list genes:List of str gene names
        :returns list: List of Transcripts (empty when no datasources are initialized)
        """
        txs = []
        if not self._datasources:
            logging.getLogger(__name__).warning("Attempting to retrieve transcripts by gene, but no datasources are initialized.")
            # Nothing to search; previously a None datasource list crashed in the loop below.
            return txs

        for ds in self._datasources:
            if isinstance(ds, TranscriptProvider):
                for gene in genes:
                    txs.extend(ds.retrieve_transcripts_by_gene(gene))
        return txs

    def retrieve_transcripts_by_region(self, chrom, start, end):
        """
        Collect the transcripts that every TranscriptProvider datasource reports for a genomic region.

        :rtype : list
        :param chrom: passed through to get_transcripts_by_pos
        :param start: passed through to get_transcripts_by_pos (genomic coords)
        :param end: passed through to get_transcripts_by_pos (genomic coords)
        """
        found = []
        for source in self._datasources:
            # Only transcript providers can answer positional transcript queries.
            if not isinstance(source, TranscriptProvider):
                continue
            found.extend(source.get_transcripts_by_pos(chrom, start, end))
        return found

    def annotate_transcript(self, tx):
        """
        Build a dummy mutation for a transcript and annotate it with the transcript datasources.

        HACK: Looks only for GenericTranscriptDatasources
        HACK: Not actually annotating a transcript.  A dummy mutation carrying the transcript id is
            created and run through each GenericTranscriptDatasource in turn.

        :param Transcript tx: transcript to annotate
        :returns MutationData: mutation with annotations generated from the given transcript
        """
        mut = MutationDataFactory.default_create()
        mut['transcript_id'] = tx.get_transcript_id()

        transcript_sources = (ds for ds in self._datasources
                              if isinstance(ds, GenericTranscriptDatasource))
        for source in transcript_sources:
            mut = source.annotate_mutation(mut)

        return mut

    def _annotate_genes(self, muts):
        """
        Annotate mutations (that already carry the gene annotation) with the gene-level datasources.

        HACK: The gene-level datasources are simply the GenericGeneDatasources and the
            GenericGeneProteinPositionDatasources.

        Each matching datasource is called through annotate_mutation; nothing is returned.

        :param muts: iterable of MutationData
        :rtype : None
        """
        for mut in muts:
            for datasource in self._datasources:
                if not isinstance(datasource, (GenericGeneDatasource, GenericGeneProteinPositionDatasource)):
                    continue
                mut = datasource.annotate_mutation(mut)

    def annotate_genes_given_txs(self, txs):
        """
        Build one dummy, gene-level mutation per gene represented in the given transcripts and annotate it.

        For every gene (processed in sorted order) the dummy mutation receives:
          - "gene", "transcripts" (sorted, comma-separated transcript ids), "strand", "class" and "chr"
            ("strand", "class" and "chr" are taken from the first transcript seen for the gene)
          - "protein_change" of the form p.DUMMY1_<length of the longest protein sequence>
          - one "OUTPUT" annotation per transcript annotation, holding the sorted, "|"-joined set of non-blank
            values across the transcripts of the gene.

        Finally, all dummy mutations are passed to _annotate_genes.

        :param txs: list of Transcripts
        :type txs: list
        :return: dict of gene name to the dummy mutation for that gene
        """
        # Transcript annotations from these sources are not collapsed up to the gene.
        skipped_sources = ["INPUT", "OUTPUT", "Unknown"]

        # Group the transcripts by gene, keeping the input order within each gene.
        txs_by_gene = {}
        for tx in txs:
            txs_by_gene.setdefault(tx.get_gene(), []).append(tx)

        gene_muts = {}
        for gene in sorted(txs_by_gene):
            gene_txs = txs_by_gene[gene]
            first_tx = gene_txs[0]

            gene_mut = MutationDataFactory.default_create()
            gene_mut.createAnnotation("gene", gene)
            tx_ids = sorted([tx.get_transcript_id() for tx in gene_txs])
            gene_mut.createAnnotation("transcripts", ",".join(tx_ids))
            gene_mut.createAnnotation("strand", first_tx.get_strand())
            gene_mut.createAnnotation("class", first_tx.get_gene_type())
            longest_protein = max([len(tx.get_protein_seq()) for tx in gene_txs])
            gene_mut.createAnnotation("protein_change", "p.DUMMY1_" + str(longest_protein))
            gene_mut.createAnnotation("chr", first_tx.get_contig())

            # Annotate each transcript, then gather the values seen for every eligible annotation name.
            tx_muts = [self.annotate_transcript(tx) for tx in gene_txs]
            collapsed = defaultdict(set)
            for tx_mut in tx_muts:
                for name in tx_mut.keys():
                    if tx_mut.getAnnotation(name).getDatasource() in skipped_sources:
                        continue
                    collapsed[name].add(tx_mut[name])

            # One gene-level annotation per collapsed name; blank values are dropped before joining.
            for name, values in collapsed.items():
                non_blank = values - set([""])
                gene_mut.createAnnotation(name, "|".join(sorted(non_blank)), annotationSource="OUTPUT")

            gene_muts[gene] = gene_mut

        self._annotate_genes(gene_muts.values())
        return gene_muts


    def annotate_mutations(self, mutations):
        """
        Given a list of mutations (or any iterable of mutations), return the annotated mutations.

        The mutations are passed, in order, through:
          1) _annotate_mutations_using_datasources
          2) _applyDefaultAnnotations (using self._defaultAnnotations)
          3) _applyManualAnnotations (using self._manualAnnotations)

        A warning is logged whenever one of these stages hands back None.

        :rtype : whatever _applyManualAnnotations returns (an iterable of MutationData)
        :param mutations: iterator of MutationData
        """
        # Logger.warn is a deprecated alias; Logger.warning is used throughout.
        mutations = self._annotate_mutations_using_datasources(mutations)
        if mutations is None:
            self.logger.warning("Mutation list points to None after annotation.")

        mutations = self._applyDefaultAnnotations(mutations, self._defaultAnnotations)
        if mutations is None:
            self.logger.warning("Mutation list points to None after default annotations.")

        mutations = self._applyManualAnnotations(mutations, self._manualAnnotations)
        if mutations is None:
            self.logger.warning("Mutation list points to None after manual annotations.")

        return mutations

    def _prune_datasources_by_annotating_type(self):
        """
        Build the list of datasources that support the current annotating type (segment or mutation).

        self._datasources is not modified; the caller is expected to store the returned list.  Every datasource
        that is left out is reported at info level.

        :return list: datasources that are instances of the class registered for self._annotating_type
        """
        # NOTE(review): the fallback passed to .get() is RunSpecification.ANNOTATE_MUTATIONS, which looks like an
        #  annotating-type key rather than a datasource class -- confirm that isinstance can accept it.
        required_class = Annotator.ANNOTATING_DS_DICT.get(self._annotating_type, RunSpecification.ANNOTATE_MUTATIONS)

        kept = []
        for candidate in self._datasources:
            if isinstance(candidate, required_class):
                kept.append(candidate)
                continue
            logging.getLogger(__name__).info(
                "Removing %s, since it does not support annotating %s" % (candidate.title, str(self._annotating_type)))
        return kept

    def annotate(self):
        """
        Annotate the given mutations specified in the input.

        Call this after the input, output, and datasources have been set.

        Steps:
          1) Default the annotating type to RunSpecification.ANNOTATE_MUTATIONS when it has not been set.
          2) Replace self._datasources with the datasources that support the annotating type.
          3) Create the mutations from the input creator and annotate them.
          4) Render the mutations, along with the comments and metadata, using the output renderer.
          5) Close the cache, if a cache manager is present.

        :return: outputFilename (the value returned by the output renderer)
        """

        if self._annotating_type is None:
            self._annotating_type = RunSpecification.ANNOTATE_MUTATIONS

        self._datasources = self._prune_datasources_by_annotating_type()

        self.logger.info("Annotating with " + str(len(self._datasources)) + " datasources: " + self.createHeaderString())

        mutations = self._inputCreator.createMutations()
        if mutations is None:
            # Logger.warn is a deprecated alias of Logger.warning.
            self.logger.warning("Mutation list points to None after creation.")

        mutations = self.annotate_mutations(mutations)

        comments = self._createComments()
        metadata = self._createMetadata()

        filename = self._outputRenderer.renderMutations(mutations, metadata=metadata, comments=comments)

        if self._cacheManager is not None:
            self.logger.info("Closing cache: (misses: " + str(self._cache_stats['miss']) + "  hits: " + str(self._cache_stats['hit']) + ")")
            self._cacheManager.close_cache()

        return filename
    
    def _applyManualAnnotations(self, mutations, manualAnnotations):
        """
        Generator that stamps every manual annotation onto each mutation before yielding it.

        :param mutations: iterable of MutationData
        :param dict manualAnnotations: annotation name to the value to apply
        :return generator : the same mutations, each carrying the manual annotations (source "MANUAL")
        """
        for mut in mutations:
            for name, value in manualAnnotations.items():
                # newRequired = False allows this call to overwrite the previous value.
                mut.createAnnotation(name, value, annotationSource="MANUAL", newRequired=False)
            yield mut

    def _applyDefaultAnnotations(self, mutations, defaultAnnotations):
        """
        Generator that fills in default annotations on each mutation before yielding it.

        A default is applied when the annotation is missing from the mutation or when its current value is the
        empty string.  In both cases the annotation ends up with the datasource "DEFAULT".

        :param mutations: iterable of MutationData
        :param dict defaultAnnotations: annotation name to default value
        :return generator : the same mutations, with defaults filled in
        """
        for mut in mutations:
            present = mut.keys()
            for name, fallback in defaultAnnotations.items():
                if name not in present:
                    mut.createAnnotation(name, fallback, annotationSource="DEFAULT")
                if mut[name] == "":
                    # Existing, but blank: take over the annotation with the default value.
                    blank = mut.getAnnotation(name)
                    blank.setDatasource("DEFAULT")
                    blank.setValue(fallback)
            yield mut

    def _createManualAnnotationsForMetadata(self, manualAnnotations):
        """
        Wrap each manual annotation value in an Annotation whose datasource name is "MANUAL".

        :param dict manualAnnotations: annotation name to value
        :return dict: annotation name to Annotation
        """
        return dict((name, Annotation(value, datasourceName="MANUAL"))
                    for name, value in manualAnnotations.items())

    def createHeaderString(self, is_giving_oncotator_version=True):
        """
        Create a default header string that lists version of Oncotator and datasource information.

        Each datasource contributes " <title> <version> ", followed by its transcript mode and a space when the
        datasource is a TranscriptProvider.

        :param bool is_giving_oncotator_version: whether to start the string with "Oncotator <VERSION> |"
        :return str: header string with the "|" delimiter
        """
        entries = []
        for datasource in self._datasources:
            if isinstance(datasource, TranscriptProvider):
                tx_mode = datasource.get_tx_mode() + " "
            else:
                tx_mode = ""
            entries.append(" " + datasource.title + " " + datasource.version + " " + tx_mode)

        prefix = ("Oncotator " + VERSION + " |") if is_giving_oncotator_version else ""
        return prefix + "|".join(entries)
    
    def _annotate_mutations_using_datasources(self, mutations):
        """
        Perform the actual annotating of mutations with the datasources.  Make sure to check the cache as well.

        This is a generator: nothing (including the "no datasources" warning) happens until it is iterated.

        For each mutation:
          - it is skipped (not yielded) when self._is_skip_no_alts is set and its alt_allele_seen value is "False"
          - on a cache hit, the cached annotations are added to the mutation and the datasources are not called
          - otherwise, every datasource is applied through self._annotate_func_ptr and, when a cache manager is
            present, the result is stored in the cache.

        self._cache_stats is updated with the number of hits and misses.

        :param MutationData mutations: iterable of MutationData
        :return generator : MutationData generator
        """
        if len(self._datasources) == 0:
            # Logger.warn is a deprecated alias of Logger.warning.
            self.logger.warning("THERE ARE NO DATASOURCES REGISTERED")

        is_cache_being_used = (self._cacheManager is not None)

        for m in mutations:

            # If the alt_allele_seen annotation is present and False, skip this mutation
            if self._is_skip_no_alts and m.get("alt_allele_seen", "True") == "False":
                continue

            cache_annot_dict = None
            if is_cache_being_used:
                cache_annot_dict = self._cacheManager.retrieve_cached_annotations(m)

            # If no cache results were found, annotate normally.
            if cache_annot_dict is None:
                for datasource in self._datasources:

                    # This will evaluate to datasource.annotate_mutation(m) or datasource.annotate_segment(m)
                    m = self._annotate_func_ptr(m, datasource)

                # A miss is only counted (and stored) when there is a cache to miss.
                if is_cache_being_used:
                    self._cache_stats['miss'] += 1
                    self._cacheManager.store_annotations_in_cache(m)
            else:
                self._cache_stats['hit'] += 1
                m.addAnnotations(cache_annot_dict)
            yield m
Example #10
0
class Annotator(object):
    """
    The Annotator is the entry point to actually perform the annotating of mutations.  The Annotator contains one input creator (IC), one output creator (OC), and a list of datasources.   
    
    This class is responsible for the coordination of the annotating process, not the annotations themselves (this is handled by the datasources).
    
    For information on how to initialize the input and output creator, please see the documentation of those classes.

    See the RunSpecification class, which allows for more control
        of an annotator.

    Example usage (with RunSpec and no multicore usage):
    # Create a run configuration to pass to the Annotator class.  See OncotatorCLIUtils.getSupportedOutputFormats()
    #   and OncotatorCLIUtils.getSupportedInputFormats() for allowed inputFormat and outputFormat strings.
    manualOverrides = {'fake_annotation':'picard', 'fake_annotation2':'worf'}
    runConfig = OncotatorCLIUtils.createRunConfig(inputFormat, outputFormat, inputFilename, outputFilename, globalAnnotations=manualOverrides, datasourceDir="/home/onco/dbs", isMulticore=False)

    annotator = Annotator()
    annotator.initialize(runConfig)
    annotator.annotate()

    Example usage (used in testing, without RunSpec):
        # Assumed myIC and myOC have been initialized as the proper Input and Output Creators, respectively.
        # 1) Initialize the Annotator
        annotator = Annotator()
        annotator.setInputCreator(myIC)
        annotator.setOutputRenderer(myOC)
        # 1a) For each datasource (instance of a datasource class), add it to the annotator.
        for datasource in myDataSources:
            annotator.addDatasource(datasource)
        # 2)  Produce the output
        filePointer = annotator.annotate()
    
    NOTE:  While multicore information is passed into the Annotator, currently, nothing is implemented that uses multicore.    
    """

    def __init__(self):
        """
        Create a new instance of Annotator.

        In order to specify the input and output creators and datasources, use the set and addDatasource methods
        (or call initialize with a RunSpecification).
        """
        self._inputCreator = None
        self._outputRenderer = None
        self._datasources = []
        self.logger = logging.getLogger(__name__)
        self._manualAnnotations = dict()
        self._defaultAnnotations = dict()
        self._isMulticore = None
        self._numCores = None
        # Start out with a cache manager that has no cache url.  initialize_cache_manager replaces it.
        self._cacheManager = CacheManager()
        self._cacheManager.initialize(None, "not_used")
        self._cache_stats = {"miss": 0, "hit":0}
        self._is_skip_no_alts = False

    # The multicore accessors use the same single-underscore attributes that __init__ creates.  Double-underscore
    #  names would be name-mangled into different attributes that __init__ never sets.
    def getIsMulticore(self):
        """Return the multicore flag (None until it has been set)."""
        return self._isMulticore

    def getNumCores(self):
        """Return the number of cores (None until it has been set)."""
        return self._numCores

    def setIsMulticore(self, value):
        """Set the multicore flag."""
        self._isMulticore = value

    def setNumCores(self, value):
        """Set the number of cores."""
        self._numCores = value

    def setInputCreator(self, inputCreator):
        """Set the input creator that provides the mutations, metadata, and comments."""
        self._inputCreator = inputCreator
        
    def setOutputRenderer(self, outputCreator):
        """Set the output renderer that renders the annotated mutations."""
        self._outputRenderer = outputCreator
    
    def setManualAnnotations(self, value):
        """Set the dict of manual annotations (annotation name to value)."""
        self._manualAnnotations = value

    def setDefaultAnnotations(self, value):
        """Set the dict of default annotations (annotation name to value)."""
        self._defaultAnnotations = value

    def create_db_dir_key(self):
        """Create the db_dir_key for this annotation configuration.  Requires the datasources."""
        self.logger.info("Generating db-dir key from datasources...")
        hasher = Hasher()
        for ds in self._datasources:
            self.logger.info(ds.title + " " + ds.version + " md5: " + ds.get_hashcode())
            hasher.update(ds.get_hashcode())
        db_dir_key = Hasher.md5_hash(hasher.hexdigest())
        self.logger.info("Final db-dir md5: " + db_dir_key)
        return db_dir_key

    def create_db_dir_key_simple(self):
        """Create the db_dir_key from the header string (without the Oncotator version).  Requires the datasources."""
        db_dir_key = Hasher.md5_hash(self.createHeaderString(False))
        return db_dir_key

    def initialize_cache_manager(self, runSpec):
        """Do not bother calculating the db_dir_key if the cache is not being used.

        When the run specification has no cache url (None or empty), the cache manager is set to None.
        """
        cache_url = runSpec.get_cache_url()
        if cache_url is not None and cache_url != "":
            db_dir_key = self.create_db_dir_key()
            self._cacheManager = CacheManager()
            self._cacheManager.initialize(cache_url, db_dir_key, is_read_only=runSpec.get_is_read_only_cache())
        else:
            self._cacheManager = None

    def initialize(self,runSpec):
        """ Given a RunSpecification instance, initialize self properly.  Do not start annotation.
        """
        self.setInputCreator(runSpec.inputCreator)
        self.setOutputRenderer(runSpec.outputRenderer)
        self.setManualAnnotations(runSpec.manualAnnotations)
        self.setDefaultAnnotations(runSpec.defaultAnnotations)
        self._datasources = runSpec.datasources
        self.setIsMulticore(runSpec.get_is_multicore())
        self.setNumCores(runSpec.get_num_cores())
        self._cache_stats = {"miss": 0, "hit":0}
        self._is_skip_no_alts = runSpec.get_is_skip_no_alts()
        # The datasources must be in place before this call, since the db_dir_key is derived from them.
        self.initialize_cache_manager(runSpec)

    def addDatasource(self, datasource):
        """Append a datasource.  Datasources are applied in the order in which they were added."""
        self._datasources.append(datasource)

    def _createMetadata(self):
        """Return the input creator metadata, updated with the manual annotations."""
        metadata = self._inputCreator.getMetadata()
        metadata.update(self._createManualAnnotationsForMetadata(self._manualAnnotations))
        return metadata

    def _createComments(self):
        """Return the input creator comments with the header string appended."""
        comments = self._inputCreator.getComments()
        comments.append(self.createHeaderString())
        return comments

    def annotate_mutations(self, mutations):
        """
        Pass the mutations through the datasources, then the default annotations, then the manual annotations.

        :param mutations: iterable of MutationData
        :return: generator of annotated MutationData
        """
        mutations = self._annotate_mutations_using_datasources(mutations)
        if mutations is None:
            self.logger.warning("Mutation list points to None after annotation.")

        mutations = self._applyDefaultAnnotations(mutations, self._defaultAnnotations)
        if mutations is None:
            self.logger.warning("Mutation list points to None after default annotations.")

        mutations = self._applyManualAnnotations(mutations, self._manualAnnotations)
        if mutations is None:
            self.logger.warning("Mutation list points to None after manual annotations.")

        return mutations

    def annotate(self):
        """
        Annotate the given mutations specified in the input.

        Call this after the input, output, and datasources have been set.

        :return: outputFilename
        """
        self.logger.info("Annotating with " + str(len(self._datasources)) + " datasources: " + self.createHeaderString())
        
        mutations = self._inputCreator.createMutations()
        if mutations is None: 
            self.logger.warning("Mutation list points to None after creation.")

        mutations = self.annotate_mutations(mutations)

        comments = self._createComments()
        metadata = self._createMetadata()

        filename = self._outputRenderer.renderMutations(mutations, metadata=metadata, comments=comments)

        if self._cacheManager is not None:
            self.logger.info("Closing cache: (misses: " + str(self._cache_stats['miss']) + "  hits: " + str(self._cache_stats['hit']) + ")")
            self._cacheManager.close_cache()

        return filename
    
    def _applyManualAnnotations(self, mutations, manualAnnotations):
        """Generator that applies every manual annotation (source "MANUAL") to each mutation."""
        manualAnnotationKeys = manualAnnotations.keys()

        for m in mutations:
            for k in manualAnnotationKeys:
                # newRequired = False allows this call to overwrite the previous value.
                m.createAnnotation(k, manualAnnotations[k], annotationSource="MANUAL", newRequired=False)
            yield m

    def _applyDefaultAnnotations(self, mutations, defaultAnnotations):
        """Generator that fills in each default annotation that is missing or blank ("") on a mutation."""
        defaultAnnotationsKeys = defaultAnnotations.keys()
        for m in mutations:
            mKeys = m.keys()
            for k in defaultAnnotationsKeys:
                if k not in mKeys:
                    m.createAnnotation(k, defaultAnnotations[k], annotationSource="DEFAULT")
                if m[k] == "":
                    m.getAnnotation(k).setDatasource("DEFAULT")
                    m.getAnnotation(k).setValue(defaultAnnotations[k])
            yield m

    def _createManualAnnotationsForMetadata(self, manualAnnotations):
        """Return a dict of annotation name to Annotation (datasource name "MANUAL") for the manual annotations."""
        result = {}
        manualAnnotationKeys = manualAnnotations.keys()
        for k in manualAnnotationKeys:
            result[k] = Annotation(manualAnnotations[k], datasourceName="MANUAL")
        return result

    def createHeaderString(self, is_giving_oncotator_version=True):
        """
        Create a default header string that lists version of Oncotator and datasource information.

        :param bool is_giving_oncotator_version: whether to start the string with "Oncotator <VERSION> |"
        :return: string with the "|" delimiter
        """
        onco_string = ""
        if is_giving_oncotator_version:
            onco_string = "Oncotator " +  VERSION + " |"

        datasourceStrings = []
        for ds in self._datasources:
            datasourceStrings.append(" " + ds.title + " " + ds.version + " ")
        
        return onco_string + "|".join(datasourceStrings)
    
    def _annotate_mutations_using_datasources(self, mutations):
        """
        Perform the actual annotating of mutations with the datasources, consulting the cache when there is one.

        :param mutations: iterable of MutationData
        :return: generator of MutationData
        """
        if len(self._datasources) == 0:
            self.logger.warning("THERE ARE NO DATASOURCES REGISTERED")
        for m in mutations:

            # If the alt_allele_seen annotation is present and False, skip this mutation
            if self._is_skip_no_alts and m.get("alt_allele_seen", "True") == "False":
                continue

            annot_dict = None
            if self._cacheManager is not None:
                annot_dict = self._cacheManager.retrieve_cached_annotations(m)

            # If no cache results were found, annotate normally.
            if annot_dict is None:
                for datasource in self._datasources:
                    m = datasource.annotate_mutation(m)

                if self._cacheManager is not None:
                    self._cache_stats['miss'] += 1
                    self._cacheManager.store_annotations_in_cache(m)
            else:
                self._cache_stats['hit'] += 1
                m.addAnnotations(annot_dict)
            yield m