def __init__(self):
    """
    Create a new, unconfigured instance of Annotator.

    No input creator, output renderer or datasources are attached yet; use the
    set* methods and addDatasource (or initialize) to configure the instance.
    """
    self.logger = logging.getLogger(__name__)

    # Collaborators, supplied later.
    self._inputCreator = None
    self._outputRenderer = None
    self._datasources = []

    # Annotations applied on top of the datasource results.
    self._manualAnnotations = {}
    self._defaultAnnotations = {}

    # Multicore settings are stored but start out unset.
    self._isMulticore = None
    self._numCores = None

    # Start with a cache manager initialized without a cache URL.
    cache_manager = CacheManager()
    cache_manager.initialize(None, "not_used")
    self._cacheManager = cache_manager
    self._cache_stats = {"miss": 0, "hit": 0}

    self._is_skip_no_alts = False

    # With no annotating type chosen, the lookup falls back to _annotate_mut.
    self._annotating_type = None
    self._annotate_func_ptr = Annotator.ANNOTATING_FUNC_DICT.get(
        self._annotating_type, _annotate_mut)
def initialize_cache_manager(self, runSpec):
    """
    Configure self._cacheManager from the given run specification.

    If the run specification has no cache URL (None or empty string), caching is
    disabled by setting self._cacheManager to None.  The db_dir_key is not
    calculated in that case, since it would never be used.

    Otherwise a new CacheManager is created and initialized with the cache URL,
    the db_dir_key of this annotator, and the read-only flag of the run spec.

    :param runSpec: object providing get_cache_url() and get_is_read_only_cache()
    """
    cache_url = runSpec.get_cache_url()
    if cache_url is None or cache_url == "":
        # Cache is not being used: do not bother calculating the db_dir_key.
        self._cacheManager = None
        return

    db_dir_key = self.create_db_dir_key()
    self._cacheManager = CacheManager()
    self._cacheManager.initialize(cache_url, db_dir_key,
                                  is_read_only=runSpec.get_is_read_only_cache())
def test_cached_annots_dummy_cache(self):
    """Test dummy cache.  Also, tests a simple store and retrieve, which should be None."""
    cm = CacheManager()
    fake_db_dir_key = "blah"
    # Initializing with no cache URL gives the dummy cache under test.
    cm.initialize(None, fake_db_dir_key, is_read_only=False)

    m = MutationDataFactory.default_create()
    m.createAnnotation("blah1", "val1", annotationSource="INPUT")
    m.createAnnotation("blah2", "val5", annotationSource="some_datasource")
    cm.store_annotations_in_cache(m)

    annots = cm.retrieve_cached_annotations(m)

    # assertIsNone reports the unexpected value on failure, unlike assertTrue(x is None).
    self.assertIsNone(annots)
def test_cached_annots_dummy_cache(self):
    """Test dummy cache.  Also, tests a simple store and retrieve, which should be None."""
    cm = CacheManager()
    fake_db_dir_key = "blah"
    # Initializing with no cache URL gives the dummy cache under test.
    cm.initialize(None, fake_db_dir_key, is_read_only=False)

    m = MutationData()
    m.createAnnotation("blah1", "val1", annotationSource="INPUT")
    m.createAnnotation("blah2", "val5", annotationSource="some_datasource")
    cm.store_annotations_in_cache(m)

    annots = cm.retrieve_cached_annotations(m)

    # assertIsNone reports the unexpected value on failure, unlike assertTrue(x is None).
    self.assertIsNone(annots)
def test_cached_annots(self):
    """Test to make sure that we are not storing annotations that should not be cached.

    Also, tests a simple store and retrieve.
    """
    cache_file = "out/shove.managertest.annots.cache"
    cm = CacheManager()
    fake_db_dir_key = "blah"
    cm.initialize("file://" + cache_file, fake_db_dir_key, is_read_only=False)

    m = MutationDataFactory.default_create()
    m.createAnnotation("blah1", "val1", annotationSource="INPUT")
    m.createAnnotation("blah2", "val5", annotationSource="some_datasource")
    cm.store_annotations_in_cache(m)

    annots = cm.retrieve_cached_annotations(m)

    # Only the datasource annotation is expected back; the INPUT one must not be cached.
    # assertEqual shows both values on failure, unlike assertTrue(a == b).
    self.assertEqual(len(annots.keys()), 1)
    self.assertEqual(annots["blah2"].getValue(), "val5")
def test_cached_annots(self):
    """Test to make sure that we are not storing annotations that should not be cached.

    Also, tests a simple store and retrieve.
    """
    cache_file = "out/shove.managertest.annots.cache"
    cm = CacheManager()
    fake_db_dir_key = "blah"
    cm.initialize("file://" + cache_file, fake_db_dir_key, is_read_only=False)

    m = MutationData()
    m.createAnnotation("blah1", "val1", annotationSource="INPUT")
    m.createAnnotation("blah2", "val5", annotationSource="some_datasource")
    cm.store_annotations_in_cache(m)

    annots = cm.retrieve_cached_annotations(m)

    # Only the datasource annotation is expected back; the INPUT one must not be cached.
    # assertEqual shows both values on failure, unlike assertTrue(a == b).
    self.assertEqual(len(annots.keys()), 1)
    self.assertEqual(annots["blah2"].getValue(), "val5")
def __init__(self):
    """
    Create a new, unconfigured instance of Annotator.

    No input creator, output renderer or datasources are attached yet; use the
    set* methods and addDatasource to configure the instance.
    """
    self.logger = logging.getLogger(__name__)

    # Collaborators, supplied later.
    self._inputCreator = None
    self._outputRenderer = None
    self._datasources = []

    # Annotations applied on top of the datasource results.
    self._manualAnnotations = {}
    self._defaultAnnotations = {}

    # Multicore settings are stored but start out unset.
    self._isMulticore = None
    self._numCores = None

    # Start with a cache manager initialized without a cache URL.
    cache_manager = CacheManager()
    cache_manager.initialize(None, "not_used")
    self._cacheManager = cache_manager
    self._cache_stats = {"miss": 0, "hit": 0}

    self._is_skip_no_alts = False
class Annotator(object):
    """
    The Annotator is the entry point to actually perform the annotating of mutations.

    The Annotator contains one input creator (IC), one output creator (OC), and a list of datasources.
    This class is responsible for the coordination of the annotating process, not the annotations themselves
    (this is handled by the datasources).

    For information on how to initialize the input and output creator, please see the documentation of
    those classes.

    See the RunSpecification class, which allows for more control of an annotator.

    Example usage (with RunSpec and no multicore usage):
        # Create a run configuration to pass to the Annotator class.  See OncotatorCLIUtils.getSupportedOutputFormats()
        # and OncotatorCLIUtils.getSupportedInputFormats() for allowed inputFormat and outputFormat strings.
        manualOverrides = {'fake_annotation':'picard', 'fake_annotation2':'worf'}
        runConfig = OncotatorCLIUtils.createRunConfig(inputFormat, outputFormat, inputFilename, outputFilename,
                                                      globalAnnotations=manualOverrides,
                                                      datasourceDir="/home/onco/dbs", isMulticore=False)
        annotator = Annotator()
        annotator.initialize(runConfig)
        annotator.annotate()

    Example usage (used in testing, without RunSpec):
        # Assumed myIC and myOC have been initialized as the proper Input and Output Creators, respectively.
        # 1) Initialize the Annotator
        annotator = Annotator()
        annotator.setInputCreator(myIC)
        annotator.setOutputRenderer(myOC)
        # 1a) For each datasource (instance of a datasource class), add it to the annotator.
        for datasource in myDataSources:
            annotator.addDatasource(datasource)
        # 2) Produce the output
        filename = annotator.annotate()

    NOTE: While multicore information is passed into the Annotator, currently, nothing is implemented that
    uses multicore.

    NOTE: If we ever change attributes on a TranscriptProvider in the middle of an annotation run, we will need
    to re-generate the md5
    """

    # Function used to annotate one mutation/segment with one datasource, keyed by annotating type.
    ANNOTATING_FUNC_DICT = {
        RunSpecification.ANNOTATE_MUTATIONS: _annotate_mut,
        RunSpecification.ANNOTATE_SEGMENTS: _annotate_seg,
    }

    # Datasource base class that a datasource must be an instance of, keyed by annotating type.
    ANNOTATING_DS_DICT = {
        RunSpecification.ANNOTATE_MUTATIONS: Datasource,
        RunSpecification.ANNOTATE_SEGMENTS: SegmentDatasource,
    }

    def __init__(self):
        """
        Create a new instance of Annotator.

        In order to specify the input and output creators and datasources, use the set and
        addDatasource methods (or initialize with a RunSpecification).
        """
        self._inputCreator = None
        self._outputRenderer = None
        self._datasources = []
        self.logger = logging.getLogger(__name__)
        self._manualAnnotations = dict()
        self._defaultAnnotations = dict()
        self._isMulticore = None
        self._numCores = None
        # Default cache manager is initialized without a cache URL.
        self._cacheManager = CacheManager()
        self._cacheManager.initialize(None, "not_used")
        self._cache_stats = {"miss": 0, "hit": 0}
        self._is_skip_no_alts = False
        self._annotating_type = None
        self._annotate_func_ptr = Annotator.ANNOTATING_FUNC_DICT.get(self._annotating_type, _annotate_mut)

    # The multicore accessors use the same single-underscore attributes that __init__ creates.
    # (Double-underscore names are mangled to _Annotator__x, so the getters used to raise
    # AttributeError when called before the matching setter.)
    def getIsMulticore(self):
        """Return the stored multicore flag (None until set)."""
        return self._isMulticore

    def getNumCores(self):
        """Return the stored number of cores (None until set)."""
        return self._numCores

    def setIsMulticore(self, value):
        """Store the multicore flag."""
        self._isMulticore = value

    def setNumCores(self, value):
        """Store the number of cores."""
        self._numCores = value

    def setInputCreator(self, inputCreator):
        """Set the input creator that will produce the mutations to annotate."""
        self._inputCreator = inputCreator

    def setOutputRenderer(self, outputCreator):
        """Set the output renderer that will render the annotated mutations."""
        self._outputRenderer = outputCreator

    def setManualAnnotations(self, value):
        """Set the dict of manual annotations (annotation name -> value)."""
        self._manualAnnotations = value

    def setDefaultAnnotations(self, value):
        """Set the dict of default annotations (annotation name -> value)."""
        self._defaultAnnotations = value

    def set_annotating_type(self, value):
        """Set the annotating type (a key of ANNOTATING_FUNC_DICT / ANNOTATING_DS_DICT)."""
        self._annotating_type = value

    def create_db_dir_key(self):
        """Create the db_dir_key for this annotation configuration.  Requires the datasources.

        The key is an md5 derived from the hashcodes of all datasources, in order.

        :return: db_dir_key
        """
        self.logger.info("Generating db-dir key from datasources...")
        hasher = Hasher()
        for ds in self._datasources:
            ds_hashcode = ds.get_hashcode()
            self.logger.info(ds.title + " " + ds.version + " md5: " + ds_hashcode)
            hasher.update(ds_hashcode)
        db_dir_key = Hasher.md5_hash(hasher.hexdigest())
        self.logger.info("Final db-dir md5: " + db_dir_key)
        return db_dir_key

    def create_db_dir_key_simple(self):
        """Create the db_dir_key for this annotation configuration from the header string
        (without the Oncotator version).  Requires the datasources.

        :return: db_dir_key
        """
        return Hasher.md5_hash(self.createHeaderString(False))

    def initialize_cache_manager(self, runSpec):
        """Configure self._cacheManager from the given run specification.

        Do not bother calculating the db_dir_key if the cache is not being used: when the cache URL
        is None or empty, self._cacheManager is simply set to None.

        :param runSpec: object providing get_cache_url() and get_is_read_only_cache()
        """
        cache_url = runSpec.get_cache_url()
        if cache_url is None or cache_url == "":
            self._cacheManager = None
            return

        db_dir_key = self.create_db_dir_key()
        self._cacheManager = CacheManager()
        self._cacheManager.initialize(cache_url, db_dir_key,
                                      is_read_only=runSpec.get_is_read_only_cache())

    def initialize(self, run_spec):
        """
        Given a RunSpecification instance, initialize self properly.  Do not start annotation.

        :param run_spec: RunSpecification instance
        """
        self.setInputCreator(run_spec.inputCreator)
        self.setOutputRenderer(run_spec.outputRenderer)
        self.setManualAnnotations(run_spec.manualAnnotations)
        self.setDefaultAnnotations(run_spec.defaultAnnotations)
        self._datasources = run_spec.datasources
        self.setIsMulticore(run_spec.get_is_multicore())
        self.setNumCores(run_spec.get_num_cores())
        self._cache_stats = {"miss": 0, "hit": 0}
        self._is_skip_no_alts = run_spec.get_is_skip_no_alts()
        # The datasources must already be set, since the cache key is derived from them.
        self.initialize_cache_manager(run_spec)
        self.set_annotating_type(run_spec.annotating_type)
        self._annotate_func_ptr = Annotator.ANNOTATING_FUNC_DICT.get(self._annotating_type, _annotate_mut)

    def addDatasource(self, datasource):
        """Append a datasource to the list of datasources."""
        self._datasources.append(datasource)

    def _createMetadata(self):
        """Return the input creator's metadata, updated with the manual annotations."""
        metadata = self._inputCreator.getMetadata()
        metadata.update(self._createManualAnnotationsForMetadata(self._manualAnnotations))
        return metadata

    def _createComments(self):
        """Return the input creator's comments with the header string appended."""
        comments = self._inputCreator.getComments()
        comments.append(self.createHeaderString())
        return comments

    def retrieve_transcript_by_id(self, transcript_id):
        """
        Look up a transcript by id using the first TranscriptProvider datasource.

        :param transcript_id: id passed to the provider's get_transcript
        :return: result of get_transcript on the first TranscriptProvider, or None if there is
            no TranscriptProvider (or no datasources at all)
        """
        if not self._datasources:
            logging.getLogger(__name__).warning(
                "Attempting to retrieve transcripts, but no datasources are initialized.")
            # Nothing to search (also avoids iterating over None).
            return None

        for ds in self._datasources:
            if isinstance(ds, TranscriptProvider):
                return ds.get_transcript(transcript_id)
        return None

    def retrieve_transcripts_by_genes(self, genes):
        """
        Given names of genes, return all transcripts.

        Datasources, particularly a TranscriptDatasource, should be initialized before calling this method.

        :param list genes: List of str gene names
        :returns list: List of Transcripts
        """
        if not self._datasources:
            logging.getLogger(__name__).warning(
                "Attempting to retrieve transcripts by gene, but no datasources are initialized.")
            # Nothing to search (also avoids iterating over None).
            return []

        txs = []
        for ds in self._datasources:
            if isinstance(ds, TranscriptProvider):
                for gene in genes:
                    txs.extend(ds.retrieve_transcripts_by_gene(gene))
        return txs

    def retrieve_transcripts_by_region(self, chrom, start, end):
        """
        Finds all TranscriptProviders and gets the transcripts in the given genomic region (in genomic coords).

        :rtype : list
        :param chrom:
        :param start:
        :param end:
        """
        txs = []
        for ds in self._datasources:
            if isinstance(ds, TranscriptProvider):
                txs.extend(ds.get_transcripts_by_pos(chrom, start, end))
        return txs

    def annotate_transcript(self, tx):
        """
        Given a transcript, get all transcript annotations on a mutation.

        HACK: Looks only for GenericTranscriptDatasources
        HACK: Not actually annotating a transcript.  Creates a dummy mutation and then only looks to annotate
            with GenericTranscriptDatasources

        :param Transcript tx: transcript to annotate
        :returns MutationData: mutation with annotations generated from the given transcript
        """
        m = MutationData()
        m['transcript_id'] = tx.get_transcript_id()
        for ds in self._datasources:
            if isinstance(ds, GenericTranscriptDatasource):
                m = ds.annotate_mutation(m)
        return m

    def _annotate_genes(self, muts):
        """
        Given a set of mutations (with the gene annotation), annotate with values from relevant datasources.

        HACK: "relevant" is simply GenericGeneDatasources and GenericGeneProteinPositionDatasources.

        Mutations are annotated in place.

        :param muts: iterable of MutationData
        :rtype : None
        """
        gene_datasource_classes = (GenericGeneDatasource, GenericGeneProteinPositionDatasource)
        for m in muts:
            for ds in self._datasources:
                if isinstance(ds, gene_datasource_classes):
                    m = ds.annotate_mutation(m)

    def annotate_genes_given_txs(self, txs):
        """
        Given a list of Transcripts, create and annotate dummy mutations that represent only the gene.

        :param txs: list of Transcripts
        :return: dict of gene name -> dummy MutationData for that gene
        """
        # Group the transcripts by gene.
        gene_to_tx_dict = {}
        for tx in txs:
            gene_to_tx_dict.setdefault(tx.get_gene(), []).append(tx)

        muts_dict = {}
        for gene in sorted(gene_to_tx_dict):
            gene_txs = gene_to_tx_dict[gene]
            # Strand, class and contig are taken from the first transcript of the gene.
            first_tx = gene_txs[0]

            m = MutationData()
            m.createAnnotation("gene", gene)
            m.createAnnotation("transcripts", ",".join(sorted([tx.get_transcript_id() for tx in gene_txs])))
            m.createAnnotation("strand", first_tx.get_strand())
            m.createAnnotation("class", first_tx.get_gene_type())
            # Dummy protein change runs to the end of the longest protein sequence of the gene.
            endAA = str(max([len(tx.get_protein_seq()) for tx in gene_txs]))
            m.createAnnotation("protein_change", "p.DUMMY1_" + endAA)
            m.createAnnotation("chr", first_tx.get_contig())
            muts_dict[gene] = m

        self._annotate_genes(muts_dict.values())
        return muts_dict

    def annotate_mutations(self, mutations):
        """
        Given a list of mutations (or any iterable of mutations), return the annotated mutations.

        The mutations are passed through the datasources, then the default annotations, then the
        manual annotations.  Each step is a generator, so the work happens as the result is iterated.

        :rtype : generator
        :param mutations: iterator of MutationData
        """
        mutations = self._annotate_mutations_using_datasources(mutations)
        if mutations is None:
            self.logger.warning("Mutation list points to None after annotation.")
        mutations = self._applyDefaultAnnotations(mutations, self._defaultAnnotations)
        if mutations is None:
            self.logger.warning("Mutation list points to None after default annotations.")
        mutations = self._applyManualAnnotations(mutations, self._manualAnnotations)
        if mutations is None:
            self.logger.warning("Mutation list points to None after manual annotations.")
        return mutations

    def _prune_datasources_by_annotating_type(self):
        """
        Return the datasources that match the annotation type (segment or mutation).

        Datasources that are not instances of the class registered for the current annotating type
        are logged and left out.  self._datasources itself is not modified.

        :return: list of datasources supporting the current annotating type
        """
        # Fall back to Datasource for an unregistered type, mirroring the _annotate_mut fallback
        # used for the annotating function.  (isinstance needs a class, not the type constant.)
        datasource_class = Annotator.ANNOTATING_DS_DICT.get(self._annotating_type, Datasource)
        pruned_ds = []
        for ds in self._datasources:
            if not isinstance(ds, datasource_class):
                logging.getLogger(__name__).info(
                    "Removing %s, since it does not support annotating %s" % (ds.title, str(self._annotating_type)))
            else:
                pruned_ds.append(ds)
        return pruned_ds

    def annotate(self):
        """
        Annotate the given mutations specified in the input.

        Call this after the input, output, and datasources have been set.

        :return: outputFilename
        """
        if self._annotating_type is None:
            self._annotating_type = RunSpecification.ANNOTATE_MUTATIONS

        self._datasources = self._prune_datasources_by_annotating_type()

        self.logger.info("Annotating with " + str(len(self._datasources)) + " datasources: " +
                         self.createHeaderString())

        mutations = self._inputCreator.createMutations()
        if mutations is None:
            self.logger.warning("Mutation list points to None after creation.")

        mutations = self.annotate_mutations(mutations)

        comments = self._createComments()
        metadata = self._createMetadata()

        filename = self._outputRenderer.renderMutations(mutations, metadata=metadata, comments=comments)

        if self._cacheManager is not None:
            self.logger.info("Closing cache: (misses: " + str(self._cache_stats['miss']) + "  hits: " +
                             str(self._cache_stats['hit']) + ")")
            self._cacheManager.close_cache()

        return filename

    def _applyManualAnnotations(self, mutations, manualAnnotations):
        """
        Generator that sets every manual annotation on each mutation, overwriting existing values.

        :param mutations: iterable of MutationData
        :param manualAnnotations: dict of annotation name -> value
        """
        manualAnnotationKeys = manualAnnotations.keys()
        for m in mutations:
            for k in manualAnnotationKeys:
                # newRequired = False allows this call to overwrite the previous value.
                m.createAnnotation(k, manualAnnotations[k], annotationSource="MANUAL", newRequired=False)
            yield m

    def _applyDefaultAnnotations(self, mutations, defaultAnnotations):
        """
        Generator that fills in default annotations on each mutation.

        A default is applied when the annotation is missing from the mutation or its value is the
        empty string.

        :param mutations: iterable of MutationData
        :param defaultAnnotations: dict of annotation name -> default value
        """
        defaultAnnotationsKeys = defaultAnnotations.keys()
        for m in mutations:
            mKeys = m.keys()
            for k in defaultAnnotationsKeys:
                if k not in mKeys:
                    m.createAnnotation(k, defaultAnnotations[k], annotationSource="DEFAULT")
                if m[k] == "":
                    m.getAnnotation(k).setDatasource("DEFAULT")
                    m.getAnnotation(k).setValue(defaultAnnotations[k])
            yield m

    def _createManualAnnotationsForMetadata(self, manualAnnotations):
        """
        Wrap each manual annotation value in an Annotation with datasource name "MANUAL".

        :param manualAnnotations: dict of annotation name -> value
        :return: dict of annotation name -> Annotation
        """
        result = {}
        for k in manualAnnotations.keys():
            result[k] = Annotation(manualAnnotations[k], datasourceName="MANUAL")
        return result

    def createHeaderString(self, is_giving_oncotator_version=True):
        """
        Create a default header string that lists version of Oncotator and datasource information.

        :param is_giving_oncotator_version: whether to prefix the string with the Oncotator version
        :return str: header string with the "|" delimiter
        """
        onco_string = ""
        if is_giving_oncotator_version:
            onco_string = "Oncotator " + VERSION + " |"

        datasourceStrings = []
        for ds in self._datasources:
            # Only transcript providers contribute a transcript mode to their entry.
            tx_mode_str = ""
            if isinstance(ds, TranscriptProvider):
                tx_mode_str = ds.get_tx_mode() + " "
            datasourceStrings.append(" " + ds.title + " " + ds.version + " " + tx_mode_str)

        return onco_string + "|".join(datasourceStrings)

    def _annotate_mutations_using_datasources(self, mutations):
        """
        Perform the actual annotating of mutations with the datasources.  Make sure to check the cache as well.

        :param MutationData mutations: iterable of MutationData
        :return generator : MutationData generator
        """
        if len(self._datasources) == 0:
            self.logger.warning("THERE ARE NO DATASOURCES REGISTERED")

        is_cache_being_used = (self._cacheManager is not None)

        for m in mutations:
            # If the alt_allele_seen annotation is present and False, skip this mutation
            if self._is_skip_no_alts and m.get("alt_allele_seen", "True") == "False":
                continue

            cache_annot_dict = None
            if is_cache_being_used:
                cache_annot_dict = self._cacheManager.retrieve_cached_annotations(m)

            # If no cache results were found, annotate normally.
            if cache_annot_dict is None:
                for datasource in self._datasources:
                    # This will evaluate to datasource.annotate_mutation(m) or datasource.annotate_segment(m)
                    m = self._annotate_func_ptr(m, datasource)

                if is_cache_being_used:
                    self._cache_stats['miss'] += 1
                    self._cacheManager.store_annotations_in_cache(m)
            else:
                self._cache_stats['hit'] += 1
                m.addAnnotations(cache_annot_dict)
            yield m
class Annotator(object):
    """
    The Annotator is the entry point to actually perform the annotating of mutations.

    The Annotator contains one input creator (IC), one output creator (OC), and a list of datasources.
    This class is responsible for the coordination of the annotating process, not the annotations themselves
    (this is handled by the datasources).

    For information on how to initialize the input and output creator, please see the documentation of
    those classes.

    See the RunSpecification class, which allows for more control of an annotator.

    Example usage (with RunSpec and no multicore usage):
        # Create a run configuration to pass to the Annotator class.  See OncotatorCLIUtils.getSupportedOutputFormats()
        # and OncotatorCLIUtils.getSupportedInputFormats() for allowed inputFormat and outputFormat strings.
        manualOverrides = {'fake_annotation':'picard', 'fake_annotation2':'worf'}
        runConfig = OncotatorCLIUtils.createRunConfig(inputFormat, outputFormat, inputFilename, outputFilename,
                                                      globalAnnotations=manualOverrides,
                                                      datasourceDir="/home/onco/dbs", isMulticore=False)
        annotator = Annotator()
        annotator.initialize(runConfig)
        annotator.annotate()

    Example usage (used in testing, without RunSpec):
        # Assumed myIC and myOC have been initialized as the proper Input and Output Creators, respectively.
        # 1) Initialize the Annotator
        annotator = Annotator()
        annotator.setInputCreator(myIC)
        annotator.setOutputRenderer(myOC)
        # 1a) For each datasource (instance of a datasource class), add it to the annotator.
        for datasource in myDataSources:
            annotator.addDatasource(datasource)
        # 2) Produce the output
        filename = annotator.annotate()

    NOTE: While multicore information is passed into the Annotator, currently, nothing is implemented that
    uses multicore.

    NOTE: If we ever change attributes on a TranscriptProvider in the middle of an annotation run, we will need
    to re-generate the md5
    """

    # Function used to annotate one mutation/segment with one datasource, keyed by annotating type.
    ANNOTATING_FUNC_DICT = {
        RunSpecification.ANNOTATE_MUTATIONS: _annotate_mut,
        RunSpecification.ANNOTATE_SEGMENTS: _annotate_seg,
    }

    # Datasource base class that a datasource must be an instance of, keyed by annotating type.
    ANNOTATING_DS_DICT = {
        RunSpecification.ANNOTATE_MUTATIONS: Datasource,
        RunSpecification.ANNOTATE_SEGMENTS: SegmentDatasource,
    }

    def __init__(self):
        """
        Create a new instance of Annotator.

        In order to specify the input and output creators and datasources, use the set and
        addDatasource methods (or initialize with a RunSpecification).
        """
        self._inputCreator = None
        self._outputRenderer = None
        self._datasources = []
        self.logger = logging.getLogger(__name__)
        self._manualAnnotations = dict()
        self._defaultAnnotations = dict()
        self._isMulticore = None
        self._numCores = None
        # Default cache manager is initialized without a cache URL.
        self._cacheManager = CacheManager()
        self._cacheManager.initialize(None, "not_used")
        self._cache_stats = {"miss": 0, "hit": 0}
        self._is_skip_no_alts = False
        self._annotating_type = None
        self._annotate_func_ptr = Annotator.ANNOTATING_FUNC_DICT.get(self._annotating_type, _annotate_mut)
        self._is_allow_annotation_overwriting = None

    # The multicore accessors use the same single-underscore attributes that __init__ creates.
    # (Double-underscore names are mangled to _Annotator__x, so the getters used to raise
    # AttributeError when called before the matching setter.)
    def getIsMulticore(self):
        """Return the stored multicore flag (None until set)."""
        return self._isMulticore

    def getNumCores(self):
        """Return the stored number of cores (None until set)."""
        return self._numCores

    def setIsMulticore(self, value):
        """Store the multicore flag."""
        self._isMulticore = value

    def setNumCores(self, value):
        """Store the number of cores."""
        self._numCores = value

    def setInputCreator(self, inputCreator):
        """Set the input creator that will produce the mutations to annotate."""
        self._inputCreator = inputCreator

    def setOutputRenderer(self, outputCreator):
        """Set the output renderer that will render the annotated mutations."""
        self._outputRenderer = outputCreator

    def setManualAnnotations(self, value):
        """Set the dict of manual annotations (annotation name -> value)."""
        self._manualAnnotations = value

    def setDefaultAnnotations(self, value):
        """Set the dict of default annotations (annotation name -> value)."""
        self._defaultAnnotations = value

    def set_annotating_type(self, value):
        """Set the annotating type (a key of ANNOTATING_FUNC_DICT / ANNOTATING_DS_DICT)."""
        self._annotating_type = value

    def create_db_dir_key(self):
        """Create the db_dir_key for this annotation configuration.  Requires the datasources.

        The key is an md5 derived from the hashcodes of all datasources, in order.

        :return: db_dir_key
        """
        self.logger.info("Generating db-dir key from datasources...")
        hasher = Hasher()
        for ds in self._datasources:
            ds_hashcode = ds.get_hashcode()
            self.logger.info(ds.title + " " + ds.version + " md5: " + ds_hashcode)
            hasher.update(ds_hashcode)
        db_dir_key = Hasher.md5_hash(hasher.hexdigest())
        self.logger.info("Final db-dir md5: " + db_dir_key)
        return db_dir_key

    def create_db_dir_key_simple(self):
        """Create the db_dir_key for this annotation configuration from the header string
        (without the Oncotator version).  Requires the datasources.

        :return: db_dir_key
        """
        return Hasher.md5_hash(self.createHeaderString(False))

    def initialize_cache_manager(self, runSpec):
        """Configure self._cacheManager from the given run specification.

        Do not bother calculating the db_dir_key if the cache is not being used: when the cache URL
        is None or empty, self._cacheManager is simply set to None.

        :param runSpec: object providing get_cache_url() and get_is_read_only_cache()
        """
        cache_url = runSpec.get_cache_url()
        if cache_url is None or cache_url == "":
            self._cacheManager = None
            return

        db_dir_key = self.create_db_dir_key()
        self._cacheManager = CacheManager()
        self._cacheManager.initialize(cache_url, db_dir_key,
                                      is_read_only=runSpec.get_is_read_only_cache())

    def initialize(self, run_spec):
        """
        Given a RunSpecification instance, initialize self properly.  Do not start annotation.

        :param run_spec: RunSpecification instance
        """
        self.setInputCreator(run_spec.inputCreator)
        self.setOutputRenderer(run_spec.outputRenderer)
        self.setManualAnnotations(run_spec.manualAnnotations)
        self.setDefaultAnnotations(run_spec.defaultAnnotations)
        self._datasources = run_spec.datasources
        self.setIsMulticore(run_spec.get_is_multicore())
        self.setNumCores(run_spec.get_num_cores())
        self._cache_stats = {"miss": 0, "hit": 0}
        self._is_skip_no_alts = run_spec.get_is_skip_no_alts()
        # The datasources must already be set, since the cache key is derived from them.
        self.initialize_cache_manager(run_spec)
        self.set_annotating_type(run_spec.annotating_type)
        self._annotate_func_ptr = Annotator.ANNOTATING_FUNC_DICT.get(self._annotating_type, _annotate_mut)
        self._is_allow_annotation_overwriting = run_spec.is_allow_annotation_overwriting
        # NOTE: _mutation_data_factory is only created here, not in __init__.
        self._mutation_data_factory = MutationDataFactory(allow_overwriting=self._is_allow_annotation_overwriting)

    def addDatasource(self, datasource):
        """Append a datasource to the list of datasources."""
        self._datasources.append(datasource)

    def _createMetadata(self):
        """Return the input creator's metadata, updated with the manual annotations."""
        metadata = self._inputCreator.getMetadata()
        metadata.update(self._createManualAnnotationsForMetadata(self._manualAnnotations))
        return metadata

    def _createComments(self):
        """Return the input creator's comments with the header string appended."""
        comments = self._inputCreator.getComments()
        comments.append(self.createHeaderString())
        return comments

    def retrieve_transcript_by_id(self, transcript_id):
        """
        Look up a transcript by id using the first TranscriptProvider datasource.

        :param transcript_id: id passed to the provider's get_transcript
        :return: result of get_transcript on the first TranscriptProvider, or None if there is
            no TranscriptProvider (or no datasources at all)
        """
        if not self._datasources:
            logging.getLogger(__name__).warning(
                "Attempting to retrieve transcripts, but no datasources are initialized.")
            # Nothing to search (also avoids iterating over None).
            return None

        for ds in self._datasources:
            if isinstance(ds, TranscriptProvider):
                return ds.get_transcript(transcript_id)
        return None

    def retrieve_transcripts_by_genes(self, genes):
        """
        Given names of genes, return all transcripts.

        Datasources, particularly a TranscriptDatasource, should be initialized before calling this method.

        :param list genes: List of str gene names
        :returns list: List of Transcripts
        """
        if not self._datasources:
            logging.getLogger(__name__).warning(
                "Attempting to retrieve transcripts by gene, but no datasources are initialized.")
            # Nothing to search (also avoids iterating over None).
            return []

        txs = []
        for ds in self._datasources:
            if isinstance(ds, TranscriptProvider):
                for gene in genes:
                    txs.extend(ds.retrieve_transcripts_by_gene(gene))
        return txs

    def retrieve_transcripts_by_region(self, chrom, start, end):
        """
        Finds all TranscriptProviders and gets the transcripts in the given genomic region (in genomic coords).

        :rtype : list
        :param chrom:
        :param start:
        :param end:
        """
        txs = []
        for ds in self._datasources:
            if isinstance(ds, TranscriptProvider):
                txs.extend(ds.get_transcripts_by_pos(chrom, start, end))
        return txs

    def annotate_transcript(self, tx):
        """
        Given a transcript, get all transcript annotations on a mutation.

        HACK: Looks only for GenericTranscriptDatasources
        HACK: Not actually annotating a transcript.  Creates a dummy mutation and then only looks to annotate
            with GenericTranscriptDatasources

        :param Transcript tx: transcript to annotate
        :returns MutationData: mutation with annotations generated from the given transcript
        """
        m = MutationDataFactory.default_create()
        m['transcript_id'] = tx.get_transcript_id()
        for ds in self._datasources:
            if isinstance(ds, GenericTranscriptDatasource):
                m = ds.annotate_mutation(m)
        return m

    def _annotate_genes(self, muts):
        """
        Given a set of mutations (with the gene annotation), annotate with values from relevant datasources.

        HACK: "relevant" is simply GenericGeneDatasources and GenericGeneProteinPositionDatasources.

        Mutations are annotated in place.

        :param muts: iterable of MutationData
        :rtype : None
        """
        gene_datasource_classes = (GenericGeneDatasource, GenericGeneProteinPositionDatasource)
        for m in muts:
            for ds in self._datasources:
                if isinstance(ds, gene_datasource_classes):
                    m = ds.annotate_mutation(m)

    def annotate_genes_given_txs(self, txs):
        """
        Given a list of Transcripts, create and annotate dummy mutations that represent only the gene.

        :param txs: list of Transcripts
        :type txs: list
        :return: dict of gene name -> dummy MutationData for that gene
        """
        # Group the transcripts by gene.
        gene_to_tx_dict = {}
        for tx in txs:
            gene_to_tx_dict.setdefault(tx.get_gene(), []).append(tx)

        # Only consider annotations that are not INPUT and the datasource is known.
        invalid_annotation_sources = ["INPUT", "OUTPUT", "Unknown"]

        muts_dict = {}
        for gene in sorted(gene_to_tx_dict):
            gene_txs = gene_to_tx_dict[gene]
            # Strand, class and contig are taken from the first transcript of the gene.
            first_tx = gene_txs[0]

            m = MutationDataFactory.default_create()
            m.createAnnotation("gene", gene)
            m.createAnnotation("transcripts", ",".join(sorted([tx.get_transcript_id() for tx in gene_txs])))
            m.createAnnotation("strand", first_tx.get_strand())
            m.createAnnotation("class", first_tx.get_gene_type())
            # Dummy protein change runs to the end of the longest protein sequence of the gene.
            endAA = str(max([len(tx.get_protein_seq()) for tx in gene_txs]))
            m.createAnnotation("protein_change", "p.DUMMY1_" + endAA)
            m.createAnnotation("chr", first_tx.get_contig())

            # Annotate each transcript and collapse the relevant transcript annotations for each gene.
            tx_muts_uncollapsed = [self.annotate_transcript(tx) for tx in gene_txs]

            # For every annotation on the dummy transcripts, collect the set of values seen.
            annotation_vals_collapsed = defaultdict(set)
            for tx_mut in tx_muts_uncollapsed:
                for annotation_name in tx_mut.keys():
                    if tx_mut.getAnnotation(annotation_name).getDatasource() not in invalid_annotation_sources:
                        annotation_vals_collapsed[annotation_name].add(tx_mut[annotation_name])

            # Create a new annotation that encompasses the transcript data for the gene.
            for new_annotation in annotation_vals_collapsed.keys():
                # Remove blank values from the set
                annotation_val_collapsed_set = annotation_vals_collapsed[new_annotation] - set([""])
                str_val = "|".join(sorted(list(annotation_val_collapsed_set)))
                m.createAnnotation(new_annotation, str_val, annotationSource="OUTPUT")

            muts_dict[gene] = m

        self._annotate_genes(muts_dict.values())
        return muts_dict

    def annotate_mutations(self, mutations):
        """
        Given a list of mutations (or any iterable of mutations), return the annotated mutations.

        The mutations are passed through the datasources, then the default annotations, then the
        manual annotations.  Each step is a generator, so the work happens as the result is iterated.

        :rtype : generator
        :param mutations: iterator of MutationData
        """
        mutations = self._annotate_mutations_using_datasources(mutations)
        if mutations is None:
            self.logger.warning("Mutation list points to None after annotation.")
        mutations = self._applyDefaultAnnotations(mutations, self._defaultAnnotations)
        if mutations is None:
            self.logger.warning("Mutation list points to None after default annotations.")
        mutations = self._applyManualAnnotations(mutations, self._manualAnnotations)
        if mutations is None:
            self.logger.warning("Mutation list points to None after manual annotations.")
        return mutations

    def _prune_datasources_by_annotating_type(self):
        """
        Return the datasources that match the annotation type (segment or mutation).

        Datasources that are not instances of the class registered for the current annotating type
        are logged and left out.  self._datasources itself is not modified.

        :return: list of datasources supporting the current annotating type
        """
        # Fall back to Datasource for an unregistered type, mirroring the _annotate_mut fallback
        # used for the annotating function.  (isinstance needs a class, not the type constant.)
        datasource_class = Annotator.ANNOTATING_DS_DICT.get(self._annotating_type, Datasource)
        pruned_ds = []
        for ds in self._datasources:
            if not isinstance(ds, datasource_class):
                logging.getLogger(__name__).info(
                    "Removing %s, since it does not support annotating %s" % (ds.title, str(self._annotating_type)))
            else:
                pruned_ds.append(ds)
        return pruned_ds

    def annotate(self):
        """
        Annotate the given mutations specified in the input.

        Call this after the input, output, and datasources have been set.

        :return: outputFilename
        """
        if self._annotating_type is None:
            self._annotating_type = RunSpecification.ANNOTATE_MUTATIONS

        self._datasources = self._prune_datasources_by_annotating_type()

        self.logger.info("Annotating with " + str(len(self._datasources)) + " datasources: " +
                         self.createHeaderString())

        mutations = self._inputCreator.createMutations()
        if mutations is None:
            self.logger.warning("Mutation list points to None after creation.")

        mutations = self.annotate_mutations(mutations)

        comments = self._createComments()
        metadata = self._createMetadata()

        filename = self._outputRenderer.renderMutations(mutations, metadata=metadata, comments=comments)

        if self._cacheManager is not None:
            self.logger.info("Closing cache: (misses: " + str(self._cache_stats['miss']) + "  hits: " +
                             str(self._cache_stats['hit']) + ")")
            self._cacheManager.close_cache()

        return filename

    def _applyManualAnnotations(self, mutations, manualAnnotations):
        """
        Generator that sets every manual annotation on each mutation, overwriting existing values.

        :param mutations: iterable of MutationData
        :param manualAnnotations: dict of annotation name -> value
        """
        manualAnnotationKeys = manualAnnotations.keys()
        for m in mutations:
            for k in manualAnnotationKeys:
                # newRequired = False allows this call to overwrite the previous value.
                m.createAnnotation(k, manualAnnotations[k], annotationSource="MANUAL", newRequired=False)
            yield m

    def _applyDefaultAnnotations(self, mutations, defaultAnnotations):
        """
        Generator that fills in default annotations on each mutation.

        A default is applied when the annotation is missing from the mutation or its value is the
        empty string.

        :param mutations: iterable of MutationData
        :param defaultAnnotations: dict of annotation name -> default value
        """
        defaultAnnotationsKeys = defaultAnnotations.keys()
        for m in mutations:
            mKeys = m.keys()
            for k in defaultAnnotationsKeys:
                if k not in mKeys:
                    m.createAnnotation(k, defaultAnnotations[k], annotationSource="DEFAULT")
                if m[k] == "":
                    m.getAnnotation(k).setDatasource("DEFAULT")
                    m.getAnnotation(k).setValue(defaultAnnotations[k])
            yield m

    def _createManualAnnotationsForMetadata(self, manualAnnotations):
        """
        Wrap each manual annotation value in an Annotation with datasource name "MANUAL".

        :param manualAnnotations: dict of annotation name -> value
        :return: dict of annotation name -> Annotation
        """
        result = {}
        for k in manualAnnotations.keys():
            result[k] = Annotation(manualAnnotations[k], datasourceName="MANUAL")
        return result

    def createHeaderString(self, is_giving_oncotator_version=True):
        """
        Create a default header string that lists version of Oncotator and datasource information.

        :param is_giving_oncotator_version: whether to prefix the string with the Oncotator version
        :return str: header string with the "|" delimiter
        """
        onco_string = ""
        if is_giving_oncotator_version:
            onco_string = "Oncotator " + VERSION + " |"

        datasourceStrings = []
        for ds in self._datasources:
            # Only transcript providers contribute a transcript mode to their entry.
            tx_mode_str = ""
            if isinstance(ds, TranscriptProvider):
                tx_mode_str = ds.get_tx_mode() + " "
            datasourceStrings.append(" " + ds.title + " " + ds.version + " " + tx_mode_str)

        return onco_string + "|".join(datasourceStrings)

    def _annotate_mutations_using_datasources(self, mutations):
        """
        Perform the actual annotating of mutations with the datasources.  Make sure to check the cache as well.

        :param MutationData mutations: iterable of MutationData
        :return generator : MutationData generator
        """
        if len(self._datasources) == 0:
            self.logger.warning("THERE ARE NO DATASOURCES REGISTERED")

        is_cache_being_used = (self._cacheManager is not None)

        for m in mutations:
            # If the alt_allele_seen annotation is present and False, skip this mutation
            if self._is_skip_no_alts and m.get("alt_allele_seen", "True") == "False":
                continue

            cache_annot_dict = None
            if is_cache_being_used:
                cache_annot_dict = self._cacheManager.retrieve_cached_annotations(m)

            # If no cache results were found, annotate normally.
            if cache_annot_dict is None:
                for datasource in self._datasources:
                    # This will evaluate to datasource.annotate_mutation(m) or datasource.annotate_segment(m)
                    m = self._annotate_func_ptr(m, datasource)

                if is_cache_being_used:
                    self._cache_stats['miss'] += 1
                    self._cacheManager.store_annotations_in_cache(m)
            else:
                self._cache_stats['hit'] += 1
                m.addAnnotations(cache_annot_dict)
            yield m
class Annotator(object):
    """
    The Annotator is the entry point to actually perform the annotating of mutations.

    The Annotator contains one input creator (IC), one output creator (OC), and a list of datasources.
    This class is responsible for the coordination of the annotating process, not the annotations themselves
    (this is handled by the datasources).

    For information on how to initialize the input and output creator, please see the documentation of those classes.

    See the RunSpecification class, which allows for more control of an annotator.

    Example usage (with RunSpec and no multicore usage):
        # Create a run configuration to pass to the Annotator class.  See OncotatorCLIUtils.getSupportedOutputFormats()
        #   and OncotatorCLIUtils.getSupportedInputFormats() for allowed inputFormat and outputFormat strings.
        manualOverrides = {'fake_annotation':'picard', 'fake_annotation2':'worf'}
        runConfig = OncotatorCLIUtils.createRunConfig(inputFormat, outputFormat, inputFilename, outputFilename,
                      globalAnnotations=manualOverrides, datasourceDir="/home/onco/dbs", isMulticore=False)

        annotator = Annotator()
        annotator.initialize(runConfig)
        annotator.annotate()

    Example usage (used in testing, without RunSpec):
        # Assumed myIC and myOC have been initialized as the proper Input and Output Creators, respectively.
        # 1) Initialize the Annotator
        annotator = Annotator()
        annotator.setInputCreator(myIC)
        annotator.setOutputRenderer(myOC)
        # 1a) For each datasource (instance of a datasource class), add it to the annotator.
        for datasource in myDataSources:
            annotator.addDatasource(datasource)
        # 2) Produce the output
        filePointer = annotator.annotate()

    NOTE:  While multicore information is passed into the Annotator, currently, nothing is implemented that uses multicore.
    """

    def __init__(self):
        """
        Create a new instance of Annotator.

        In order to specify the input and output creators and datasources, use the set and addDatasource methods
        (or pass a RunSpecification to initialize()).
        """
        self._inputCreator = None
        self._outputRenderer = None
        self._datasources = []
        self.logger = logging.getLogger(__name__)
        self._manualAnnotations = dict()
        self._defaultAnnotations = dict()
        self._isMulticore = None
        self._numCores = None

        # Start with a cache manager that was initialized without a cache url.
        # initialize_cache_manager() replaces it (or sets it to None) once a RunSpecification is known.
        self._cacheManager = CacheManager()
        self._cacheManager.initialize(None, "not_used")
        self._cache_stats = {"miss": 0, "hit": 0}
        self._is_skip_no_alts = False

    # NOTE: the multicore accessors use the single-underscore attributes assigned in __init__.
    #  Double-underscore names would be mangled to _Annotator__* and would not see those defaults,
    #  making a getter raise AttributeError when called before the matching setter.
    def getIsMulticore(self):
        """:return: the multicore flag (None until it has been set)"""
        return self._isMulticore

    def getNumCores(self):
        """:return: the configured number of cores (None until it has been set)"""
        return self._numCores

    def setIsMulticore(self, value):
        """Store the multicore flag.  Nothing in this class acts on it yet."""
        self._isMulticore = value

    def setNumCores(self, value):
        """Store the number of cores.  Nothing in this class acts on it yet."""
        self._numCores = value

    def setInputCreator(self, inputCreator):
        """Set the object whose createMutations(), getMetadata() and getComments() feed annotate()."""
        self._inputCreator = inputCreator

    def setOutputRenderer(self, outputCreator):
        """Set the object whose renderMutations() writes the annotated mutations."""
        self._outputRenderer = outputCreator

    def setManualAnnotations(self, value):
        """Set the dict (annotation name -> value) that is applied to every mutation, overwriting existing values."""
        self._manualAnnotations = value

    def setDefaultAnnotations(self, value):
        """Set the dict (annotation name -> value) used only where a mutation is missing the annotation or it is ""."""
        self._defaultAnnotations = value

    def create_db_dir_key(self):
        """Create the db_dir_key for this annotation configuration.  Requires the datasources.

        :return: md5 hash derived from the hashcode of every registered datasource, in registration order
        """
        self.logger.info("Generating db-dir key from datasources...")
        hasher = Hasher()
        for ds in self._datasources:
            # Ask each datasource for its hashcode only once; it is needed for both the log line and the hash.
            ds_hashcode = ds.get_hashcode()
            self.logger.info(ds.title + " " + ds.version + " md5: " + ds_hashcode)
            hasher.update(ds_hashcode)
        db_dir_key = Hasher.md5_hash(hasher.hexdigest())
        self.logger.info("Final db-dir md5: " + db_dir_key)
        return db_dir_key

    def create_db_dir_key_simple(self):
        """Create the db_dir_key for this annotation configuration.  Requires the datasources.

        This variant only hashes the header string (datasource titles and versions, without the Oncotator version).

        :return: md5 hash of the header string
        """
        return Hasher.md5_hash(self.createHeaderString(False))

    def initialize_cache_manager(self, runSpec):
        """Create the cache manager described by the run specification.

        Do not bother calculating the db_dir_key if the cache is not being used.  In that case, the cache manager
        is set to None, which disables all cache lookups and stores during annotation.

        :param runSpec: RunSpecification providing get_cache_url() and get_is_read_only_cache()
        """
        cache_url = runSpec.get_cache_url()
        if cache_url is None or cache_url == "":
            self._cacheManager = None
            return

        db_dir_key = self.create_db_dir_key()
        self._cacheManager = CacheManager()
        self._cacheManager.initialize(cache_url, db_dir_key, is_read_only=runSpec.get_is_read_only_cache())

    def initialize(self, runSpec):
        """ Given a RunSpecification instance, initialize self properly.  Do not start annotation.

        :param runSpec: RunSpecification instance
        """
        self.setInputCreator(runSpec.inputCreator)
        self.setOutputRenderer(runSpec.outputRenderer)
        self.setManualAnnotations(runSpec.manualAnnotations)
        self.setDefaultAnnotations(runSpec.defaultAnnotations)
        self._datasources = runSpec.datasources
        self.setIsMulticore(runSpec.get_is_multicore())
        self.setNumCores(runSpec.get_num_cores())
        self._cache_stats = {"miss": 0, "hit": 0}
        self._is_skip_no_alts = runSpec.get_is_skip_no_alts()

        # Must come after the datasources are set, since the db_dir_key is derived from them.
        self.initialize_cache_manager(runSpec)

    def addDatasource(self, datasource):
        """Append a datasource.  Datasources annotate in the order in which they were added."""
        self._datasources.append(datasource)

    def _createMetadata(self):
        """:return: metadata from the input creator, updated with the manual annotations"""
        metadata = self._inputCreator.getMetadata()
        metadata.update(self._createManualAnnotationsForMetadata(self._manualAnnotations))
        return metadata

    def _createComments(self):
        """:return: comments from the input creator with the header string appended"""
        comments = self._inputCreator.getComments()
        comments.append(self.createHeaderString())
        return comments

    def annotate_mutations(self, mutations):
        """Chain the datasource, default and manual annotation steps over the given mutations.

        Every step is a generator, so nothing is annotated until the returned generator is iterated.
        Defaults are applied before manual annotations, so manual annotations always win.

        :param mutations: iterable of mutations
        :return: generator of annotated mutations
        """
        mutations = self._annotate_mutations_using_datasources(mutations)
        mutations = self._applyDefaultAnnotations(mutations, self._defaultAnnotations)
        mutations = self._applyManualAnnotations(mutations, self._manualAnnotations)
        return mutations

    def annotate(self):
        """
        Annotate the given mutations specified in the input.

        Call this after the input, output, and datasources have been set.

        :return: outputFilename
        """
        self.logger.info("Annotating with " + str(len(self._datasources)) + " datasources: " + self.createHeaderString())

        try:
            mutations = self._inputCreator.createMutations()
            if mutations is None:
                self.logger.warning("Mutation list points to None after creation.")

            mutations = self.annotate_mutations(mutations)

            comments = self._createComments()
            metadata = self._createMetadata()

            # The mutations are a chain of generators, so the annotating (and any cache traffic) only
            #  happens while the mutations are being iterated -- presumably inside the renderer.
            filename = self._outputRenderer.renderMutations(mutations, metadata=metadata, comments=comments)
        finally:
            # Close the cache even when creating or rendering the mutations fails.
            if self._cacheManager is not None:
                self.logger.info("Closing cache: (misses: " + str(self._cache_stats['miss']) + "  hits: " + str(self._cache_stats['hit']) + ")")
                self._cacheManager.close_cache()

        return filename

    def _applyManualAnnotations(self, mutations, manualAnnotations):
        """Generator that stamps every manual annotation (source "MANUAL") onto each mutation.

        :param mutations: iterable of mutations
        :param manualAnnotations: dict of annotation name -> value
        """
        for m in mutations:
            for name, value in manualAnnotations.items():
                # newRequired = False allows this call to overwrite the previous value.
                m.createAnnotation(name, value, annotationSource="MANUAL", newRequired=False)
            yield m

    def _applyDefaultAnnotations(self, mutations, defaultAnnotations):
        """Generator that fills in default annotations (source "DEFAULT") on each mutation.

        A default is used when the mutation does not have the annotation at all or when its value is "".

        :param mutations: iterable of mutations
        :param defaultAnnotations: dict of annotation name -> value
        """
        for m in mutations:
            # Snapshot of the annotation names present before any default is added.
            existing_names = set(m.keys())
            for name, value in defaultAnnotations.items():
                if name not in existing_names:
                    m.createAnnotation(name, value, annotationSource="DEFAULT")
                if m[name] == "":
                    annotation = m.getAnnotation(name)
                    annotation.setDatasource("DEFAULT")
                    annotation.setValue(value)
            yield m

    def _createManualAnnotationsForMetadata(self, manualAnnotations):
        """:return: dict of annotation name -> Annotation (datasource "MANUAL"), one per manual annotation"""
        return dict((name, Annotation(value, datasourceName="MANUAL")) for name, value in manualAnnotations.items())

    def createHeaderString(self, is_giving_oncotator_version=True):
        """
        Create a default header string that lists version of Oncotator and datasource information.

        :param is_giving_oncotator_version: whether to prefix the string with "Oncotator <VERSION> |"
        :return: string with one " <title> <version> " entry per datasource, joined by "|"
        """
        onco_string = ""
        if is_giving_oncotator_version:
            onco_string = "Oncotator " + VERSION + " |"

        datasource_strings = [" " + ds.title + " " + ds.version + " " for ds in self._datasources]
        return onco_string + "|".join(datasource_strings)

    def _annotate_mutations_using_datasources(self, mutations):
        """
        Perform the actual annotating of mutations with the datasources.  Make sure to check the cache as well.

        Mutations with alt_allele_seen == "False" are dropped when skipping of no-alt mutations is enabled.
        Hit and miss counts are only updated while a cache manager is set.

        :param mutations: iterable of mutations
        :return: generator of annotated mutations
        """
        if not self._datasources:
            self.logger.warning("THERE ARE NO DATASOURCES REGISTERED")

        is_cache_being_used = (self._cacheManager is not None)

        for m in mutations:

            # If the alt_allele_seen annotation is present and False, skip this mutation
            if self._is_skip_no_alts and m.get("alt_allele_seen", "True") == "False":
                continue

            cached_annot_dict = None
            if is_cache_being_used:
                cached_annot_dict = self._cacheManager.retrieve_cached_annotations(m)

            if cached_annot_dict is not None:
                # Cache hit:  reuse the stored annotations and leave the datasources alone.
                self._cache_stats['hit'] += 1
                m.addAnnotations(cached_annot_dict)
            else:
                # No cache results were found, so annotate normally.
                for datasource in self._datasources:
                    m = datasource.annotate_mutation(m)

                if is_cache_being_used:
                    self._cache_stats['miss'] += 1
                    self._cacheManager.store_annotations_in_cache(m)

            yield m