def _process_ontology_data(self):
    """Load the disease ontology and fill ``self.efos`` with one EFO per class.

    A fresh ``OntologyClassReader`` is stored on ``self.disease_ontology`` and
    populated from ``self.efo_uri``. For every entry of its
    ``current_classes`` an ``EFO`` object is built and stored in
    ``self.efos`` under the last element of the first path in
    ``classes_paths[uri]['ids']``.
    """
    ontology = OntologyClassReader()
    self.disease_ontology = ontology
    opentargets_ontologyutils.efo.load_open_targets_disease_ontology(
        ontology, self.efo_uri)

    # Get all phenotypes.
    # opentargets_ontologyutils, for legacy reasons, iterates over
    # (key, uri) pairs, hence the enumerate().
    numbered_phenotype_uris = enumerate(self.disease_phenotype_uris)
    disease_phenotypes = DiseaseUtils().get_disease_phenotypes(
        ontology, self.hpo_uri, self.mp_uri, numbered_phenotype_uris)

    definition_key = 'http://purl.obolibrary.org/obo/IAO_0000115'
    synonym_key = 'http://www.ebi.ac.uk/efo/alternative_term'

    for uri, label in ontology.current_classes.items():
        properties = ontology.parse_properties(URIRef(uri))

        # create a text block definition/description by joining the
        # individual definition values together
        definition = (". ".join(properties[definition_key])
                      if definition_key in properties else '')

        synonyms = properties[synonym_key] if synonym_key in properties else []

        phenotypes = (disease_phenotypes[uri]['phenotypes']
                      if uri in disease_phenotypes else [])

        class_paths = ontology.classes_paths[uri]

        # first element of every label path, de-duplicated
        first_labels = [label_path[0] for label_path in class_paths['labels']]
        therapeutic_labels = self._remove_duplicates(first_labels)

        efo = EFO(
            code=uri,
            label=label,
            synonyms=synonyms,
            phenotypes=phenotypes,
            path=class_paths['all'],
            path_codes=class_paths['ids'],
            path_labels=class_paths['labels'],
            therapeutic_labels=therapeutic_labels,
            definition=definition)

        # key of the record: last id of the first path
        short_code = class_paths['ids'][0][-1]

        if uri in ontology.children:
            efo.children = ontology.children[uri]

        self.efos[short_code] = efo
def _get_mp_classes(self, mp_uri):
    """Load the ontology at ``mp_uri`` and index its classes on ``self``.

    Stores a new ``OntologyClassReader`` on ``self.mp_ontology`` and, for each
    of its current classes that has an entry in ``classes_paths``, fills:

    * ``self.mps``: short key (last ``/`` segment of the id, ``:`` replaced
      by ``_``) -> dict with ``label``, ``path`` and ``path_codes``
    * ``self.mp_labels``: label -> id
    * ``self.mp_to_label``: id -> label
    * ``self.top_levels``: id -> first element of every path in ``path``

    Classes without paths are skipped with a warning.
    """
    # load the ontology
    ontology = OntologyClassReader()
    self.mp_ontology = ontology
    opentargets_ontologyutils.mp.load_mammalian_phenotype_ontology(ontology, mp_uri)

    # TODO this is a moderately hideous bit of pointless munging,
    # kept as-is until there is time to fix it
    for mp_id, label in list(ontology.current_classes.items()):
        if mp_id not in ontology.classes_paths:
            self._logger.warning("cannot find paths for "+mp_id)
            continue

        class_paths = ontology.classes_paths[mp_id]
        mp_class = {
            "label": label,
            "path": class_paths['all'],
            "path_codes": class_paths['ids'],
        }

        short_key = mp_id.split("/")[-1].replace(":", "_")
        self.mps[short_key] = mp_class
        self.mp_labels[label] = mp_id
        self.mp_to_label[mp_id] = label

        # first element of each path
        self.top_levels[mp_id] = [path[0] for path in mp_class["path"]]
def __init__(self, es_hosts, es_index, es_mappings, es_settings, eco_uri, so_uri, workers_write, queue_write):
    """Store the configuration and create the empty working state.

    Every argument is kept unchanged on the instance under the same name.
    ``self.ecos`` starts as an empty ``OrderedDict`` and
    ``self.evidence_ontology`` as a new ``OntologyClassReader``.
    """
    # index configuration
    self.es_hosts, self.es_index = es_hosts, es_index
    self.es_mappings, self.es_settings = es_mappings, es_settings

    # ontology sources
    self.eco_uri, self.so_uri = eco_uri, so_uri

    # write tuning
    self.workers_write, self.queue_write = workers_write, queue_write

    # working state
    self.ecos = OrderedDict()
    self.evidence_ontology = OntologyClassReader()
def _process_ontology_data(self):
    """Load the disease ontology and fill ``self.efos`` with one EFO per class.

    A fresh ``OntologyClassReader`` is stored on ``self.disease_ontology`` and
    populated from ``self.efo_uri``. Every key of its ``classes_paths`` is
    turned into an ``EFO`` object and stored in ``self.efos`` under the last
    element of the first path in ``classes_path['ids']``. When that key is
    already present the new record is dropped and a warning is logged.

    Raises:
        RuntimeError: if a key of ``classes_paths`` differs from the ``uri``
            of the last element of its first ``all`` path.
    """
    self.disease_ontology = OntologyClassReader()
    opentargets_ontologyutils.efo.load_open_targets_disease_ontology(self.disease_ontology, self.efo_uri)

    # Get all phenotypes.
    # opentargets_ontologyutils, for legacy reasons, iterates over
    # (key, uri) pairs, hence the enumerate().
    disease_phenotype_uris_counter = enumerate(self.disease_phenotype_uris)
    utils = DiseaseUtils()
    disease_phenotypes = utils.get_disease_phenotypes(self.disease_ontology,
        self.hpo_uri, self.mp_uri,
        disease_phenotype_uris_counter)

    definition_uri = 'http://purl.obolibrary.org/obo/IAO_0000115'
    # exact, related (partially overlapping), generic and narrow synonyms;
    # could have http://www.geneontology.org/formats/oboInOwl#hasBroadSynonym,
    # but that is better captured by parent term
    synonym_uris = (
        'http://www.geneontology.org/formats/oboInOwl#hasExactSynonym',
        'http://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym',
        'http://www.geneontology.org/formats/oboInOwl#hasSynonym',
        'http://www.geneontology.org/formats/oboInOwl#hasNarrowSynonym',
    )

    for uri in self.disease_ontology.classes_paths:
        classes_path = self.disease_ontology.classes_paths[uri]

        # get the short code form of the uri
        efo_id = classes_path['ids'][0][-1]
        label = classes_path['labels'][0][-1]

        # sanity check: the key must agree with the uri stored in its own path
        path_uri = classes_path["all"][0][-1]["uri"]
        if uri != path_uri:
            raise RuntimeError('mismatch between uri and classes_path["all"][0][-1]["uri"] %s %s' % (uri, path_uri))

        properties = self.disease_ontology.parse_properties(URIRef(uri))

        # create a text block definition/description by joining others together
        definition = ''
        if definition_uri in properties:
            definition = ". ".join(properties[definition_uri])

        # build a set of all the relevant synonyms
        synonyms = set()
        for synonym_uri in synonym_uris:
            if synonym_uri in properties:
                synonyms.update(properties[synonym_uri])

        phenotypes = []
        if uri in disease_phenotypes:
            phenotypes = disease_phenotypes[uri]['phenotypes']

        # NOTE: no "uri not in classes_paths" check here; uri is a key of
        # that mapping by construction, so the check could never fire.
        therapeutic_labels = self.disease_ontology.therapeutic_labels[uri]
        therapeutic_uris = self.disease_ontology.therapeutic_uris[uri]
        therapeutic_codes = [self.disease_ontology.classes_paths[ta_uri]['ids'][0][-1]
                             for ta_uri in therapeutic_uris]

        efo = EFO(code=uri,
                  label=label,
                  synonyms=synonyms,
                  phenotypes=phenotypes,
                  path=classes_path['all'],
                  path_codes=classes_path['ids'],
                  path_labels=classes_path['labels'],
                  therapeutic_labels=therapeutic_labels,
                  therapeutic_codes=therapeutic_codes,
                  definition=definition)

        if uri in self.disease_ontology.children:
            efo.children = self.disease_ontology.children[uri]

        self.logger.debug("done %s %s %s", efo_id, uri, label)

        # first record wins when two uris share a short code
        if efo_id in self.efos:
            self.logger.warning("duplicate %s", efo_id)
            continue
        self.efos[efo_id] = efo
class EfoProcess():
    """Build EFO records from the disease ontology and index them.

    ``process_all`` first fills ``self.efos`` from the ontology and then
    bulk-writes its items to the configured Elasticsearch index; ``qc``
    computes summary metrics over an existing index.
    """

    # annotation property whose values are joined into the definition text
    _DEFINITION_URI = 'http://purl.obolibrary.org/obo/IAO_0000115'

    # exact, related (partially overlapping), generic and narrow synonyms;
    # could have http://www.geneontology.org/formats/oboInOwl#hasBroadSynonym,
    # but that is better captured by parent term
    _SYNONYM_URIS = (
        'http://www.geneontology.org/formats/oboInOwl#hasExactSynonym',
        'http://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym',
        'http://www.geneontology.org/formats/oboInOwl#hasSynonym',
        'http://www.geneontology.org/formats/oboInOwl#hasNarrowSynonym',
    )

    def __init__(self,
            es_hosts, es_index, es_doc, es_mappings, es_settings,
            efo_uri, hpo_uri, mp_uri,
            disease_phenotype_uris,
            workers_write, queue_write
            ):
        """Store the configuration unchanged and create the empty state.

        ``self.efos`` starts as an empty ``OrderedDict``; the logger is named
        after this module with a ``.EfoProcess`` suffix.
        """
        self.es_hosts = es_hosts
        self.es_index = es_index
        self.es_doc = es_doc
        self.es_mappings = es_mappings
        self.es_settings = es_settings
        self.efo_uri = efo_uri
        self.hpo_uri = hpo_uri
        self.mp_uri = mp_uri
        self.disease_phenotype_uris = disease_phenotype_uris
        self.workers_write = workers_write
        self.queue_write = queue_write

        self.efos = OrderedDict()
        self.logger = logging.getLogger(__name__+".EfoProcess")

    def process_all(self, dry_run):
        """Build all EFO records, then store them (see ``_store_efo``)."""
        self._process_ontology_data()
        self._store_efo(dry_run)

    def _process_ontology_data(self):
        """Load the disease ontology and fill ``self.efos``.

        Every key of the ontology's ``classes_paths`` is turned into an
        ``EFO`` object stored under the last element of the first path in
        ``classes_path['ids']``. When that key is already present the new
        record is dropped and a warning is logged.

        Raises:
            RuntimeError: if a key of ``classes_paths`` differs from the
                ``uri`` of the last element of its first ``all`` path.
        """
        self.disease_ontology = OntologyClassReader()
        opentargets_ontologyutils.efo.load_open_targets_disease_ontology(self.disease_ontology, self.efo_uri)

        # Get all phenotypes.
        # opentargets_ontologyutils, for legacy reasons, iterates over
        # (key, uri) pairs, hence the enumerate().
        disease_phenotype_uris_counter = enumerate(self.disease_phenotype_uris)
        utils = DiseaseUtils()
        disease_phenotypes = utils.get_disease_phenotypes(self.disease_ontology,
            self.hpo_uri, self.mp_uri,
            disease_phenotype_uris_counter)

        for uri in self.disease_ontology.classes_paths:
            classes_path = self.disease_ontology.classes_paths[uri]

            # get the short code form of the uri
            efo_id = classes_path['ids'][0][-1]
            label = classes_path['labels'][0][-1]

            # sanity check: the key must agree with the uri in its own path
            path_uri = classes_path["all"][0][-1]["uri"]
            if uri != path_uri:
                raise RuntimeError('mismatch between uri and classes_path["all"][0][-1]["uri"] %s %s' % (uri, path_uri))

            properties = self.disease_ontology.parse_properties(URIRef(uri))

            # create a text block definition/description by joining others together
            definition = ''
            if self._DEFINITION_URI in properties:
                definition = ". ".join(properties[self._DEFINITION_URI])

            # build a set of all the relevant synonyms
            synonyms = set()
            for synonym_uri in self._SYNONYM_URIS:
                if synonym_uri in properties:
                    synonyms.update(properties[synonym_uri])

            phenotypes = []
            if uri in disease_phenotypes:
                phenotypes = disease_phenotypes[uri]['phenotypes']

            # NOTE: no "uri not in classes_paths" check here; uri is a key of
            # that mapping by construction, so the check could never fire.
            therapeutic_labels = self.disease_ontology.therapeutic_labels[uri]
            therapeutic_uris = self.disease_ontology.therapeutic_uris[uri]
            therapeutic_codes = [self.disease_ontology.classes_paths[ta_uri]['ids'][0][-1]
                                 for ta_uri in therapeutic_uris]

            efo = EFO(code=uri,
                      label=label,
                      synonyms=synonyms,
                      phenotypes=phenotypes,
                      path=classes_path['all'],
                      path_codes=classes_path['ids'],
                      path_labels=classes_path['labels'],
                      therapeutic_labels=therapeutic_labels,
                      therapeutic_codes=therapeutic_codes,
                      definition=definition)

            if uri in self.disease_ontology.children:
                efo.children = self.disease_ontology.children[uri]

            self.logger.debug("done %s %s %s", efo_id, uri, label)

            # first record wins when two uris share a short code
            if efo_id in self.efos:
                self.logger.warning("duplicate %s", efo_id)
                continue
            self.efos[efo_id] = efo

    def _store_efo(self, dry_run):
        """Bulk-write ``self.efos`` to Elasticsearch.

        Mappings and settings are read as JSON from ``self.es_mappings`` and
        ``self.es_settings``. When ``dry_run`` is true the index manager is
        still entered but nothing is sent to the bulk helpers.

        Raises:
            RuntimeError: if at least one bulk action reports failure.
        """
        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)

        es = new_es_client(self.es_hosts)
        with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):

            # write into elasticsearch
            chunk_size = 1000  # TODO make configurable
            actions = elasticsearch_actions(self.efos.items(), self.es_index, self.es_doc)
            failcount = 0
            if not dry_run:
                if self.workers_write > 0:
                    results = elasticsearch.helpers.parallel_bulk(es, actions,
                            thread_count=self.workers_write,
                            queue_size=self.queue_write,
                            chunk_size=chunk_size)
                else:
                    results = elasticsearch.helpers.streaming_bulk(es, actions,
                            chunk_size=chunk_size)

                # the helpers are consumed here; only failures are counted
                for success, _details in results:
                    if not success:
                        failcount += 1

                if failcount:
                    raise RuntimeError("%s relations failed to index" % failcount)

    def qc(self, es, index):
        """Run a series of QC tests on EFO elasticsearch index.

        Returns a dictionary of string test names and result objects
        """
        self.logger.info("Starting QC")

        # loop over all efo terms and calculate the metrics
        # Note: try to avoid doing this more than once!
        efo_terms = Search().using(es).index(index).query(MatchAll()).scan()
        metrics = self._summarise_terms(efo_terms)

        # return the metrics to the caller so they can write to file or further compare
        self.logger.info("Finished QC")
        return metrics

    @staticmethod
    def _summarise_terms(efo_terms):
        """Compute the QC metrics dict from an iterable of EFO term records.

        Each record is read through the keys ``path_labels``, ``label`` and
        ``definition``. The iterable is consumed exactly once.
        """
        # number of EFO terms
        efo_term_count = 0
        # top level terms (i.e. categories)
        efo_top_levels = []
        # terms without a description
        efo_missing_description_count = 0

        for efo_term in efo_terms:
            efo_term_count += 1

            # path_labels is a list of lists of all paths to the root
            # top level terms will be those with one list of one item that is itself
            path_labels = efo_term["path_labels"]
            if len(path_labels) == 1 and len(path_labels[0]) == 1:
                efo_top_levels.append(efo_term["label"])

            definition = efo_term["definition"]
            if definition is None or not definition.strip():
                efo_missing_description_count += 1

        # put the metrics into a single dict
        metrics = dict()
        metrics["efo.count"] = efo_term_count
        metrics["efo.top"] = sorted(efo_top_levels)
        metrics["efo.top.count"] = len(efo_top_levels)
        metrics["efo.missing_description.count"] = efo_missing_description_count
        return metrics
class EfoProcess():
    """Build EFO records from the disease ontology and store them via a loader.

    ``process_all`` first fills ``self.efos`` from the ontology and then
    writes every record through ``self.loader``; ``qc`` computes summary
    metrics from a query object.
    """

    def __init__(self, loader, efo_uri, hpo_uri, mp_uri,
                 disease_phenotype_uris):
        """Store the arguments unchanged and create the empty state.

        ``self.efos`` starts as an empty ``OrderedDict``; the logger is named
        after this module with a ``.EfoProcess`` suffix.
        """
        self.loader = loader
        self.efos = OrderedDict()
        self.logger = logging.getLogger(__name__ + ".EfoProcess")
        self.efo_uri = efo_uri
        self.hpo_uri = hpo_uri
        self.mp_uri = mp_uri
        self.disease_phenotype_uris = disease_phenotype_uris

    def process_all(self, dry_run):
        """Build all EFO records, then store them (see ``_store_efo``)."""
        self._process_ontology_data()
        self._store_efo(dry_run)

    def _process_ontology_data(self):
        """Load the disease ontology and fill ``self.efos``.

        For every entry of the ontology's ``current_classes`` an ``EFO``
        object is built and stored under the last element of the first path
        in ``classes_paths[uri]['ids']``.
        """
        self.disease_ontology = OntologyClassReader()
        opentargets_ontologyutils.efo.load_open_targets_disease_ontology(
            self.disease_ontology, self.efo_uri)

        # Get all phenotypes.
        # opentargets_ontologyutils, for legacy reasons, iterates over
        # (key, uri) pairs, hence the enumerate().
        disease_phenotype_uris_counter = enumerate(self.disease_phenotype_uris)
        utils = DiseaseUtils()
        disease_phenotypes = utils.get_disease_phenotypes(
            self.disease_ontology, self.hpo_uri, self.mp_uri,
            disease_phenotype_uris_counter)

        definition_uri = 'http://purl.obolibrary.org/obo/IAO_0000115'
        synonym_uri = 'http://www.ebi.ac.uk/efo/alternative_term'

        for uri, label in self.disease_ontology.current_classes.items():
            properties = self.disease_ontology.parse_properties(URIRef(uri))

            # create a text block definition/description by joining others together
            definition = ''
            if definition_uri in properties:
                definition = ". ".join(properties[definition_uri])

            synonyms = []
            if synonym_uri in properties:
                synonyms = properties[synonym_uri]

            phenotypes = []
            if uri in disease_phenotypes:
                phenotypes = disease_phenotypes[uri]['phenotypes']

            # one lookup instead of repeating it for every field below
            classes_path = self.disease_ontology.classes_paths[uri]

            # first element of every label path, de-duplicated
            therapeutic_labels = self._remove_duplicates(
                [item[0] for item in classes_path['labels']])

            efo = EFO(
                code=uri,
                label=label,
                synonyms=synonyms,
                phenotypes=phenotypes,
                path=classes_path['all'],
                path_codes=classes_path['ids'],
                path_labels=classes_path['labels'],
                therapeutic_labels=therapeutic_labels,
                definition=definition)

            # key of the record: last id of the first path
            # (named efo_id so the builtin id() is not shadowed)
            efo_id = classes_path['ids'][0][-1]

            if uri in self.disease_ontology.children:
                efo.children = self.disease_ontology.children[uri]

            self.efos[efo_id] = efo

    def _remove_duplicates(self, xs):
        """Return a new list with the items of ``xs`` in first-seen order.

        Membership is tested against the result list (equality based), so
        items do not need to be hashable.
        """
        newlist = []
        for item in xs:
            if item not in newlist:
                newlist.append(item)
        return newlist

    def _store_efo(self, dry_run):
        """Write every record of ``self.efos`` through ``self.loader``.

        When ``dry_run`` is true nothing is done: the loader is not touched.
        """
        if dry_run:
            return

        index_name = Const.ELASTICSEARCH_EFO_LABEL_INDEX_NAME

        # setup elasticsearch
        self.loader.create_new_index(index_name)
        # need to directly get the versioned index name for this function
        self.loader.prepare_for_bulk_indexing(
            self.loader.get_versioned_index(index_name))

        for efo_id, efo_obj in self.efos.items():
            self.loader.put(
                index_name=index_name,
                doc_type=Const.ELASTICSEARCH_EFO_LABEL_DOC_NAME,
                ID=efo_id,
                body=efo_obj)

        # cleanup elasticsearch
        self.loader.flush_all_and_wait(index_name)
        # restore old pre-load settings
        # note this automatically does all prepared indexes
        self.loader.restore_after_bulk_indexing()

    def qc(self, esquery):
        """Run a series of QC tests on EFO elasticsearch index.

        Returns a dictionary of string test names and result objects
        """
        self.logger.info("Starting QC")

        # number of EFO terms
        efo_term_count = 0
        # top level terms (i.e. categories)
        efo_top_levels = []
        # terms without a description
        efo_missing_description_count = 0

        # loop over all efo terms and calculate the metrics
        # Note: try to avoid doing this more than once!
        for efo_term in esquery.get_all_diseases():
            efo_term_count += 1

            # path_labels is a list of lists of all paths to the root
            # top level terms will be those with one list of one item that is itself
            path_labels = efo_term["path_labels"]
            if len(path_labels) == 1 and len(path_labels[0]) == 1:
                efo_top_levels.append(efo_term["label"])

            definition = efo_term["definition"]
            if definition is None or not definition.strip():
                efo_missing_description_count += 1

        # put the metrics into a single dict
        metrics = dict()
        metrics["efo.count"] = efo_term_count
        metrics["efo.top"] = sorted(efo_top_levels)
        metrics["efo.top.count"] = len(efo_top_levels)
        metrics["efo.missing_description.count"] = efo_missing_description_count

        # return the metrics to the caller so they can write to file or further compare
        self.logger.info("Finished QC")
        return metrics
def __init__(self, loader, eco_uri, so_uri):
    """Store the loader and ontology URIs and create the empty state.

    ``self.ecos`` starts as an empty ``OrderedDict`` and
    ``self.evidence_ontology`` as a new ``OntologyClassReader``.
    """
    self.loader = loader

    # working state
    self.ecos = OrderedDict()
    self.evidence_ontology = OntologyClassReader()

    # ontology sources
    self.eco_uri, self.so_uri = eco_uri, so_uri