def all_synonyms(self, include_label=False):
    """
    Return all synonyms, fetching them on first use

    The synonym tuples are obtained from ``fetchall_syns(self.graph_name)``
    the first time this is called; each resulting synonym is registered via
    ``self.add_synonym`` and the list is kept in ``self.all_synonyms_cache``
    so later calls do not fetch again.

    Label synonyms are deliberately NOT stored in the cache. Previously the
    cache held whatever the first call asked for, so the `include_label`
    value of the first call silently decided the result of every later call.

    Arguments
    ---------
    include_label: bool
        if True, also return one synonym with ``pred='label'`` for every
        node in ``self.nodes()``, whose value is ``self.label(node)``

    Returns
    -------
    list
        Synonym objects. Without labels this is the cached list itself;
        with labels it is a new list built for this call, so callers may
        extend it without affecting the cache.
    """
    logging.debug("Fetching all syns...")
    if self.all_synonyms_cache is None:
        syntups = fetchall_syns(self.graph_name)
        syns = [Synonym(t[0], pred=t[1], val=t[2]) for t in syntups]
        for syn in syns:
            self.add_synonym(syn)
        self.all_synonyms_cache = syns  # TODO: check if still used

    if not include_label:
        return self.all_synonyms_cache

    # Built per call so the cache content never depends on this flag
    label_syns = [Synonym(x, pred='label', val=self.label(x)) for x in self.nodes()]
    return self.all_synonyms_cache + label_syns
def index_ontology(self, ont):
    """
    Adds an ontology to the index

    This iterates through all labels and synonyms in the ontology, creating an index

    The ontology is first merged into ``self.merged_ontology``. If
    ``self._is_meaningful_ids()`` is true, every node ID (reduced to its
    last path/hash fragment when it starts with 'http') is additionally
    indexed as a synonym with ``pred='label'``. Finally each node ID is
    associated with `ont` in ``self.id_to_ontology_map``.

    Arguments
    ---------
    ont:
        ontology to index; must provide ``all_synonyms(include_label=True)``
        and ``nodes()``
    """
    self.merged_ontology.merge([ont])

    # Work on a copy: ID-derived synonyms are appended below, and the list
    # handed back by the ontology may be one it keeps (e.g. an internal
    # cache), which must not be modified as a side effect of indexing.
    syns = list(ont.all_synonyms(include_label=True))

    include_id = self._is_meaningful_ids()
    logging.info("Include IDs as synonyms: {}".format(include_id))
    if include_id:
        for n in ont.nodes():
            v = n
            # Get fragment
            if v.startswith('http'):
                v = re.sub('.*/', '', v)
                v = re.sub('.*#', '', v)
            syns.append(Synonym(n, val=v, pred='label'))

    logging.info("Indexing {} syns in {}".format(len(syns), ont))
    logging.info("Distinct lexical values: {}".format(len(self.lmap.keys())))
    for syn in syns:
        self.index_synonym(syn, ont)
    for nid in ont.nodes():
        self.id_to_ontology_map[nid].append(ont)
def index_synonym(self, syn, ont):
    """
    Index one synonym under its standardized and its normalized form

    Normally invoked by `index_ontology` rather than by outside callers.

    Nothing is indexed when the synonym has no value, when obsolete classes
    are excluded and the class is obsolete, or when the value contains no
    ASCII letter. Otherwise the standardized value is always indexed; the
    normalized value is indexed as a second, confidence-weighted synonym
    when it differs, the configured weight is positive and the synonym is
    not flagged as an abbreviation.

    Arguments
    ---------
    syn:
        synonym to index; its `ontology` attribute is set to `ont`
    ont:
        ontology the synonym belongs to
    """
    if not syn.val:
        if syn.pred != 'label':
            logging.warning("Incomplete syn: {}".format(syn))
        elif not self._is_meaningful_ids():
            # The error report this check used to guard is disabled, so the
            # result is intentionally unused.
            ont.is_obsolete(syn.class_id)
        return

    if self.exclude_obsolete and ont.is_obsolete(syn.class_id):
        return

    syn.ontology = ont
    prefix, _fragment = ont.prefix_fragment(syn.class_id)
    raw = syn.val

    # Flag as abbreviation when the leading run of capital letters spans at
    # least one third of the value's length.
    leading_caps = re.match('[A-Z]+', raw)
    if leading_caps and leading_caps.span()[1] >= len(raw) / 3:
        syn.is_abbreviation(True)

    # chebi 'synonyms' are often not real synonyms
    # https://github.com/ebi-chebi/ChEBI/issues/3294
    # Values without any letter are dropped; the warning is skipped for CHEBI.
    if not re.match('.*[a-zA-Z]', raw):
        if prefix != 'CHEBI':
            logging.warning('Ignoring suspicous synonym: {}'.format(syn))
        return

    standardized = self._standardize_label(raw)

    # TODO: do this once ahead of time
    # Per-call word map: the shared entries overlaid with prefix-specific synsets.
    word_map = dict(self.wsmap.items())
    for synset in self._get_config_val(prefix, 'synsets', []):
        # TODO: weights
        word_map[synset['synonym']] = synset['word']
    normalized = self._normalize_label(standardized, word_map)

    self._index_synonym_val(syn, standardized)

    nweight = self._get_config_val(prefix, 'normalized_form_confidence', 0.8)
    if nweight > 0 and not syn.is_abbreviation() and normalized != standardized:
        weighted = Synonym(
            syn.class_id,
            val=syn.val,
            pred=syn.pred,
            lextype=syn.lextype,
            ontology=ont,
            confidence=syn.confidence * nweight,
        )
        self._index_synonym_val(weighted, normalized)
def process_rdfgraph(self, rg, ont=None):
    """
    Transform a skos terminology expressed in an rdf graph into an Ontology object

    Every skos:Concept in the graph becomes a node. For each concept the
    following are copied over: its label (via ``self._get_label``),
    skos:definition values whose language equals ``self.lang``,
    skos:broader as parents, skos:related as parents typed with the
    skos:related relation, skos:exactMatch as xrefs, skos:altLabel as
    synonyms, and the schemes returned by ``self._get_schemes`` as subsets.

    Arguments
    ---------
    rg: rdflib.Graph
        graph object
    ont: Ontology
        optional ontology to populate. When omitted, a new Ontology is
        created and its id is taken from the first skos:ConceptScheme
        subject yielded by the graph; a warning is logged if there is none.

    Returns
    -------
    Ontology
    """
    # TODO: ontology metadata
    if ont is None:
        ont = Ontology()
        subjs = list(rg.subjects(RDF.type, SKOS.ConceptScheme))
        if not subjs:
            logging.warning("No ConceptScheme")
        else:
            ont.id = self._uri2id(subjs[0])

    # NOTE(review): subset_map is populated but never read in this method.
    # Kept as-is because it is not visible from here whether the helper
    # calls have effects that later steps rely on - confirm, then remove.
    subset_map = {}
    for concept in rg.subjects(RDF.type, SKOS.Concept):
        for s in self._get_schemes(rg, concept):
            subset_map[self._uri2id(s)] = s

    # Sorted so that nodes are added in a stable order
    for concept in sorted(rg.subjects(RDF.type, SKOS.Concept)):
        node_id = self._uri2id(concept)
        logging.info("ADDING: {}".format(node_id))
        ont.add_node(node_id, self._get_label(rg, concept))

        for defn in rg.objects(concept, SKOS.definition):
            # Only keep definitions written in the configured language
            if defn.language == self.lang:
                td = TextDefinition(node_id, escape_value(defn.value))
                ont.add_text_definition(td)

        for s in rg.objects(concept, SKOS.broader):
            ont.add_parent(node_id, self._uri2id(s))

        for s in rg.objects(concept, SKOS.related):
            ont.add_parent(node_id, self._uri2id(s), self._uri2id(SKOS.related))

        for m in rg.objects(concept, SKOS.exactMatch):
            ont.add_xref(node_id, self._uri2id(m))

        for m in rg.objects(concept, SKOS.altLabel):
            syn = Synonym(node_id, val=self._uri2id(m))
            ont.add_synonym(syn)

        for s in self._get_schemes(rg, concept):
            ont.add_to_subset(node_id, self._uri2id(s))

    return ont
def index_synonym(self, syn, ont):
    """
    Index a synonym

    Typically not called from outside this object; called by `index_ontology`

    The lower-cased synonym value is always indexed. The normalized form
    (``self._normalize`` applied with ``self.wsmap``) is indexed as an
    additional synonym only when ``self.nweight`` is positive and the
    normalized form differs from the lower-cased value. Previously the
    lower-cased value itself was only indexed when ``self.nweight > 0``,
    so a non-positive weight caused the synonym to be lost from the index.

    Arguments
    ---------
    syn:
        synonym to index; its `ontology` attribute is set to `ont`.
        Skipped (with a debug message) when it has no value.
    ont:
        ontology the synonym belongs to. When ``self.exclude_obsolete`` is
        set and the synonym's class is obsolete in `ont`, nothing is indexed.
    """
    if not syn.val:
        logging.debug("Incomplete syn: {}".format(syn))
        return
    if self.exclude_obsolete and ont.is_obsolete(syn.class_id):
        return

    syn.ontology = ont
    v = syn.val.lower()

    # The literal (lower-cased) form is indexed regardless of the weight
    self._index_synonym_val(syn, v)

    # The weight only controls whether the normalized form is indexed too
    if self.nweight > 0:
        nv = self._normalize(v, self.wsmap)
        if nv != v:
            nsyn = Synonym(syn.class_id,
                           val=syn.val,
                           pred=syn.pred,
                           lextype=syn.lextype,
                           ontology=ont)
            self._index_synonym_val(nsyn, nv)