def build_sets(iri, concordfiles, set_type, ignore_list=[], other_prefixes={}, hop_ontologies=False):
    """Given an IRI create a list of sets.  Each set is a set of equivalent LabeledIDs, and there is
    a set for each subclass of the input iri.  Write these lists to concord files, indexed by the prefix."""
    prefix = Text.get_curie(iri)
    types2relations = {'xref': 'xref', 'exact': 'oio:exactMatch', 'close': 'oio:closeMatch'}
    if set_type not in types2relations:
        return
    uber = UberGraph()
    if set_type == 'xref':
        uberres = uber.get_subclasses_and_xrefs(iri)
    elif set_type == 'exact':
        uberres = uber.get_subclasses_and_exacts(iri)
    elif set_type == 'close':
        uberres = uber.get_subclasses_and_close(iri)
    for k, v in uberres.items():
        if not hop_ontologies:
            subclass_prefix = Text.get_curie(k)
            if subclass_prefix != prefix:
                continue
        v = set([norm(x, other_prefixes) for x in v])
        for x in v:
            if Text.get_curie(x) not in ignore_list:
                p = Text.get_curie(k)
                if p in concordfiles:
                    concordfiles[p].write(f'{k}\t{types2relations[set_type]}\t{x}\n')
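# Usage sketch (the root term, file handle, ignore list, and prefix remap below are
# illustrative, not taken from the calling code); requires a live UberGraph endpoint:
#
#   with open('mondo_xref.concord', 'w') as f:
#       build_sets('MONDO:0000001', {'MONDO': f}, 'xref',
#                  ignore_list=['ICD'], other_prefixes={'MSH': 'MESH'})
#
# Each line written is "<subclass curie>\txref\t<normalized xref curie>".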
def get_subclasses_and_xrefs(self, iri):
    text = """
    prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    prefix UBERON: <http://purl.obolibrary.org/obo/UBERON_>
    prefix CL: <http://purl.obolibrary.org/obo/CL_>
    prefix GO: <http://purl.obolibrary.org/obo/GO_>
    prefix CHEBI: <http://purl.obolibrary.org/obo/CHEBI_>
    prefix MONDO: <http://purl.obolibrary.org/obo/MONDO_>
    prefix HP: <http://purl.obolibrary.org/obo/HP_>
    prefix NCIT: <http://purl.obolibrary.org/obo/NCIT_>
    prefix PR: <http://purl.obolibrary.org/obo/PR_>
    select distinct ?descendent ?xref
    from <http://reasoner.renci.org/nonredundant>
    from <http://reasoner.renci.org/ontology>
    where {
        graph <http://reasoner.renci.org/ontology/closure> {
            ?descendent rdfs:subClassOf $sourcedefclass .
        }
        ?descendent <http://www.geneontology.org/formats/oboInOwl#hasDbXref> ?xref .
    }
    """
    resultmap = self.triplestore.query_template(
        inputs={'sourcedefclass': iri},
        outputs=['descendent', 'xref'],
        template_text=text
    )
    results = defaultdict(set)
    for row in resultmap:
        dcurie = Text.opt_to_curie(row['descendent'])
        #Sometimes we're getting back strings that aren't curies; skip those (but complain)
        if ':' not in row['xref']:
            print(f'Bad XREF from {row["descendent"]} to {row["xref"]}')
            continue
        results[dcurie].add(Text.opt_to_curie(row['xref']))
    return results
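# Shape note (values illustrative): the result maps each subclass curie to the set
# of its xref curies, e.g.
#   {'MONDO:0005737': {'DOID:4325', 'UMLS:C0282687'}, ...}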
def get_prefixes(idlist):
    prefs = defaultdict(list)
    for ident in idlist:
        if isinstance(ident, LabeledID):
            #LabeledIDs should have been unwrapped by the caller; bail out loudly.
            print('nonono')
            exit()
        prefs[Text.get_curie(ident)].append(ident)
    return prefs
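# A minimal sketch of the expected shape, assuming Text.get_curie returns the prefix
# of a curie (identifiers below are illustrative):
def _get_prefixes_example():
    prefs = get_prefixes(['MONDO:0005737', 'MONDO:0004979', 'DOID:4325'])
    assert prefs['MONDO'] == ['MONDO:0005737', 'MONDO:0004979']
    assert prefs['DOID'] == ['DOID:4325']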
def get_subclasses_and_exacts(self, iri):
    #The query is parameterized on the match predicate so the same template can be run
    # for skos exactMatch, mondo exactMatch, and owl equivalentClass in turn.
    text = lambda predicate: f"""
    prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    prefix UBERON: <http://purl.obolibrary.org/obo/UBERON_>
    prefix CL: <http://purl.obolibrary.org/obo/CL_>
    prefix GO: <http://purl.obolibrary.org/obo/GO_>
    prefix CHEBI: <http://purl.obolibrary.org/obo/CHEBI_>
    prefix MONDO: <http://purl.obolibrary.org/obo/MONDO_>
    prefix HP: <http://purl.obolibrary.org/obo/HP_>
    prefix EFO: <http://www.ebi.ac.uk/efo/EFO_>
    prefix NCIT: <http://purl.obolibrary.org/obo/NCIT_>
    PREFIX EXACT_MATCH: <http://www.w3.org/2004/02/skos/core#exactMatch>
    PREFIX M_EXACT_MATCH: <http://purl.obolibrary.org/obo/mondo#exactMatch>
    PREFIX EQUIVALENT_CLASS: <http://www.w3.org/2002/07/owl#equivalentClass>
    PREFIX ID: <http://www.geneontology.org/formats/oboInOwl#id>
    SELECT DISTINCT ?descendent ?match
    FROM <http://reasoner.renci.org/ontology>
    WHERE {{
        graph <http://reasoner.renci.org/ontology/closure> {{
            ?descendent rdfs:subClassOf $identifier .
        }}
        OPTIONAL {{
            ?descendent {predicate} ?match.
        }}
    }}
    """
    resultmap = self.triplestore.query_template(
        template_text=text('EXACT_MATCH:'),
        inputs={'identifier': iri},
        outputs=['descendent', 'match']
    )
    resultmap += self.triplestore.query_template(
        template_text=text('M_EXACT_MATCH:'),
        inputs={'identifier': iri},
        outputs=['descendent', 'match']
    )
    resultmap += self.triplestore.query_template(
        template_text=text('EQUIVALENT_CLASS:'),
        inputs={'identifier': iri},
        outputs=['descendent', 'match']
    )
    results = defaultdict(list)
    for row in resultmap:
        desc = Text.opt_to_curie(row['descendent'])
        if row['match'] is None:
            #The match is OPTIONAL; register the descendent even when it has none.
            results[desc] += []
        else:
            results[desc].append(Text.opt_to_curie(row['match']))
    #Sometimes, if there are no exact matches, we'll get some kind of blank node id
    # like 't19830198'.  Filter those out.
    for k, v in results.items():
        results[k] = list(filter(lambda x: ':' in x, v))
    return results
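# Shape note (values illustrative): each descendent maps to a possibly-empty list of
# exact-match curies; descendents with no matches still appear, with [] as the value:
#   {'MONDO:0005737': ['DOID:4325', 'Orphanet:319218'], 'MONDO:0004979': [], ...}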
def get_subclasses_of(self, iri):
    text = """
    prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    prefix UBERON: <http://purl.obolibrary.org/obo/UBERON_>
    prefix CL: <http://purl.obolibrary.org/obo/CL_>
    prefix GO: <http://purl.obolibrary.org/obo/GO_>
    prefix CHEBI: <http://purl.obolibrary.org/obo/CHEBI_>
    prefix MONDO: <http://purl.obolibrary.org/obo/MONDO_>
    prefix HP: <http://purl.obolibrary.org/obo/HP_>
    prefix NCIT: <http://purl.obolibrary.org/obo/NCIT_>
    prefix PR: <http://purl.obolibrary.org/obo/PR_>
    prefix EFO: <http://www.ebi.ac.uk/efo/EFO_>
    select distinct ?descendent ?descendentLabel
    from <http://reasoner.renci.org/ontology>
    where {
        graph <http://reasoner.renci.org/ontology/closure> {
            ?descendent rdfs:subClassOf $sourcedefclass .
        }
        OPTIONAL {
            ?descendent rdfs:label ?descendentLabel .
        }
    }
    """
    rr = self.triplestore.query_template(
        inputs={'sourcedefclass': iri},
        outputs=['descendent', 'descendentLabel'],
        template_text=text
    )
    results = []
    for x in rr:
        y = {}
        y['descendent'] = Text.opt_to_curie(x['descendent'])
        y['descendentLabel'] = x['descendentLabel']
        results.append(y)
    return results
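# Shape note (values illustrative): returns a list of dicts,
#   [{'descendent': 'UBERON:0002048', 'descendentLabel': 'lung'}, ...]
# where descendentLabel may be None when the OPTIONAL label clause finds nothing.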
def get_all_synonyms(self):
    text = """
    prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    prefix owl: <http://www.w3.org/2002/07/owl#>
    prefix oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
    prefix UBERON: <http://purl.obolibrary.org/obo/UBERON_>
    prefix CL: <http://purl.obolibrary.org/obo/CL_>
    prefix GO: <http://purl.obolibrary.org/obo/GO_>
    prefix CHEBI: <http://purl.obolibrary.org/obo/CHEBI_>
    prefix MONDO: <http://purl.obolibrary.org/obo/MONDO_>
    prefix HP: <http://purl.obolibrary.org/obo/HP_>
    prefix NCIT: <http://purl.obolibrary.org/obo/NCIT_>
    SELECT ?cls ?pred ?val
    from <http://reasoner.renci.org/ontology>
    WHERE {
        ?cls ?pred ?val ;
             a owl:Class .
        FILTER (
            ?pred = oboInOwl:hasRelatedSynonym ||
            ?pred = oboInOwl:hasNarrowSynonym ||
            ?pred = oboInOwl:hasBroadSynonym ||
            ?pred = oboInOwl:hasExactSynonym
        )
    }
    """
    rr = self.triplestore.query_template(
        inputs={},
        outputs=['cls', 'pred', 'val'],
        template_text=text
    )
    results = []
    for x in rr:
        y = (Text.opt_to_curie(x['cls']), x['pred'], x['val'])
        results.append(y)
    return results
def get_all_labels(self):
    text = """
    prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    prefix UBERON: <http://purl.obolibrary.org/obo/UBERON_>
    prefix CL: <http://purl.obolibrary.org/obo/CL_>
    prefix GO: <http://purl.obolibrary.org/obo/GO_>
    prefix CHEBI: <http://purl.obolibrary.org/obo/CHEBI_>
    prefix MONDO: <http://purl.obolibrary.org/obo/MONDO_>
    prefix HP: <http://purl.obolibrary.org/obo/HP_>
    prefix NCIT: <http://purl.obolibrary.org/obo/NCIT_>
    select distinct ?thing ?label
    from <http://reasoner.renci.org/ontology>
    where {
        ?thing rdfs:label ?label .
    }
    """
    rr = self.triplestore.query_template(
        inputs={},
        outputs=['thing', 'label'],
        template_text=text
    )
    results = []
    for x in rr:
        y = {}
        y['iri'] = Text.opt_to_curie(x['thing'])
        y['label'] = x['label']
        results.append(y)
    return results
def makecountset(j):
    eids = [Text.get_curie(x['identifier']) for x in j['equivalent_identifiers']]
    pcounts = defaultdict(int)
    for p in eids:
        pcounts[p] += 1
    return frozenset(pcounts.items())
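# Sketch of makecountset's behavior, assuming Text.get_curie returns the prefix of a
# curie (identifiers below are illustrative):
def _makecountset_example():
    j = {'equivalent_identifiers': [{'identifier': 'UMLS:C0011849'},
                                    {'identifier': 'UMLS:C0011860'},
                                    {'identifier': 'MONDO:0005148'}]}
    #Two UMLS ids and one MONDO id yield a hashable prefix-count signature.
    assert makecountset(j) == frozenset({('UMLS', 2), ('MONDO', 1)})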
def get_synonyms(self, node):
    node_synonyms = set()
    for ident in node['equivalent_identifiers']:
        thisid = ident['identifier']
        pref = Text.get_curie(thisid)
        if pref not in self.synonyms:
            self.load_synonyms(pref)
        node_synonyms.update(self.synonyms[pref][thisid])
    return node_synonyms
def write_obo_ids(irisandtypes, outfile, order, exclude=[]):
    uber = UberGraph()
    iris_to_types = defaultdict(set)
    for iri, ntype in irisandtypes:
        uberres = uber.get_subclasses_of(iri)
        for k in uberres:
            iris_to_types[k['descendent']].add(ntype)
    excludes = []
    for excluded_iri in exclude:
        excludes += uber.get_subclasses_of(excluded_iri)
    excluded_iris = set([k['descendent'] for k in excludes])
    #Note: 'iri' here is the last value left over from the loop above, so this
    # assumes every iri in irisandtypes shares the same prefix.
    prefix = Text.get_curie(iri)
    with open(outfile, 'w') as idfile:
        for kd, typeset in iris_to_types.items():
            if kd not in excluded_iris and kd.startswith(prefix):
                l = list(typeset)
                l.sort(key=lambda k: order.index(k))
                idfile.write(f'{kd}\t{l[0]}\n')
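# Usage sketch (roots, types, and type order below are illustrative); all roots are
# assumed to share one prefix, and the type earliest in `order` wins for each id:
#
#   write_obo_ids([('UBERON:0001062', 'anatomical_entity'),
#                  ('UBERON:0000105', 'life_cycle_stage')],
#                 'uberon_ids.txt',
#                 order=['life_cycle_stage', 'anatomical_entity'])
#
# writes one "<curie>\t<winning type>" line per non-excluded subclass.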
def apply_labels(self, input_identifiers, labels):
    #Originally we needed to clean up the identifier lists, because there would be both
    # LabeledIDs and string ids and we had to reconcile them.
    #But now we only allow regular ids in the list, and we need to turn some of them into
    # LabeledIDs for output.
    labeled_list = []
    for iid in input_identifiers:
        if isinstance(iid, LabeledID):
            print("LabeledIDs don't belong here; pass in labels separately", iid)
            exit()
        if iid in labels:
            labeled_list.append(LabeledID(identifier=iid, label=labels[iid]))
        else:
            prefix = Text.get_prefix(iid)
            if prefix not in self.extra_labels:
                self.load_extra_labels(prefix)
            if iid in self.extra_labels[prefix]:
                labeled_list.append(LabeledID(identifier=iid, label=self.extra_labels[prefix][iid]))
            else:
                labeled_list.append(iid)
    return labeled_list
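# Sketch (identifiers and label illustrative): an id found in `labels` comes back as a
# LabeledID; an id with no label in `labels` or the extra-label tables passes through
# as a bare string:
#   apply_labels(['MONDO:0005737', 'DOID:4325'], {'MONDO:0005737': 'Ebola hemorrhagic fever'})
#   -> [LabeledID(identifier='MONDO:0005737', label='Ebola hemorrhagic fever'), 'DOID:4325']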
def create_node(self, input_identifiers, node_type, labels={}):
    #This is where we will normalize, i.e. choose the best id, and add types in accord with BL.
    #We should also include provenance and version information for the node set build.
    ancestors = self.get_ancestors(node_type)
    #ancestors.reverse()
    prefixes = self.get_prefixes(node_type)
    if len(input_identifiers) == 0:
        return None
    if len(input_identifiers) > 1000:
        print('this seems like a lot')
        print(len(input_identifiers))
    cleaned = self.apply_labels(input_identifiers, labels)
    try:
        idmap = defaultdict(list)
        for i in list(cleaned):
            idmap[Text.get_curie(i).upper()].append(i)
    except AttributeError:
        print('something very bad')
        print(input_identifiers)
        print(len(input_identifiers))
        for i in list(input_identifiers):
            print(i)
            print(type(i))
            print(Text.get_curie(i))
            print(Text.get_curie(i).upper())
        exit()
    identifiers = []
    accepted_ids = set()
    #Converting identifiers from LabeledID to dicts.
    #In order to be consistent from run to run, we need to worry about the
    # case where e.g. there are 2 UMLS ids and UMLS is the preferred prefix.
    # We choose the canonical id here just by sorting the N candidate identifiers.
    for p in prefixes:
        pupper = p.upper()
        if pupper in idmap:
            newids = []
            for v in idmap[pupper]:
                newid = Text.recurie(v, p)
                jid = self.make_json_id(newid)
                newids.append((jid['identifier'], jid))
                accepted_ids.add(v)
            newids.sort()
            identifiers += [nid[1] for nid in newids]
    #Warn if we have prefixes that we're ignoring
    for k, vals in idmap.items():
        for v in vals:
            if v not in accepted_ids and (k, node_type) not in self.ignored_prefixes:
                print(f'Ignoring prefix {k} for type {node_type}, identifier {v}')
                self.ignored_prefixes.add((k, node_type))
    if len(identifiers) == 0:
        return None
    best_id = identifiers[0]['identifier']
    #identifiers is in preferred order, so choose the first non-empty label to be the node label
    labels = list(filter(lambda x: len(x) > 0, [l['label'] for l in identifiers if 'label' in l]))
    label = None
    if len(labels) > 0:
        label = labels[0]
    node = {
        'id': {'identifier': best_id},
        'equivalent_identifiers': identifiers,
        'type': ancestors
    }
    if label is not None:
        node['id']['label'] = label
    return node
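# Shape note (field values illustrative): a successful call returns
#   {'id': {'identifier': 'MONDO:0005737', 'label': 'Ebola hemorrhagic fever'},
#    'equivalent_identifiers': [{'identifier': 'MONDO:0005737', 'label': '...'}, ...],
#    'type': [...]}   # ancestors of node_type
# where the first entry of equivalent_identifiers (preferred prefix, sorted) supplies the id.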
def norm(x, op):
    #get_curie returns the uppercased prefix
    pref = Text.get_curie(x)
    if pref in op:
        return Text.recurie(x, op[pref])
    return x
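# Sketch: op maps an uppercased prefix to its replacement.  The remap below is
# hypothetical and assumes Text.recurie('MSH:D009369', 'MESH') returns 'MESH:D009369'.
def _norm_example():
    op = {'MSH': 'MESH'}
    assert norm('MSH:D009369', op) == 'MESH:D009369'  # prefix remapped
    assert norm('CHEBI:15377', op) == 'CHEBI:15377'   # prefix not in op: unchanged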
def glom(conc_set, newgroups, unique_prefixes=['INCHIKEY'], pref='HP', close={}):
    """We want to construct sets containing equivalent identifiers.  conc_set is a dictionary where
    the values are these equivalent identifier sets and the keys are all of the elements in the set.
    For each element in a set, there is a key in the dictionary that points to the set.  newgroups is
    an iterable of new equivalence groups (expressed as sets, tuples, or lists) with which we want to
    update conc_set."""
    n = 0
    bad = 0
    shit_prefixes = set(['KEGG', 'PUBCHEM'])
    test_id = 'xUBERON:0002262'
    excised = set()
    for xgroup in newgroups:
        if isinstance(xgroup, frozenset):
            group = set(xgroup)
        else:
            group = xgroup
        #As of now, xgroup should never be more than two things
        if len(xgroup) > 2:
            print(xgroup)
            print('nope nope nope')
            exit()
        n += 1
        if test_id in group:
            print('higroup', group)
        #Find all the equivalence sets that already correspond to any of the identifiers in the new set.
        existing_sets_w_x = [(conc_set[x], x) for x in group if x in conc_set]
        #All of these sets are now going to be combined through the equivalence of our new set.
        existing_sets = [es[0] for es in existing_sets_w_x]
        x = [es[1] for es in existing_sets_w_x]
        newset = set().union(*existing_sets)
        #Put all the new stuff in it.  Do it element-wise, cause we don't know the type of the new group.
        for element in group:
            newset.add(element)
        if test_id in newset:
            print('hiset', newset)
            print('input_set', group)
            print('esets')
            for eset in existing_sets:
                print(' ', eset, group.intersection(eset))
        for check_element in newset:
            prefix = check_element.split(':')[0]
            if prefix in shit_prefixes:
                print(prefix)
                print(check_element)
                raise Exception('garbage')
        #Make sure we didn't combine anything we want to keep separate.
        setok = True
        if test_id in group:
            print('setok?', setok)
        for up in unique_prefixes:
            if test_id in group:
                print('up?', up)
            idents = [e if type(e) == str else e.identifier for e in newset]
            if len(set([e for e in idents if (e.split(':')[0] == up)])) > 1:
                bad += 1
                setok = False
                wrote = set()
                for s in existing_sets:
                    fs = frozenset(s)
                    wrote.add(fs)
                for gel in group:
                    if Text.get_curie(gel) == pref:
                        killer = gel
                #for preset in wrote:
                #    print(f'{killer}\t{set(group).intersection(preset)}\t{preset}\n')
                #print('------------')
        if not setok:
            #Our new group created a new set that merged stuff we didn't want to merge.
            #Previously we did a lot of fooling around at this point.  But now we're just going to
            # say: we have a pairwise concordance, which can at most link two groups, so just don't
            # link them.  In other words, we simply ignore this concordance.
            continue
        #Dead code below: an earlier approach that figured out the culprit identifier(s) and excised them.
        #Let's figure out the culprit(s) and excise them
        #counts = defaultdict(int)
        #for x in group:
        #    counts[x] += 1
        ##The way existing_sets was created means that the same set can be in there twice, and we
        ## don't want to count things that way
        #unique_existing_sets = []
        #for ex in existing_sets:
        #    u = True
        #    for q in unique_existing_sets:
        #        if ex == q:
        #            u = False
        #    if u:
        #        unique_existing_sets.append(ex)
        #for es in unique_existing_sets:
        #    for y in es:
        #        counts[y] += 1
        #bads = [x for x, y in counts.items() if y > 1]
        ##Now we know which identifiers are causing trouble.
        ##We don't want to completely throw them out, but we can't allow them to gum things up.
        ##So we need to first remove them from all the sets, then put them in their own set.
        ##It might be good to track this somehow?
        #excised.update(bads)
        #for b in bads:
        #    if b in group:
        #        group.remove(b)
        #    for exset in existing_sets:
        #        if b in exset:
        #            exset.remove(b)
        #    conc_set[b] = set([b])
        #for x in group:
        #    conc_set[x] = group
        #continue
        #Now check the 'close' dictionary to see if we've accidentally promoted a close match
        # to an exact match.
        setok = True
        for cpref, closedict in close.items():
            idents = set([e if type(e) == str else e.identifier for e in newset])
            prefidents = [e for e in idents if e.startswith(cpref)]
            for pident in prefidents:
                for cd in closedict[pident]:
                    if cd in newset:
                        setok = False
            if len(prefidents) == 0:
                continue
        if not setok:
            continue
        #Now make all the elements point to this new set:
        for element in newset:
            conc_set[element] = newset
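# A minimal worked example of glom (the identifiers below are illustrative, not real
# curies).  Starting from an empty concordance, two pairwise equivalences sharing 'B:1'
# merge into a single set; a pair that would put two identifiers with a unique prefix
# (INCHIKEY by default) into one set would instead be ignored.
def _glom_example():
    conc = {}
    glom(conc, [('A:1', 'B:1'), ('B:1', 'C:1')], unique_prefixes=[])
    #All three keys now point at the same merged set.
    assert conc['A:1'] is conc['B:1'] is conc['C:1']
    assert conc['A:1'] == {'A:1', 'B:1', 'C:1'}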