Code example #1
def build_sets(iri, concordfiles, set_type, ignore_list=None, other_prefixes=None, hop_ontologies=False):
    """Given an IRI, create a list of sets.  Each set is a set of equivalent LabeledIDs, and there
    is a set for each subclass of the input IRI.  Write these lists to concord files, indexed by prefix."""
    # Avoid shared mutable default arguments
    ignore_list = [] if ignore_list is None else ignore_list
    other_prefixes = {} if other_prefixes is None else other_prefixes
    prefix = Text.get_curie(iri)
    types2relations={'xref':'xref', 'exact': 'oio:exactMatch', 'close': 'oio:closeMatch'}
    if set_type not in types2relations:
        return
    uber = UberGraph()
    if set_type == 'xref':
        uberres = uber.get_subclasses_and_xrefs(iri)
    elif set_type == 'exact':
        uberres = uber.get_subclasses_and_exacts(iri)
    elif set_type == 'close':
        uberres = uber.get_subclasses_and_close(iri)
    for k,v in uberres.items():
        if not hop_ontologies:
            subclass_prefix = Text.get_curie(k)
            if subclass_prefix != prefix:
                continue
        v = set([ norm(x,other_prefixes) for x in v ])
        for x in v:
            if Text.get_curie(x) not in ignore_list:
                p = Text.get_curie(k)
                if p in concordfiles:
                    concordfiles[p].write(f'{k}\t{types2relations[set_type]}\t{x}\n')
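A minimal usage sketch (file names, prefixes, and the root IRI below are illustrative; build_sets, Text, and UberGraph come from the surrounding Babel code):

# Hypothetical: write xref concordances for MONDO subclasses, one output file per prefix
concords = {p: open(f'{p}_concord.txt', 'w') for p in ['MONDO', 'DOID']}
try:
    build_sets('MONDO:0000001', concords, 'xref',
               ignore_list=['ICD10'], other_prefixes={'MSH': 'MESH'})
finally:
    for f in concords.values():
        f.close()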
Code example #2
 def get_subclasses_and_xrefs(self,iri):
     text="""
     prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
     prefix UBERON: <http://purl.obolibrary.org/obo/UBERON_>
     prefix CL: <http://purl.obolibrary.org/obo/CL_>
     prefix GO: <http://purl.obolibrary.org/obo/GO_>
     prefix CHEBI: <http://purl.obolibrary.org/obo/CHEBI_>
     prefix MONDO: <http://purl.obolibrary.org/obo/MONDO_>
     prefix HP: <http://purl.obolibrary.org/obo/HP_>
     prefix NCIT: <http://purl.obolibrary.org/obo/NCIT_>
     prefix PR: <http://purl.obolibrary.org/obo/PR_>
     select distinct ?descendent ?xref 
     from <http://reasoner.renci.org/nonredundant>
     from <http://reasoner.renci.org/ontology>
     where {
       graph <http://reasoner.renci.org/ontology/closure> {
             ?descendent rdfs:subClassOf $sourcedefclass .
       }  
       ?descendent <http://www.geneontology.org/formats/oboInOwl#hasDbXref> ?xref .
     }
     """
     resultmap = self.triplestore.query_template(
         inputs={'sourcedefclass': iri},
         outputs=['descendent', 'xref'],
         template_text=text
     )
     results = defaultdict(set)
     for row in resultmap:
         dcurie = Text.opt_to_curie(row['descendent'])
         #Sometimes we're getting back just strings that aren't curies, skip those (but complain)
         if ':' not in row['xref']:
             print(f'Bad XREF from {row["descendent"]} to {row["xref"]}')
             continue
         results[dcurie].add(Text.opt_to_curie(row['xref']))
     return results
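The return value maps each subclass CURIE to the set of its xref CURIEs; a sketch of the expected shape (identifiers and values illustrative):

uber = UberGraph()
xrefs = uber.get_subclasses_and_xrefs('UBERON:0001062')
# e.g. {'UBERON:0000467': {'FMA:7149', ...}, ...}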
Code example #3
def get_prefixes(idlist):
    prefs = defaultdict(list)
    for ident in idlist:
        if isinstance(ident, LabeledID):
            print('nonono')
            exit()
            prefs.add(Text.get_curie(ident.identifier))
        else:
            prefs[Text.get_curie(ident)].append(ident)
    return prefs
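A quick sketch of the grouping this produces, assuming Text.get_curie returns the prefix portion of a CURIE (identifiers illustrative):

ids = ['MONDO:0005737', 'MONDO:0004979', 'HP:0000118']
get_prefixes(ids)
# -> {'MONDO': ['MONDO:0005737', 'MONDO:0004979'], 'HP': ['HP:0000118']}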
Code example #4
 def get_subclasses_and_exacts(self,iri):
     text=lambda predicate: f"""
     prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
     prefix UBERON: <http://purl.obolibrary.org/obo/UBERON_>
     prefix CL: <http://purl.obolibrary.org/obo/CL_>
     prefix GO: <http://purl.obolibrary.org/obo/GO_>
     prefix CHEBI: <http://purl.obolibrary.org/obo/CHEBI_>
     prefix MONDO: <http://purl.obolibrary.org/obo/MONDO_>
     prefix HP: <http://purl.obolibrary.org/obo/HP_>
     prefix EFO: <http://www.ebi.ac.uk/efo/EFO_>
     prefix NCIT: <http://purl.obolibrary.org/obo/NCIT_>
     PREFIX EXACT_MATCH: <http://www.w3.org/2004/02/skos/core#exactMatch>
     PREFIX M_EXACT_MATCH: <http://purl.obolibrary.org/obo/mondo#exactMatch>
     PREFIX EQUIVALENT_CLASS: <http://www.w3.org/2002/07/owl#equivalentClass>
     PREFIX ID: <http://www.geneontology.org/formats/oboInOwl#id>
     SELECT DISTINCT ?descendent ?match
     FROM <http://reasoner.renci.org/ontology>
     WHERE {{
         graph <http://reasoner.renci.org/ontology/closure> {{
             ?descendent rdfs:subClassOf $identifier .
         }}
         OPTIONAL {{
             ?descendent {predicate} ?match.      
         }} 
     }}
     """
     resultmap = self.triplestore.query_template(
            template_text=text('EXACT_MATCH:'),
            inputs={
                'identifier': iri
            }, outputs=[ 'descendent', 'match' ] )
     resultmap += self.triplestore.query_template(
            template_text=text('M_EXACT_MATCH:'),
            inputs={
                'identifier': iri
            }, outputs=[ 'descendent', 'match' ] )
     resultmap += self.triplestore.query_template(
             template_text=text('EQUIVALENT_CLASS:'),
             inputs={
                 'identifier': iri
             }, outputs=[ 'descendent', 'match'] )
     results = defaultdict(list)
     for row in resultmap:
         desc=Text.opt_to_curie(row['descendent'])
         if row['match'] is None:
             # OPTIONAL match: keep the descendant as a key even when nothing matched
             results[desc] += []
         else:
             results[desc].append(Text.opt_to_curie(row['match']))
     #Sometimes, if there are no exact_matches, we'll get some kind of blank node id
     # like 't19830198'. Want to filter those out.
     for k,v in results.items():
         results[k] = list(filter(lambda x: ':' in x, v))
     return results
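Note the design: the same template is run three times, once per equivalence predicate (skos:exactMatch, the MONDO-specific exactMatch, and owl:equivalentClass), and the results concatenated. Because the match clause is OPTIONAL, a subclass with no matches still appears in the result map with an empty list, which the results[desc] += [] branch preserves.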
Code example #5
 def get_subclasses_of(self,iri):
     text="""
     prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
     prefix UBERON: <http://purl.obolibrary.org/obo/UBERON_>
     prefix CL: <http://purl.obolibrary.org/obo/CL_>
     prefix GO: <http://purl.obolibrary.org/obo/GO_>
     prefix CHEBI: <http://purl.obolibrary.org/obo/CHEBI_>
     prefix MONDO: <http://purl.obolibrary.org/obo/MONDO_>
     prefix HP: <http://purl.obolibrary.org/obo/HP_>
     prefix NCIT: <http://purl.obolibrary.org/obo/NCIT_>
     prefix PR: <http://purl.obolibrary.org/obo/PR_>
     prefix EFO: <http://www.ebi.ac.uk/efo/EFO_>
     select distinct ?descendent ?descendentLabel
     from <http://reasoner.renci.org/ontology>
     where {
         graph <http://reasoner.renci.org/ontology/closure> {
             ?descendent rdfs:subClassOf $sourcedefclass .
         }
         OPTIONAL {
             ?descendent rdfs:label ?descendentLabel .
         }
     }
     """
     rr = self.triplestore.query_template(
         inputs={'sourcedefclass': iri},
         outputs=['descendent', 'descendentLabel'],
         template_text=text
     )
     results = []
     for x in rr:
         y = {}
         y['descendent'] = Text.opt_to_curie(x['descendent'])
         y['descendentLabel'] = x['descendentLabel']
         results.append(y)
     return results
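Each result row is reshaped into a small dict; a sketch of the output (identifiers and label illustrative):

uber = UberGraph()
uber.get_subclasses_of('UBERON:0001062')
# -> [{'descendent': 'UBERON:0000467', 'descendentLabel': 'anatomical system'}, ...]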
Code example #6
 def get_all_synonyms(self):
     text = """
             prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
             prefix owl: <http://www.w3.org/2002/07/owl#>
             prefix oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
             prefix UBERON: <http://purl.obolibrary.org/obo/UBERON_>
             prefix CL: <http://purl.obolibrary.org/obo/CL_>
             prefix GO: <http://purl.obolibrary.org/obo/GO_>
             prefix CHEBI: <http://purl.obolibrary.org/obo/CHEBI_>
             prefix MONDO: <http://purl.obolibrary.org/obo/MONDO_>
             prefix HP: <http://purl.obolibrary.org/obo/HP_>
             prefix NCIT: <http://purl.obolibrary.org/obo/NCIT_>
             SELECT ?cls ?pred ?val
             from <http://reasoner.renci.org/ontology>
             WHERE 
             { ?cls ?pred ?val ;
                 a owl:Class .
                 FILTER (
                 ?pred = oboInOwl:hasRelatedSynonym ||
                 ?pred = oboInOwl:hasNarrowSynonym ||
                 ?pred = oboInOwl:hasBroadSynonym ||
                 ?pred = oboInOwl:hasExactSynonym
                 )
             }
             """
     rr = self.triplestore.query_template(
         inputs={},
         outputs=['cls', 'pred', 'val'],
         template_text=text
     )
     results = []
     for x in rr:
         y = ( Text.opt_to_curie(x['cls']), x['pred'], x['val'])
         results.append(y)
     return results
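Note that only cls is converted to a CURIE; pred comes back as the full predicate IRI and val as the raw synonym string, so each result tuple mixes CURIE and IRI forms.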
Code example #7
 def get_all_labels(self):
     text = """
             prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
             prefix UBERON: <http://purl.obolibrary.org/obo/UBERON_>
             prefix CL: <http://purl.obolibrary.org/obo/CL_>
             prefix GO: <http://purl.obolibrary.org/obo/GO_>
             prefix CHEBI: <http://purl.obolibrary.org/obo/CHEBI_>
             prefix MONDO: <http://purl.obolibrary.org/obo/MONDO_>
             prefix HP: <http://purl.obolibrary.org/obo/HP_>
             prefix NCIT: <http://purl.obolibrary.org/obo/NCIT_>
             select distinct ?thing ?label
             from <http://reasoner.renci.org/ontology>
             where {
                 ?thing rdfs:label ?label .
             }
             """
     rr = self.triplestore.query_template(
         inputs={},
         outputs=['thing', 'label'],
         template_text=text
     )
     results = []
     for x in rr:
         y = {}
         y['iri'] = Text.opt_to_curie(x['thing'])
         y['label'] = x['label']
         results.append(y)
     return results
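Despite the key name, 'iri' holds the CURIE form produced by Text.opt_to_curie, not the original IRI.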
Code example #8
def makecountset(j):
    eids = [
        Text.get_curie(x['identifier']) for x in j['equivalent_identifiers']
    ]
    pcounts = defaultdict(int)
    for p in eids:
        pcounts[p] += 1
    return frozenset(pcounts.items())
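For example (identifiers illustrative; assumes Text.get_curie returns the CURIE prefix):

node = {'equivalent_identifiers': [
    {'identifier': 'MONDO:0005737'},
    {'identifier': 'UMLS:C0282687'},
    {'identifier': 'UMLS:C0019338'},
]}
makecountset(node)
# -> frozenset({('MONDO', 1), ('UMLS', 2)})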
Code example #9
 def get_synonyms(self, node):
     node_synonyms = set()
     for ident in node['equivalent_identifiers']:
         thisid = ident['identifier']
         pref = Text.get_curie(thisid)
         if pref not in self.synonyms:
             self.load_synonyms(pref)
         node_synonyms.update(self.synonyms[pref][thisid])
     return node_synonyms
Code example #10
File: obo.py  Project: TranslatorSRI/Babel
def write_obo_ids(irisandtypes, outfile, order, exclude=None):
    exclude = [] if exclude is None else exclude
    uber = UberGraph()
    iris_to_types = defaultdict(set)
    for iri, ntype in irisandtypes:
        uberres = uber.get_subclasses_of(iri)
        for k in uberres:
            iris_to_types[k['descendent']].add(ntype)
    excludes = []
    for excluded_iri in exclude:
        excludes += uber.get_subclasses_of(excluded_iri)
    excluded_iris = set([k['descendent'] for k in excludes])
    # 'iri' here is left over from the loop above; all input IRIs are assumed to share one prefix
    prefix = Text.get_curie(iri)
    with open(outfile, 'w') as idfile:
        for kd, typeset in iris_to_types.items():
            if kd not in excluded_iris and kd.startswith(prefix):
                types = sorted(typeset, key=lambda k: order.index(k))
                idfile.write(f'{kd}\t{types[0]}\n')
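A hypothetical invocation (IRIs, type names, and file name are illustrative):

write_obo_ids([('UBERON:0001062', 'anatomical_entity')],
              'uberon_ids.txt',
              order=['anatomical_entity'],
              exclude=['UBERON:0000467'])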
Code example #11
 def apply_labels(self, input_identifiers, labels):
     #Originally we needed to clean up the identifier lists, because there would be both LabeledIDs and
     # string ids and we had to reconcile them.
     # Now we only allow regular ids in the list, and we need to turn some of them into LabeledIDs for output.
     labeled_list = []
     for iid in input_identifiers:
         if isinstance(iid, LabeledID):
             print('LabeledIDs do not belong here; pass in labels separately',
                   iid)
             exit()
         if iid in labels:
             labeled_list.append(
                 LabeledID(identifier=iid, label=labels[iid]))
         else:
             prefix = Text.get_prefix(iid)
             if prefix not in self.extra_labels:
                 self.load_extra_labels(prefix)
             if iid in self.extra_labels[prefix]:
                 labeled_list.append(
                     LabeledID(identifier=iid,
                               label=self.extra_labels[prefix][iid]))
             else:
                 labeled_list.append(iid)
     return labeled_list
Code example #12
    def create_node(self, input_identifiers, node_type, labels={}):
        #This is where we will normalize, i.e. choose the best id, and add types in accord with BL.
        #we should also include provenance and version information for the node set build.
        ancestors = self.get_ancestors(node_type)
        #ancestors.reverse()
        prefixes = self.get_prefixes(node_type)
        if len(input_identifiers) == 0:
            return None
        if len(input_identifiers) > 1000:
            print('this seems like a lot')
            print(len(input_identifiers))
        cleaned = self.apply_labels(input_identifiers, labels)
        try:
            idmap = defaultdict(list)
            for i in list(cleaned):
                idmap[Text.get_curie(i).upper()].append(i)
        except AttributeError:
            print('something very bad')
            print(input_identifiers)
            print(len(input_identifiers))
            for i in list(input_identifiers):
                print(i)
                print(type(i))
                print(Text.get_curie(i))
                print(Text.get_curie(i).upper())
            exit()
        identifiers = []
        accepted_ids = set()
        #Converting identifiers from LabeledID to dicts
        #In order to be consistent from run to run, we need to worry about the
        # case where e.g. there are 2 UMLS ids and UMLS is the preferred prefix.
        # We choose the canonical ID here just by sorting them.
        for p in prefixes:
            pupper = p.upper()
            if pupper in idmap:
                newids = []
                for v in idmap[pupper]:
                    newid = Text.recurie(v, p)
                    jid = self.make_json_id(newid)
                    newids.append((jid['identifier'], jid))
                    accepted_ids.add(v)
                newids.sort()
                identifiers += [nid[1] for nid in newids]
        #Warn if we have prefixes that we're ignoring
        for k, vals in idmap.items():
            for v in vals:
                if v not in accepted_ids and (
                        k, node_type) not in self.ignored_prefixes:
                    print(
                        f'Ignoring prefix {k} for type {node_type}, identifier {v}'
                    )
                    self.ignored_prefixes.add((k, node_type))
        if len(identifiers) == 0:
            return None
        best_id = identifiers[0]['identifier']
        # identifiers is in preferred order, so choose the first non-empty label to be the node label
        found_labels = list(
            filter(lambda x: len(x) > 0,
                   [l['label'] for l in identifiers if 'label' in l]))
        label = None
        if len(found_labels) > 0:
            label = found_labels[0]

        node = {
            'id': {
                'identifier': best_id,
            },
            'equivalent_identifiers': identifiers,
            'type': ancestors
        }
        if label is not None:
            node['id']['label'] = label
        return node
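The returned node is a plain dict; a sketch of its shape (identifier, label, and type strings are illustrative, since the exact type values depend on get_ancestors):

node = {
    'id': {'identifier': 'MONDO:0005737', 'label': 'example label'},
    'equivalent_identifiers': [
        {'identifier': 'MONDO:0005737', 'label': 'example label'},
        {'identifier': 'UMLS:C0282687'},
    ],
    'type': ['disease', 'named_thing'],  # hypothetical ancestor list
}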
Code example #13
def norm(x, op):
    # Text.get_curie returns the uppercased prefix
    pref = Text.get_curie(x)
    if pref in op:
        return Text.recurie(x, op[pref])
    return x
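For instance, with a remapping table (prefix spellings hypothetical):

op = {'MSH': 'MESH'}
norm('MSH:D001241', op)   # re-prefixed via Text.recurie, e.g. 'MESH:D001241'
norm('HP:0000118', op)    # prefix not in op, returned unchanged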
Code example #14
def glom(conc_set,
         newgroups,
         unique_prefixes=['INCHIKEY'],
         pref='HP',
         close={}):
    """We want to construct sets containing equivalent identifiers.
    conc_set is a dictionary where the values are these equivalent identifier sets and
    the keys are all of the elements in the set.   For each element in a set, there is a key
    in the dictionary that points to the set.
    newgroups is an iterable that of new equivalence groups (expressed as sets,tuples,or lists)
    with which we want to update conc_set."""
    n = 0
    bad = 0
    shit_prefixes = set(['KEGG', 'PUBCHEM'])
    test_id = 'xUBERON:0002262'  # debugging hook: trace any group containing this id
    excised = set()
    for xgroup in newgroups:
        if isinstance(xgroup, frozenset):
            group = set(xgroup)
        else:
            group = xgroup
        #As of now, xgroup should never be more than two things
        if len(xgroup) > 2:
            print(xgroup)
            print('nope nope nope')
            exit()
        n += 1
        if test_id in group:
            print('higroup', group)
        #Find all the equivalence sets that already correspond to any of the identifiers in the new set.
        existing_sets_w_x = [(conc_set[x], x) for x in group if x in conc_set]
        #All of these sets are now going to be combined through the equivalence of our new set.
        existing_sets = [es[0] for es in existing_sets_w_x]
        x = [es[1] for es in existing_sets_w_x]
        newset = set().union(*existing_sets)
        #put all the new stuff in it.  Do it element-wise, cause we don't know the type of the new group
        for element in group:
            newset.add(element)
        if test_id in newset:
            print('hiset', newset)
            print('input_set', group)
            print('esets')
            for eset in existing_sets:
                print(' ', eset, group.intersection(eset))
        for check_element in newset:
            prefix = check_element.split(':')[0]
            if prefix in shit_prefixes:
                print(prefix)
                print(check_element)
                raise Exception('garbage')
        #make sure we didn't combine anything we want to keep separate
        setok = True
        if test_id in group:
            print('setok?', setok)
        for up in unique_prefixes:
            if test_id in group:
                print('up?', up)
            idents = [e if type(e) == str else e.identifier for e in newset]
            if len(set([e for e in idents if (e.split(':')[0] == up)])) > 1:
                bad += 1
                setok = False
                wrote = set()
                for s in existing_sets:
                    fs = frozenset(s)
                    wrote.add(fs)
                for gel in group:
                    if Text.get_curie(gel) == pref:
                        killer = gel
                #for preset in wrote:
                #    print(f'{killer}\t{set(group).intersection(preset)}\t{preset}\n')
                #print('------------')
        if not setok:
            #Our new group created a new set that merged stuff we didn't want to merge.
            #Previously we did a lot of fooling around at this point.  But now we're just going to say, I have a
            # pairwise concordance.  That can at most link two groups.  just don't link them. In other words,
            # we are simply ignoring this concordance.
            continue
            #Let's figure out the culprit(s) and excise them
            #counts = defaultdict(int)
            #for x in group:
            #    counts[x] += 1
            ##The way existing_sets was created means that the same set can be in there twice, and we don't want to
            # count things that way
            #unique_existing_sets = []
            #for ex in existing_sets:
            #    u = True
            #    for q in unique_existing_sets:
            #        if ex == q:
            #            u = False
            #    if u:
            #        unique_existing_sets.append(ex)
            #for es in unique_existing_sets:
            #    for y in es:
            #        counts[y] += 1
            #bads = [ x for x,y in counts.items() if y > 1 ]
            #now we know which identifiers are causing trouble.
            #We don't want to completely throw them out, but we can't allow them to gum things up.
            #So, we need to first remove them from all the sets, then we need to put them in their own set
            #It might be good to track this somehow?
            #excised.update(bads)
            #for b in bads:
            #    if b in group:
            #        group.remove(b)
            #    for exset in existing_sets:
            #        if b in exset:
            #            exset.remove(b)
            #    conc_set[b] = set([b])
            #for x in group:
            #    conc_set[x] = group
            #continue
        #Now check the 'close' dictionary to see if we've accidentally gotten to a close match becoming an exact match
        setok = True
        for cpref, closedict in close.items():
            idents = set(
                [e if type(e) == str else e.identifier for e in newset])
            prefidents = [e for e in idents if e.startswith(cpref)]
            if len(prefidents) == 0:
                continue
            for pident in prefidents:
                for cd in closedict[pident]:
                    if cd in newset:
                        setok = False
        if not setok:
            continue
        #Now make all the elements point to this new set:
        for element in newset:
            conc_set[element] = newset
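A usage sketch, feeding pairwise concordances into an empty conc_set (identifiers illustrative):

conc = {}
glom(conc, [('MONDO:0005737', 'DOID:4325'), ('DOID:4325', 'UMLS:C0282687')])
# All three identifiers now point at one merged set:
# conc['MONDO:0005737'] == {'MONDO:0005737', 'DOID:4325', 'UMLS:C0282687'}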