Exemple #1
0
def ncbigene_make():
    IDS_FILE = 'resources/gene-subset-ids.txt'
    with open(IDS_FILE, 'rt') as f:  # this came from neuroNER
        ids = [l.split(':')[1].strip() for l in f.readlines()]

    #url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?retmode=json&retmax=5000&db=gene&id='
    #for id_ in ids:
    #data = requests.get(url + id_).json()['result'][id_]
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    data = {
        'db': 'gene',
        'retmode': 'json',
        'retmax': 5000,
        'id': None,
    }
    chunks = []
    for i, idset in enumerate(chunk_list(ids, 100)):
        print(i, len(idset))
        data['id'] = ','.join(idset),
        resp = requests.post(url, data=data).json()
        chunks.append(resp)

    base = chunks[0]['result']
    uids = base['uids']
    for more in chunks[1:]:
        data = more['result']
        uids.extend(data['uids'])
        base.update(data)
    #base['uids'] = uids  # i mean... its just the keys
    base.pop('uids')

    ng = createOntology(
        'ncbigeneslim',
        'NIF NCBI Gene subset',
        makePrefixes('ILXREPLACE', 'ilx', 'OBOANN', 'NCBIGene', 'NCBITaxon',
                     'skos', 'owl'),
        'ncbigeneslim',
        'This subset is automatically generated from the NCBI Gene database on a subset of terms listed in %s.'
        % IDS_FILE,
        remote_base='http://ontology.neuinfo.org/NIF/')

    for k, v in base.items():
        #if k != 'uids':
        ncbi(v, ng)
    ng.write()
def chebi_imp():
    PREFIXES = makePrefixes('definition', 'replacedBy', 'hasRole', 'oboInOwl',
                            'CHEBI', 'owl', 'skos', 'oboInOwl')
    ug = makeGraph('utilgraph', prefixes=PREFIXES)
    with open('resources/chebi-subset-ids.txt', 'rt') as f:
        ids_raw = set((_.strip() for _ in f.readlines()))
        ids = sorted(set((ug.expand(_.strip()) for _ in ids_raw)))

    def check_chebis(g):
        a = []
        for id_ in ids:
            l = sorted(g.triples((id_, None, None)))
            ll = len(l)
            a.append(ll)
        return a

    def fixIons(g):
        # there are a series of atom/ion confusions that shall be dealt with, solution is to add 'iron' as a synonym to the charged form since that is what the biologists are usually referring to...
        ng = makeGraph('', graph=g, prefixes=makePrefixes('CHEBI'))
        # atom           ion
        None, 'CHEBI:29108'  # calcium is ok
        ng.replace_uriref('CHEBI:30145', 'CHEBI:49713')  # lithium
        ng.replace_uriref('CHEBI:18248', 'CHEBI:29033')  # iron
        ng.replace_uriref('CHEBI:26216', 'CHEBI:29103')  # potassium
        ng.replace_uriref('CHEBI:26708', 'CHEBI:29101')  # sodium
        None, 'CHEBI:29105'  # zinc is ok

    g = rdflib.Graph()
    cg = rdflib.Graph()
    cd = rdflib.Graph()
    chemg = rdflib.Graph()
    molg = rdflib.Graph()

    #g.parse('/home/tom/git/NIF-Ontology/ttl/generated/chebislim.ttl', format='turtle')
    cg.parse('/home/tom/git/NIF-Ontology/ttl/generated/chebislim.ttl',
             format='turtle')
    list(g.add(t) for t in cg)
    a1 = check_chebis(g)

    #g.parse('/home/tom/git/NIF-Ontology/ttl/generated/chebi-dead.ttl', format='turtle')
    cd.parse('/home/tom/git/NIF-Ontology/ttl/generated/chebi-dead.ttl',
             format='turtle')
    list(g.add(t) for t in cd)
    a2 = check_chebis(g)

    #g.parse('/home/tom/git/NIF-Ontology/ttl/NIF-Chemical.ttl', format='turtle')
    chemg.parse('/home/tom/git/NIF-Ontology/ttl/NIF-Chemical.ttl',
                format='turtle')
    chemgg = makeGraph('NIF-Chemical', graph=chemg)
    fixIons(chemg)
    list(g.add(t) for t in chemg)
    a3 = check_chebis(g)

    #g.parse('/home/tom/git/NIF-Ontology/ttl/NIF-Molecule.ttl', format='turtle')
    molg.parse('/home/tom/git/NIF-Ontology/ttl/NIF-Molecule.ttl',
               format='turtle')
    molgg = makeGraph('NIF-Molecule', graph=molg)
    fixIons(molg)
    list(g.add(t) for t in molg)
    a4 = check_chebis(g)

    replacedBy = ug.expand('replacedBy:')
    deads = {s: o for s, o in cd.subject_objects(replacedBy)}

    def switch_dead(g):
        ng = makeGraph('', graph=g, prefixes=makePrefixes('oboInOwl'))
        for f, r in deads.items():
            ng.replace_uriref(f, r)
            ng.add_node(r, 'oboInOwl:hasAlternateId',
                        rdflib.Literal(f, datatype=rdflib.XSD.string))
            g.remove(
                (r, replacedBy, r))  # in case the replaced by was already in

    switch_dead(g)
    switch_dead(cg)
    switch_dead(chemg)
    switch_dead(molg)

    def fixHasAltId(g):
        ng = makeGraph('',
                       graph=g,
                       prefixes=makePrefixes('oboInOwl', 'NIFCHEM', 'BIRNANN'))
        ng.replace_uriref('NIFCHEM:hasAlternativeId',
                          'oboInOwl:hasAlternativeId')
        ng.replace_uriref('BIRNANN:ChEBIid', 'oboInOwl:id')

    list(map(fixHasAltId, (g, cg, chemg)))

    def fixAltIdIsURIRef(g):
        hai = ug.expand('oboInOwl:hasAlternativeId')
        i = ug.expand('oboInOwl:id')
        makeGraph('', graph=g, prefixes=makePrefixes(
            'CHEBI'))  # amazlingly sometimes this is missing...

        def inner(s, p, o):
            if type(o) == rdflib.URIRef:
                qn = g.namespace_manager.qname(o)
                g.add((s, p, rdflib.Literal(qn, datatype=rdflib.XSD.string)))
                if 'ns' in qn:
                    print('WARNING UNKNOWN NAMESPACE BEING SHORTENED', str(o),
                          qn)
                g.remove((s, p, o))

        for s, o in g.subject_objects(hai):
            inner(s, hai, o)
        for s, o in g.subject_objects(i):
            inner(s, i, o)

    list(map(fixAltIdIsURIRef, (g, cg, chemg)))

    matches = [_ for _ in zip(a1, a2, a3, a4)]
    changed = [len(set(_)) != 1 for _ in matches]
    review = [(id_, m) for id_, changed, m in zip(ids, changed, matches)
              if changed and m[0]]
    # for reasons currently lost to implementation details this returns a list of empty lists if run from ipython
    wat_c = [
        set([(s, str(o.toPython()))
             for s, p, o in cg.triples((u, None, None))]) for u, _ in review
    ]
    wat_a = [
        set([(s, str(o.toPython())) for s, p, o in g.triples((u, None, None))])
        for u, _ in review
    ]
    wat_c_ = [
        set(cg.triples((u, None, None))) for u, _ in review
    ]  # for reasons currently lost to implementation details this returns a list of empty lists if run from ipython
    wat_a_ = [
        set(g.triples((u, None, None))) for u, _ in review
    ]  # for reasons currently lost to implementation details this returns a list of empty lists if run from ipython
    diff = [a - c for a, c in zip(wat_a, wat_c)]
    diff_ = [a - c for a, c in zip(wat_a_, wat_c_)]

    cb = createOntology(
        'chebi-bridge',
        'NIF ChEBI bridge',
        makePrefixes('CHEBI', 'BFO1SNAP', 'owl', 'skos', 'dc', 'hasRole',
                     'NIFCHEM', 'oboInOwl', 'NIFMOL', 'OBOANN', 'BIRNANN'),
        'chebibridge',
        ('This bridge file contains additional annotations'
         ' on top of CHEBI identifiers that were originally'
         ' included in NIF-Chemical or NIF-Molecule that have'
         ' not since been added to CHEBI upstream'),
        path='ttl/bridge/',
        #imports=('https://raw.githubusercontent.com/SciCrunch/NIF-Ontology/master/ttl/generated/chebislim.ttl',
        #'https://raw.githubusercontent.com/SciCrunch/NIF-Ontology/master/ttl/generated/chebi-dead.ttl'))
        imports=(
            'http://ontology.neuinfo.org/NIF/ttl/generated/chebislim.ttl',
            'http://ontology.neuinfo.org/NIF/ttl/generated/chebi-dead.ttl'))

    out = []
    for set_ in diff:
        for sub, string in sorted(set_):
            for t in g.triples((sub, None, None)):
                # please not that this process will do things like remove hasStreenName ectasy from CHEBI:1391 since chebislim has it listed as a synonym
                py = t[-1].toPython()
                if py == string and not py.startswith(
                        'ub'
                ):  # ignore restrictions... this is safe because nifmol and nifchem dont have any restrictions...
                    cb.add_recursive(t, g)
        cb.add_class(
            sub
        )  # only need to go at the end because sub is the same for each set

    def hasImplicitSuperclass(s, o):
        for super_ in cg.objects(s, rdflib.RDFS.subClassOf):
            if super_ == o:
                return True
            elif hasImplicitSuperclass(super_, o):
                return True

    # curation decisions after review (see outtc for full list)
    curatedOut = []

    def curateOut(*t):
        curatedOut.append(
            tuple(
                ug.expand(_) if type(_) is not rdflib.Literal else _
                for _ in t))
        cb.del_trip(*t)

    curateOut(
        'CHEBI:6887', 'rdfs:subClassOf', 'CHEBI:23367'
    )  # defer to the chebi choice of chemical substance over molecular entity since it is classified as a racemate which doesn't quite match the mol ent def
    curateOut(
        'CHEBI:26519', 'rdfs:subClassOf', 'CHEBI:24870'
    )  # some ions may also be free radicals, but all free radicals are not ions!
    #natural product removal since natural product should probably be a role if anything...
    curateOut('CHEBI:18059', 'rdfs:subClassOf', 'CHEBI:33243')
    curateOut('CHEBI:24921', 'rdfs:subClassOf', 'CHEBI:33243')
    curateOut('CHEBI:37332', 'rdfs:subClassOf', 'CHEBI:33243')

    curateOut('CHEBI:50906', 'rdfs:label',
              rdflib.Literal('Chemical role', datatype=rdflib.XSD.string)
              )  # chebi already has a chemical role...
    curateOut(
        'CHEBI:22586', 'rdfs:subClassOf', 'CHEBI:24432'
    )  # antioxidant is already modelled as a chemical role instead of a biological role, the distinction is that the biological roles affect biological processes/property, not chemical processes/property
    curateOut('CHEBI:22720', 'rdfs:subClassOf',
              'CHEBI:27171')  # not all children are bicyclic
    curateOut(
        'CHEBI:23447', 'rdfs:subClassOf', 'CHEBI:17188'
    )  # this one seems obviously flase... all cyclic nucleotides are not nucleoside 5'-monophosphate...
    curateOut(
        'CHEBI:24922', 'rdfs:subClassOf', 'CHEBI:27171'
    )  # not all children are bicyclic, some may be poly, therefore removing
    curateOut(
        'CHEBI:48706', 'rdfs:subClassOf', 'CHEBI:33232'
    )  # removing since antagonist is more incidental and pharmacological role is more appropriate (as chebi has it)
    curateOut('CHEBI:51064', 'rdfs:subClassOf',
              'CHEBI:35338')  # removing since chebi models this with has part
    curateOut(
        'CHEBI:8247', 'rdfs:subClassOf', 'CHEBI:22720'
    )  # the structure is 'fused to' a benzo, but it is not a benzo, chebi has the correct
    #curateOut('CHEBI:9463', 'rdfs:subClassOf', 'CHEBI:50786')  # not sure what to make of this wikipedia says one thing, but chebi says another, very strange... not an anabolic agent?!??! wat no idea

    # review hold over subClassOf statements
    intc = []
    outtc = []
    for s, o in cb.g.subject_objects(rdflib.RDFS.subClassOf):
        if str(
                o
        ) == 'http://ontology.neuinfo.org/NIF/Backend/BIRNLex_annotation_properties.owl#_birnlex_retired_class':
            # we need to remove any of the cases where deprecation was misused
            cb.g.remove((s, rdflib.RDFS.subClassOf, o))
        elif hasImplicitSuperclass(s, o):
            cb.g.remove((s, rdflib.RDFS.subClassOf, o))
            intc.append((s, rdflib.RDFS.subClassOf, o))
        else:
            outtc.append((s, rdflib.RDFS.subClassOf, o))

    def qname(trips):
        return tuple(
            tuple(cb.g.namespace_manager.qname(_) for _ in t) for t in trips)

    for a, p, b in sorted(qname(outtc)):
        if 'NIFMOL' in b:
            continue  # not considering cases where NIFMOL/NIFCHEM ids are used, that can come later
        s = sgv.findById(a)
        o = sgv.findById(b)
        if s is None or o is None:
            print(a, '=>', s)
            print(b, '=>', o)
        else:
            print(s['labels'], s['curie'])
            print('subClassOf')
            print(o['labels'], o['curie'])
            print((a, p, b))
        print('---------------------')

    cb.write(
    )  # re-add only the missing edges so that we can zap them from NIF-Molecule and NIF-Chemical (recurse is needed...)

    # validation
    diff2 = set(cb.g) - set(cg)
    diff3 = set(cb.g) - diff2  # should just be all the owl:Class entries
    diff4 = set(cb.g) - set(chemg) | set(cb.g) - set(molg)  # not informative
    diff5 = set(cb.g) - diff4  # not informative
    both = set(chemg) & set(
        molg)  # there is no overlap beyond the owl:Class declarations

    def getChebis(set_):
        return set(t for t in set_ if 'CHEBI_' in t[0])

    def nodt(graph):
        return set((s, str(o) if type(o) is rdflib.Literal else o)
                   for s, p, o in graph)

    cmc = getChebis((((
        (nodt(chemg) - nodt(cb.g)) - nodt(cg)) - nodt(cd)) - nodt(intc)) -
                    nodt(curatedOut))
    cmc = sorted(t for s, o in cmc for t in chemg.triples((s, None, o)))
    mmc = getChebis((((
        (nodt(molg) - nodt(cb.g)) - nodt(cg)) - nodt(cd)) - nodt(intc)) -
                    nodt(curatedOut))
    mmc = sorted(t for s, o in mmc for t in molg.triples((s, None, o)))

    # remove chebi classes from nifchem and nifmol
    def remstuff(sources, targets):
        for source in sources:
            for id_ in source.subjects(rdflib.RDF.type, rdflib.OWL.Class):
                for target in targets:
                    target.del_class(id_)

    remstuff((cg, cd), (chemgg, molgg))

    chemgg.write()
    molgg.write()

    embed()
Exemple #3
0
def chebi_make():
    PREFIXES = makePrefixes('definition', 'hasRole', 'BFO', 'CHEBI', 'owl',
                            'skos', 'oboInOwl')
    dPREFIXES = makePrefixes('CHEBI', 'replacedBy', 'owl', 'skos')
    ug = makeGraph('utilgraph', prefixes=PREFIXES)

    IDS_FILE = 'resources/chebi-subset-ids.txt'
    with open(IDS_FILE, 'rt') as f:
        ids_raw = set((_.strip() for _ in f.readlines()))
        ids = set((ug.expand(_.strip()).toPython() for _ in ids_raw))

    #gzed = requests.get('http://localhost:8000/chebi.owl')
    #raw = BytesIO(gzed.content)
    gzed = requests.get(
        'http://ftp.ebi.ac.uk/pub/databases/chebi/ontology/nightly/chebi.owl.gz'
    )
    raw = BytesIO(gzip.decompress(gzed.content))
    t = etree.parse(raw)
    r = t.getroot()
    cs = r.getchildren()
    classes = [
        _ for _ in cs if _.tag == '{http://www.w3.org/2002/07/owl#}Class'
        and _.values()[0] in ids
    ]
    ontology = t.xpath("/*[local-name()='RDF']/*[local-name()='Ontology']")
    ops = t.xpath(
        "/*[local-name()='RDF']/*[local-name()='ObjectProperty']")  # TODO
    wanted = [etree.ElementTree(_) for _ in classes]
    rpl_check = t.xpath(
        "/*[local-name()='RDF']/*[local-name()='Class']/*[local-name()='hasAlternativeId']"
    )
    rpl_dict = {
        _.text: _.getparent()
        for _ in rpl_check if _.text in ids_raw
    }  # we also need to have any new classes that have replaced old ids
    also_classes = list(rpl_dict.values())

    def rec(start_set, done):
        ids_ = set()
        for c in start_set:
            ids_.update([
                _.items()[0][1] for _ in etree.ElementTree(c).xpath(
                    "/*[local-name()='Class']/*[local-name()='subClassOf']")
                if _.items()
            ])
            ids_.update([
                _.items()[0][1] for _ in etree.ElementTree(c).xpath(
                    "/*[local-name()='Class']/*[local-name()='subClassOf']/*[local-name()='Restriction']/*[local-name()='someValuesFrom']"
                ) if _.items()
            ])
        supers = [
            _ for _ in cs if _.tag == '{http://www.w3.org/2002/07/owl#}Class'
            and _.values()[0] in ids_ and _ not in done
        ]
        if supers:
            msup, mids = rec(supers, done + supers)
            supers += msup
            ids_.update(mids)
        return supers, ids_

    a = ontology + ops + classes + also_classes
    more, mids = rec(a, a)
    all_ = set(a + more)
    r.clear()  # wipe all the stuff we don't need
    for c in all_:
        r.append(c)
    data = etree.tostring(r)

    g = rdflib.Graph()
    g.parse(
        data=data
    )  # now _this_ is stupidly slow (like 20 minutes of slow) might make more sense to do the xml directly?

    src_version = list(
        g.query(
            'SELECT DISTINCT ?match WHERE { ?temp rdf:type owl:Ontology . ?temp owl:versionIRI ?match . }'
        ))[0][0]

    new_graph = createOntology(
        'chebislim',
        'NIF ChEBI slim',
        PREFIXES,
        'chebislim',
        'This file is generated by pyontutils/slimgen from the full ChEBI nightly at versionIRI %s based on the list of terms in %s.'
        % (src_version, IDS_FILE),
        remote_base='http://ontology.neuinfo.org/NIF/')

    chebi_dead = createOntology(
        'chebi-dead',
        'NIF ChEBI deprecated',
        dPREFIXES,
        'chebidead',
        'This file is generated by pyontutils/slimgen to make deprecated classes resolvablefrom the full ChEBI nightly at versionIRI %s based on the list of terms in %s.'
        % (src_version, IDS_FILE),
        remote_base='http://ontology.neuinfo.org/NIF/')

    depwor = {
        'CHEBI:33243': 'natural product',  # FIXME remove these?
        'CHEBI:36809': 'tricyclic antidepressant',
    }

    for id_ in sorted(
            set(ids_raw) | set((ug.g.namespace_manager.qname(_)
                                for _ in mids))):
        eid = ug.expand(id_)
        trips = list(g.triples((eid, None, None)))
        if not trips:
            #looks for the id_ as a literal
            alts = list(
                g.triples((
                    None,
                    rdflib.term.URIRef(
                        'http://www.geneontology.org/formats/oboInOwl#hasAlternativeId'
                    ),
                    rdflib.Literal(
                        id_,
                        datatype=rdflib.term.URIRef(
                            'http://www.w3.org/2001/XMLSchema#string')))))
            if alts:
                replaced_by, _, __ = alts[0]
                if replaced_by.toPython(
                ) not in ids:  #  we need to add any replacment classes to the bridge
                    print('REPLACED BY NEW CLASS', id_)
                    for t in g.triples((replaced_by, None, None)):
                        new_graph.add_recursive(t, g)
                chebi_dead.add_class(id_)
                chebi_dead.add_node(id_, 'replacedBy:', replaced_by)
                chebi_dead.add_node(id_, rdflib.OWL.deprecated, True)
            else:
                if id_ not in depwor:
                    raise BaseException('wtf error', id_)
        else:
            for trip in trips:
                new_graph.add_recursive(trip, g)

    # https://github.com/ebi-chebi/ChEBI/issues/3294
    madness = new_graph.expand('oboInOwl:hasRelatedSynonym'), rdflib.Literal(
        '0', datatype=rdflib.namespace.XSD.string)
    for a in new_graph.g.subjects(*madness):
        new_graph.g.remove((a, ) + madness)

    new_graph.write()
    chebi_dead.write()
    embed()
    kwargs = {
        'uberon_id':uid,
        'uberon_label':uberon_labs[uid],
        'aba_id':aid,
        'aba_label':abalabs[aid],
        'aba_syns':'\n'.join(sorted(abasyns[aid] + abaacro[aid])),
        'uberon_syns':'\n'.join(insert_uberon)
    }
    return to_format.format(**kwargs)

#text = '\n\n'.join([make_record(uid, aid[0]) for uid, aid in sorted(u_a_map.items()) if aid])

#with open('aba_uberon_syn_review.txt', 'wt') as f:
    #f.write(text)

print('total uberon terms checked:', len(uberon_labs))
print('total aba terms:           ', len(abalabs))
print('total uberon with aba xref:', len([a for a in u_a_map.values() if a]))

ubridge = createOntology('uberon-parcellation-mappings', 'Uberon Parcellation Mappings',
                         makePrefixes('owl', 'ilx', 'UBERON', 'MBA'))
for u, arefs in u_a_map.items():
    if arefs:
        # TODO check for bad assumptions here
        ubridge.add_trip(u, 'ilx:delineatedBy', arefs[0])
        ubridge.add_trip(arefs[0], 'ilx:delineates', u)

ubridge.write()
embed()