Example #1
 def fixit(current_object, prefix_dictionary):
     """
     Read the def data structure and replace all string URIs with URIRef entities
     :param current_object: the piece of the data structure to be fixed
     :param prefix_dictionary: maps prefix tags such as "foaf:" to full namespace URIs
     :return current_object: the piece repaired in place
     """
     from rdflib import URIRef
     if isinstance(current_object, dict):
         for k in current_object.keys():
             current_object[k] = fixit(current_object[k], prefix_dictionary)
     elif isinstance(current_object, list):
         for i in range(len(current_object)):
             current_object[i] = fixit(current_object[i], prefix_dictionary)
     elif isinstance(current_object, str):  # basestring on Python 2
         if current_object.startswith("http://"):
             current_object = URIRef(current_object)
         elif current_object.startswith("xsd:"):
             current_object = cast_to_rdflib(current_object)
         elif ':' in current_object:
             k = current_object.find(':')
             tag = str(current_object[0:k + 1])
             if tag in prefix_dictionary:
                 current_object = URIRef(
                     str(current_object).replace(tag,
                                                 prefix_dictionary[tag]))
     return current_object
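A minimal usage sketch; the prefix table and data below are made up, and the cast_to_rdflib branch from the original module is not exercised here:

prefix_dictionary = {"foaf:": "http://xmlns.com/foaf/0.1/"}
data = {"type": "foaf:Person", "page": "http://example.org/alice"}

fixed = fixit(data, prefix_dictionary)
# Both values are now URIRef instances:
# {'type': URIRef('http://xmlns.com/foaf/0.1/Person'),
#  'page': URIRef('http://example.org/alice')}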
Example #2
 def relativize(self, uri):
     # Check self.base before wrapping it: constructing URIRef(None)
     # yields a bogus reference instead of failing cleanly.
     if self.base is None:
         return uri
     base = URIRef(self.base)
     basedir = URIRef(self.base if base.endswith('/') else base.rsplit('/', 1)[0])
     if uri == base:
         uri = URIRef('')
     elif uri == basedir:
         uri = URIRef('.')
     elif uri.startswith(basedir + '/'):
         uri = URIRef(uri.replace(basedir + '/', "", 1))
     return uri
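A usage sketch, assuming a host object whose base attribute carries the base URI; the stand-in class and URIs are illustrative only:

from rdflib import URIRef

class Doc:
    base = "http://example.org/data/doc"

Doc.relativize = relativize  # attach the method above for the demo

doc = Doc()
print(doc.relativize(URIRef("http://example.org/data/item")))  # item
print(doc.relativize(URIRef("http://example.org/data")))       # .
print(doc.relativize(URIRef("http://example.org/data/doc")))   # (empty string)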
Example #3
def make_curie(uri: URIRef) -> str:
    HTTP = 'http'
    HTTPS = 'https'

    curie = contract(uri)

    if curie is not None:
        return curie

    if uri.startswith(HTTPS):
        uri = HTTP + uri[len(HTTPS):]
    elif uri.startswith(HTTP):
        uri = HTTPS + uri[len(HTTP):]

    curie = contract(uri)

    if curie is None:
        return uri
    else:
        return curie
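The contract helper is not part of this snippet; a minimal stand-in, assuming a hard-coded prefix table (rdflib's NamespaceManager can provide the same service):

from typing import Optional
from rdflib import URIRef

PREFIXES = {"http://xmlns.com/foaf/0.1/": "foaf"}  # illustrative prefix table

def contract(uri: URIRef) -> Optional[str]:
    # Return a CURIE such as "foaf:name", or None when no prefix matches.
    for ns, prefix in PREFIXES.items():
        if uri.startswith(ns):
            return f"{prefix}:{uri[len(ns):]}"
    return None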
 def convert(self, name, qname, attrs):
     # SAX delivers names as (namespace, localname) pairs; join them into URIRefs.
     if name[0] is None:
         name = URIRef(name[1])
     else:
         name = URIRef("".join(name))
     atts = {}
     for (n, v) in attrs.items():
         if n[0] is None:
             att = URIRef(n[1])
         else:
             att = URIRef("".join(n))
         if att.startswith(XMLNS) or att[0:3].lower() == "xml":
             pass  # skip namespace declarations and xml:* attributes
         elif att in UNQUALIFIED:
             atts[RDFNS[att]] = v
         else:
             atts[URIRef(att)] = v
     return name, atts
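The (namespace, localname) pairs come from a namespace-aware SAX parser; a small illustration of the joining convention used above, with made-up values:

from rdflib import URIRef

RDF_NS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"

name = (RDF_NS, "Description")   # namespace-qualified element name
print(URIRef("".join(name)))     # http://www.w3.org/1999/02/22-rdf-syntax-ns#Description

name = (None, "plain")           # element without a namespace
print(URIRef(name[1]))           # plain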
Example #7
 def fixit(current_object):
     """
     Read the def data structure and replace all string URIs with URIRef entities
     :param current_object: the piece of the data structure to be fixed
     :return current_object: the piece repaired in place
     """
     from rdflib import URIRef
     if isinstance(current_object, dict):
         for k in current_object.keys():
             current_object[k] = fixit(current_object[k])
     elif isinstance(current_object, list):
         for i in range(len(current_object)):
             current_object[i] = fixit(current_object[i])
     elif isinstance(current_object, str):  # basestring on Python 2
         if current_object.startswith("http://"):
             current_object = URIRef(current_object)
         elif current_object.startswith("xsd:"):
             current_object = cast_to_rdflib(current_object)
     return current_object
def turn_into_mp(row, dataset):
    # Claim
    claim_subj = FOODHKG_INST[get_hash(row['Claim'])]
    pred = RDF['type']
    obj = URIRef('http://purl.org/mp/Claim')
    dataset.add((claim_subj, pred, obj))
    # define the claim label
    dataset.add((claim_subj, RDFS['label'], Literal(row['Claim'])))
    opinion_subj = FOODHKG_INST[get_hash(row['EFSA Opinion Reference'])]
    dataset.add((opinion_subj, FOODHKG_PROPS['advises'], claim_subj))
    dataset.add((opinion_subj, RDF['type'], FOODHKG_CLS['Opinion']))

    # micropublication (an MP consists of a claim, statements, and representations of fine-grained NPs)
    # each EFSA opinion is an MP
    mp_subj = FOODHKG_INST[get_hash(row['EFSA Opinion Reference'] +
                                    row['Claim'])]
    dataset.add((mp_subj, RDF['type'], MP['Micropublication']))
    # each MP argues a claim
    dataset.add((mp_subj, MP['argues'], claim_subj))

    # define fine-grained facts (triples) using Nanopublication (NP)
    hr_subj = FOODHKG_INST[get_hash(row['Health relationship'] +
                                    row['Phenotype'] + row['Food'])]
    dataset.add((mp_subj, MP['represents'], hr_subj))
    # np_subj = FOODHKG_INST[get_hash(row['Health relationship']+row['EFSA Opinion Reference'])]
    # dataset.add((np_subj, RDF['type'],  NP['Nanopublication']))
    # Assertions for NP
    # dataset.add((np_subj, NP['hasAssertion'],  hr_subj))
    dataset.add((hr_subj, RDFS['label'], Literal(row['Health relationship'])))
    # subtype of food health effect/categorization
    hr_type = FOODHKG_INST[get_hash(row['Health relationship'])]
    dataset.add((hr_subj, RDF['type'], hr_type))
    dataset.add((hr_type, RDF['type'], FOODHKG_CLS['FoodHealthEffect']))
    dataset.add((hr_type, RDFS['label'], Literal(row['Health relationship'])))

    if str(row['Phenotype Ontology Term']) == 'nan':
        pheno_uri = FOODHKG_INST[get_hash(row['Phenotype'])]
        dataset = createPhenotypeRelTriples(dataset, hr_subj, pheno_uri,
                                            row['Phenotype'])
    else:
        for pheno_uri in row['Phenotype Ontology Term'].split(';'):
            pheno_uri = pheno_uri.strip()
            if pheno_uri == '':
                continue
            # pheno_uri = normalize(pheno_uri)
            pheno_uri = URIRef(pheno_uri)
            dataset = createPhenotypeRelTriples(dataset, hr_subj, pheno_uri,
                                                row['Phenotype'])

    if str(row['Food Ontology Term']) == 'nan':
        fooduri = FOODHKG_INST[get_hash(row['Food'])]
        dataset = createFoodRelTriples(dataset, hr_subj, fooduri,
                                       row['Food Type'], row['Food'])
    else:
        for fooduri in row['Food Ontology Term'].split(';'):
            fooduri = fooduri.strip()
            if fooduri == '':
                continue
            fooduri = URIRef(fooduri)
            dataset = createFoodRelTriples(dataset, hr_subj, fooduri,
                                           row['Food Type'], row['Food'])

    if row['Target population'] != '':
        if row['Target population ontology term'] != '':
            if row['Target population ontology term'] == 'http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C18241':
                targetPopUri = URIRef(
                    'http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C18241'
                )
            else:
                targetPopUri = FOODHKG_INST[get_hash(
                    row['Target population ontology term'])]
                tp_text = row['Target population ontology term'].split('\n')
                for tp in tp_text:
                    tp_tuple = tp.split(': ')
                    if len(tp_tuple) != 2:
                        continue
                    pred = tp_tuple[0]
                    obj = tp_tuple[1].strip()
                    if not obj.startswith('http'):
                        dataset.add((targetPopUri, PICO[pred], Literal(obj)))
                    else:
                        dataset.add(
                            (targetPopUri, FOODHKG_PROPS[pred], URIRef(obj)))
            # Both adds stay inside this branch: targetPopUri is only
            # defined when an ontology term is present.
            dataset.add(
                (targetPopUri, RDFS['label'], Literal(row['Target population'])))
            dataset.add(
                (hr_subj, FOODHKG_PROPS['hasTargetPopulation'], targetPopUri))

    for i in range(1, 9):
        if str(row[f'Supporting Evidence Text {i}']) == 'nan':
            continue
        supp_subj = FOODHKG_INST[get_hash(
            row[f'Supporting Evidence Text {i}'])]

        pred = MP['supports']
        # statement supports the claim
        dataset.add((supp_subj, pred, claim_subj))
        # is type of Statement
        dataset.add((supp_subj, RDF['type'], MP['Statement']))
        # label of Statement
        dataset.add((supp_subj, RDFS['label'],
                     Literal(row[f'Supporting Evidence Text {i}'])))

        suppRef = row[f'Supporting Evidence Reference {i}']
        # if references for statement exist
        if str(suppRef) != 'nan' and len(
                suppRef) > 3 and suppRef.lower() != "no reference":
            refs = suppRef.split(';')
            for ref in refs:
                # split only on the first colon so DOIs keep any embedded colons
                ref_tuple = ref.split(':', 1)
                if len(ref_tuple) != 2:
                    continue  # skip references that lack a "text:doi" pair
                suppref_text = ref_tuple[0]
                suppref_doi = ref_tuple[1]
                suppref_subj = FOODHKG_INST[get_hash(suppref_text)]
                dataset = createSupportingRefTriples(dataset, supp_subj,
                                                     suppref_subj, suppref_doi,
                                                     ref)

    return dataset
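turn_into_mp relies on module-level helpers and namespaces that this snippet does not define; a minimal sketch of plausible bindings (the example.org URIs and the get_hash implementation are assumptions, not the project's actual values; only http://purl.org/mp/ is confirmed by the code above):

import hashlib
from rdflib import Namespace

MP = Namespace("http://purl.org/mp/")
FOODHKG_INST = Namespace("http://example.org/foodhkg/instance/")
FOODHKG_CLS = Namespace("http://example.org/foodhkg/class/")
FOODHKG_PROPS = Namespace("http://example.org/foodhkg/property/")
PICO = Namespace("http://example.org/pico/")

def get_hash(text):
    # Hypothetical helper: a stable identifier derived from the cell text.
    return hashlib.md5(text.encode('utf-8')).hexdigest()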
Example #11
def main(args):
    # load graph
    g = rdflib.Graph()
    g.parse(args.input, publicID=URI_TMP, format="xml")

    # Create a triple-free graph from the header file, then add triples to it
    g2 = rdflib.Graph()
    g2.parse(args.header, publicID=URI_TMP, format="xml")

    # bifd.owl
    g3 = rdflib.Graph()
    g3.parse(args.bifd, publicID=URI_TMP, format="xml")

    convert_uris = load_dict(args.subject)
    convert_ps = load_dict(args.predicate)
    convert_ps["https://wba-initiative.org/bifd/label"] = str(RDFS.label)

    # Extract the classes to process; only those listed in s.tsv are kept in the final output
    query_class = g.query(
        """SELECT ?class
        WHERE {
        ?class rdf:type owl:Class.
        }
        """)

    keep_s = set()

    for c in query_class:
        keep_s.add(c[0])

    query_references = g.query(
        """SELECT ?uri ?p ?v 
        WHERE {
        ?uri rdf:type swivt:Subject.
        ?uri ?p ?v.
        filter (?p in (property:BibTex-3Ahas_doi, URI("https://wba-initiative.org/noprefix/URLhas"), rdfs:label))
        filter (strstarts(str(?uri), "http://183.181.89.140/mediawiki/index.php/Special:URIResolver/-2A"))
        } 
        """)

    references = {}
    references_val = {}
    references_s_o = {}

    for x in query_references:
        p = str(x[1])
        if x[0] not in references:
            references[x[0]] = [p]
        else:
            references[x[0]].append(p)
        references_val["{}\t{}".format(str(x[0]), str(x[1]))] = str(x[2])
    for k in references.keys():
        predicates = references[k]
        if "http://183.181.89.140/mediawiki/index.php/Special:URIResolver/Property-3ABibTex-3Ahas_doi" in predicates:
            o = references_val["{}\t{}".format(str(k), "http://183.181.89.140/mediawiki/index.php/Special:URIResolver/Property-3ABibTex-3Ahas_doi")]
            references_s_o[str(k.split("/")[-1])] = o
            continue
        if "https://wba-initiative.org/noprefix/URLhas" in predicates:
            o = references_val["{}\t{}".format(str(k), "https://wba-initiative.org/noprefix/URLhas")]
            references_s_o[str(k.split("/")[-1])] = o
            continue
        if str(RDFS.label) in predicates:
            o = references_val["{}\t{}".format(str(k), str(RDFS.label))]
            references_s_o[str(k.split("/")[-1])] = o
            continue
        print("Error: no info for references provided.")
        exit(1)
    obo_id_dict = {}

    for s, p, o in g:
        if s not in keep_s:
            continue
        if str(s) in convert_uris.keys():
            s = URIRef(convert_uris[str(s)])
        if str(o) in convert_uris.keys():
            o = URIRef(convert_uris[str(o)])
        if str(p) in convert_ps.keys():
            p = URIRef(convert_ps[str(p)])
        if str(p) == "http://183.181.89.140/mediawiki/index.php/Special:URIResolver/Property-3AOBO_ID":
            obo_id_dict[str(s)] = str(o)
        if str(s) in convert_uris.values() and (p == RDFS.subClassOf or p == RDFS.label or str(p).startswith(BIFD_PREFIX) or o == OWL.Class):
            g2.add((s, p, o))

    for s, p, o in g:
        if s not in keep_s:
            continue
        if str(s) in convert_uris.keys():
            s = URIRef(convert_uris[str(s)])
        if str(p) == "http://183.181.89.140/mediawiki/index.php/Special:URIResolver/Property-3AOBO_ID":
            if str(s) in obo_id_dict.keys():
                reg = re.compile(r'^[a-zA-Z_][\w.-]*$')
                if reg.match(obo_id_dict[str(s)]): # check if it results in a valid uri
                    if str(s) in convert_uris.values():
                        g2.add((s, OWL.sameAs, URIRef("http://purl.obolibrary.org/obo/{}".format(obo_id_dict[str(s)]))))

    query_object_property = g3.query(
        """SELECT ?op
        WHERE {
        ?op rdf:type owl:ObjectProperty.
        }""")

    object_properties = set()
    for res in query_object_property:
        p = str(res[0]).strip("/")
        object_properties.add(p)
    for s, p, o in list(g2):  # iterate over a snapshot: the loop mutates g2
        if str(p) == 'https://wba-initiative.org/bifd/reference':
            k = o.replace("http://183.181.89.140/mediawiki/index.php/Special:URIResolver/", '')
            if k in references_s_o.keys():
                g2.add((s, p, Literal(references_s_o[k], datatype=XSD.string)))
            g2.remove((s, p, o))

        if str(p) == 'https://wba-initiative.org/bifd/taxon':
            g2.add((s, p, Literal("http://purl.obolibrary.org/obo/{}".format(obo_id_dict[str(o)]), datatype=XSD.string)))

        if str(p) in convert_ps.values() and p != RDFS.label and str(p) in object_properties:  # convert property constraints into OWL restrictions
            if str(p) == "https://wba-initiative.org/bifd/transmitter" or str(p) == "https://wba-initiative.org/bifd/modType":
                continue
            g2.remove((s, p, o))
            blank_node = BNode()
            g2.add((s, RDFS.subClassOf, blank_node))
            g2.add((blank_node, RDF.type, OWL.Restriction))
            g2.add((blank_node, OWL.onProperty, p))
            g2.add((blank_node, OWL.someValuesFrom, o))

    for s, p, o in list(g2):  # iterate over a snapshot: the loop mutates g2
        if o.startswith("http://183.181.89.140/mediawiki/index.php/Special:URIResolver"):
            g2.remove((s, p, o))
            # Convert object URIs that the s.tsv mapping does not cover,
            # using plain prefix replacement
            o = URIRef(o.replace("http://183.181.89.140/mediawiki/index.php/Special:URIResolver/Category-3ABIF-3A",
                                 "https://wba-initiative.org/bifd/")
                       .replace("http://183.181.89.140/mediawiki/index.php/Special:URIResolver/Category-3A",
                                "http://wba-initiative.org/wbra/")
                       # Glutamate is a special case
                       .replace("http://183.181.89.140/mediawiki/index.php/Special:URIResolver/Glutamate",
                                "https://wba-initiative.org/bifd/Glutamate"))
            g2.add((s, p, o))

    g2.serialize(args.output, publicID=URI_TMP, format="pretty-xml")
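The blank-node block near the end of main is the standard OWL pattern for an existential (someValuesFrom) property restriction. A self-contained sketch, with EX as an illustrative namespace:

from rdflib import Graph, BNode, Namespace
from rdflib.namespace import OWL, RDF, RDFS

EX = Namespace("http://example.org/")  # illustrative only
g = Graph()

# EX.Neuron rdfs:subClassOf [ a owl:Restriction ;
#                             owl:onProperty EX.transmitter ;
#                             owl:someValuesFrom EX.Glutamate ] .
restriction = BNode()
g.add((EX.Neuron, RDFS.subClassOf, restriction))
g.add((restriction, RDF.type, OWL.Restriction))
g.add((restriction, OWL.onProperty, EX.transmitter))
g.add((restriction, OWL.someValuesFrom, EX.Glutamate))

print(g.serialize(format="turtle"))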
Example #12
 def _is_datatype(uri: rdflib.URIRef):
     if isinstance(uri, rdflib.BNode):
         return False
     return uri.startswith(str(XSD)) or uri.startswith(str(RDF))
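A quick check of the helper, assuming the module-level XSD and RDF names come from rdflib.namespace:

import rdflib
from rdflib.namespace import RDF, XSD

print(_is_datatype(XSD.integer))                            # True
print(_is_datatype(RDF.langString))                         # True
print(_is_datatype(rdflib.URIRef("http://example.org/x")))  # False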
Example #13
def fileForUri(dirUriMap: DirUriMap, ctx: URIRef) -> bytes:
    assert isinstance(ctx, URIRef), ctx
    for d, prefix in dirUriMap.items():
        if ctx.startswith(prefix):
            return d + ctx[len(prefix):].encode('utf8') + b'.n3'
    raise ValueError("don't know what filename to use for %s" % ctx)
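A usage sketch, assuming DirUriMap maps bytes directory paths to URIRef prefixes, consistent with the byte concatenation above; the paths are illustrative:

from rdflib import URIRef

dirUriMap = {b'/data/graphs/': URIRef('http://example.org/graph/')}
print(fileForUri(dirUriMap, URIRef('http://example.org/graph/demo')))
# b'/data/graphs/demo.n3'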
Example #14
def retrieve_graph_from_dbpedia(term):
    assert ONLINE_ENABLED
    logger.info('online access - DBpedia: {term}'.format(term=term))
    term_utf = term.encode('utf-8')
    term_url = quote_plus(term_utf, safe="/:#,()'")
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    query = """
        SELECT ?p ?o
        WHERE {{
            <{term_url}> ?p ?o
            FILTER( STRSTARTS(STR(?p), "{foaf}")
                || STRSTARTS(STR(?p), "{rdf}")
                || STRSTARTS(STR(?p), "{rdfs}")
                || STRSTARTS(STR(?p), "{dcterms}")
                || STRSTARTS(STR(?p), "{ontology}"))
            FILTER (isURI(?o) || langMatches(lang(?o), "EN"))
        }}
    """.format(term_url=term_url,
            foaf=unicode(FOAF),
            rdf=unicode(RDF),
            rdfs=unicode(RDFS),
            dcterms=unicode(DCTERMS),
            ontology=unicode(ONTOLOGY))

    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    try:
        results = sparql.query()
        # workaround for "Invalid \escape" error which can be raised by
        # convert()
        body = results.response.read()
        results = json.loads(body)
    except HTTPError as exc:
        # can occur if DBpedia is under maintenance (quite often)
        logger.error('Getting graph for {term} failed; {message}; {excType}'
            .format(term=term, message=exc, excType=type(exc)))
        return None

    # create graph and bind relevant namespaces
    graph = Graph()
    for prefix, namespace in NAMESPACES_DICT.items():
        graph.bind(prefix, namespace)

    LITERAL_MAX_LENGTH = 600
    for result in results["results"]["bindings"]:
        try:
            p = URIRef(result['p']['value'])
            # filter wikiPageRevisionID, wikiPageExternalLike etc.
            if p.startswith(ONTOLOGY['wiki']):
                continue
            if result['o']['type'] == 'uri':
                o = URIRef(result['o']['value'])
            else:
                o = Literal(result['o']['value'])
                # if the object is too long (e.g. an abstract), ignore it
                if len(o) > LITERAL_MAX_LENGTH:
                    continue
            graph.add((term, p, o))
        except KeyError:
            continue

    # check if the graph is not empty
    if not graph:
        logger.warning('Retrieved empty graph for ' + str(term))

    return graph
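The module globals used above (FOAF, RDF, RDFS, DCTERMS, ONTOLOGY, NAMESPACES_DICT) are not part of the snippet; a plausible binding, with DBpedia's public ontology namespace (the 'dbo' prefix is an assumption):

from rdflib import Namespace
from rdflib.namespace import DCTERMS, FOAF, RDF, RDFS

ONTOLOGY = Namespace("http://dbpedia.org/ontology/")
NAMESPACES_DICT = {
    'foaf': FOAF,
    'rdf': RDF,
    'rdfs': RDFS,
    'dcterms': DCTERMS,
    'dbo': ONTOLOGY,
}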
Example #15
    dict_s = {}
    dict_p = {}
    dict_t = {}

    print("loading dictionaries")
    f = open(args.input, "rb")
    for line in f:
        if line:
            m = pattern.match(line)
            try:
                s, p, o = m.group(1), m.group(2), m.group(3)
                s, p, o = URIRef(s), URIRef(p), URIRef(o)

                if not args.nocat or p != dctSubject:
                    if s.startswith(DBR):
                        if s not in dict_s:
                            dict_s[s] = len(dict_s)
                    if o.startswith(DBR):
                        if o not in dict_s:
                            dict_s[o] = len(dict_s)

                    if (p == RDF.type or p == "a"
                        ) and not filter_entity(s) and not filter_entity(o):
                        if s not in dict_s:
                            dict_s[s] = len(dict_s)
                        if o not in dict_t:
                            dict_t[o] = len(dict_t)
                    elif p == RDFS.subClassOf:
                        if s not in dict_t:
                            dict_t[s] = len(dict_t)