def rdf_file_to_rdflib(rdf_filename):
    """Read an XML-formatted RDF file and parse it into an RDFlib graph object."""
    rg = RDFGraph()
    rg.parse(rdf_filename, format='xml')
    return rg

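# A minimal usage sketch for rdf_file_to_rdflib, assuming RDFGraph is an alias for
# rdflib.Graph and that "example.rdf" is a hypothetical RDF/XML file on disk.
graph = rdf_file_to_rdflib("example.rdf")  # hypothetical input file
print(len(graph))                          # number of triples parsed from the file
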
def read_graph(in_files, file_format=None):
    rdf_graph = RDFGraph()
    for in_file in in_files:
        # guess the serialization format from the file name when no explicit format is given
        rdf_graph.parse(
            in_file,
            format=file_format if file_format else rdflib.util.guess_format(in_file),
        )
    return rdf_graph

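# Sketch of read_graph over several files; the file names are placeholders. With no
# explicit format, rdflib.util.guess_format infers "turtle" for .ttl and "xml" for .rdf
# from the extension; passing file_format forces one parser for every input.
merged = read_graph(["people.ttl", "places.rdf"])                        # format guessed per file
turtles = read_graph(["people.ttl", "more.ttl"], file_format="turtle")   # explicit format
print(len(merged), len(turtles))
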
def get_ontology_namespaces():
    ontology_namespaces = settings.ONTOLOGY_NAMESPACES
    g = RDFGraph()
    # parse every registered ontology file into a single graph
    for ontology in models.Ontology.objects.all():
        g.parse(ontology.path.path)
    # record any namespace not already configured, keyed by URI with the prefix as the value
    for namespace in g.namespaces():
        if str(namespace[1]) not in ontology_namespaces:
            ontology_namespaces[str(namespace[1])] = str(namespace[0])
    return ontology_namespaces

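# Illustration of the namespace handling above without the Django/Arches models:
# Graph.namespaces() yields (prefix, URIRef) pairs, and the function keys the result
# dict by namespace URI with the prefix as the value. The foaf binding is only an example.
from rdflib import Graph, Namespace

g = Graph()
g.bind("foaf", Namespace("http://xmlns.com/foaf/0.1/"))

namespaces_by_uri = {}
for prefix, uri in g.namespaces():
    if str(uri) not in namespaces_by_uri:
        namespaces_by_uri[str(uri)] = str(prefix)
# namespaces_by_uri now maps "http://xmlns.com/foaf/0.1/" -> "foaf", plus rdflib's default bindings
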
def graph_to_rdf(
    graph: MultiDiGraph, path: Union[Path, str], verbose: bool = True
) -> Path:
    """Returns the path to the RDF file where the graph will be saved.

    Parameters
    ----------
    graph : MultiDiGraph
        Graph to save.
    path : Union[Path, str]
        The path to the file where the graph will be saved.
    verbose : bool
        If true, a progress bar will be displayed.

    Examples
    --------
    >>> import cfpq_data
    >>> g = cfpq_data.graph_from_dataset("generations", verbose=False)
    >>> path = cfpq_data.graph_to_rdf(g, "test.xml", verbose=False)

    Returns
    -------
    path : Path
        Path to the RDF file where the graph will be saved.
    """
    tmp = RDFGraph()

    for u, v, edge_labels in tqdm(
        graph.edges(data=True), disable=not verbose, desc="Generation..."
    ):
        subj = BNode(u)
        if isinstance(u, (BNode, URIRef, Literal)):
            subj = u
        obj = BNode(v)
        if isinstance(v, (BNode, URIRef, Literal)):
            obj = v
        for label in edge_labels.values():
            pred = Literal(f"{label}", datatype=XSD.string)
            tmp.add((subj, pred, obj))

    path = Path(path).resolve()
    tmp.serialize(destination=str(path), format="xml")

    return path

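# A small sketch of graph_to_rdf on a hand-built networkx MultiDiGraph. The integer node
# ids are wrapped in BNodes, and the edge label becomes an xsd:string literal predicate;
# a URI-shaped label is used here so the RDF/XML serializer can derive a qualified name
# for it. "small.xml" is just an example output path.
from networkx import MultiDiGraph

g = MultiDiGraph()
g.add_edge(0, 1, label="http://example.org/knows")

out_path = graph_to_rdf(g, "small.xml", verbose=False)
print(out_path)  # absolute Path to the written RDF/XML file
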
def prepare_export(self, namespaces, nodes):
    """
    return a graph with the desired node type for writing out to XML,
    with cleaned-up namespaces
    """
    output_graph = RDFGraph()
    for prefix, namespace in namespaces.items():
        output_graph.bind(prefix, namespace)
    for node in nodes:
        output_graph.parse(data=etree.tostring(node), nsmap=namespaces)
    return output_graph

def graph_from_rdf(source: Union[Path, str], verbose: bool = True) -> MultiDiGraph:
    """Returns a graph loaded from an RDF file.

    Parameters
    ----------
    source : Union[Path, str]
        The path to the RDF file with which the graph will be created.
    verbose : bool
        If true, a progress bar will be displayed.

    Examples
    --------
    >>> import cfpq_data
    >>> generations = cfpq_data.graph_from_dataset("generations", verbose=False)
    >>> path = cfpq_data.graph_to_rdf(generations, "test.xml", verbose=False)
    >>> g = cfpq_data.graph_from_rdf(path, verbose=False)
    >>> g.number_of_nodes()
    129
    >>> g.number_of_edges()
    273

    Returns
    -------
    g : MultiDiGraph
        Loaded graph.
    """
    tmp = RDFGraph()
    tmp.parse(str(source), format="xml")

    g = MultiDiGraph()
    for subj, pred, obj in tqdm(tmp, disable=not verbose, desc="Loading..."):
        g.add_edge(subj, obj, label=pred)

    return g

def sparql_query(query, base_rdf_graph=None, sparql_endpoint="http://localhost:3030/ds/query",
                 return_new_node_uris=False, s_default=None, p_default=None, o_default=None):
    def to_rdflib_term(term_dict):
        # convert a SPARQL JSON result term into the corresponding rdflib term
        term_type = term_dict[u'type']
        if term_type == u'uri':
            return rdflib.URIRef(term_dict[u'value'])
        elif term_type == u'literal':
            if u'xml:lang' in term_dict:
                return rdflib.Literal(term_dict[u'value'], lang=term_dict[u'xml:lang'])
            else:
                return rdflib.Literal(term_dict[u'value'])
        elif term_type == u'typed-literal':
            return rdflib.Literal(term_dict[u'value'], datatype=term_dict[u'datatype'])
        elif term_type == u'bnode':
            return rdflib.BNode(term_dict[u'value'])
        else:
            print("RDF term of unknown type:", term_dict)
            exit(1)

    sparql = SPARQLWrapper(sparql_endpoint)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    rdf_data = sparql.query().convert()

    if base_rdf_graph:
        rdf_graph = base_rdf_graph
    else:
        rdf_graph = RDFGraph()

    if return_new_node_uris:
        new_node_uris = set()

    for triple in rdf_data[u"results"][u"bindings"]:
        if u's' in triple:
            s = to_rdflib_term(triple[u's'])
        else:
            s = rdflib.URIRef(s_default)
        if u'p' in triple:
            p = to_rdflib_term(triple[u'p'])
        else:
            p = rdflib.URIRef(p_default)
        if u'o' in triple:
            o = to_rdflib_term(triple[u'o'])
        else:
            o = rdflib.URIRef(o_default)
        rdf_graph.add((s, p, o))

        if return_new_node_uris:
            if u's' in triple:
                new_node_uris.add(triple[u's'][u'value'])
            if u'o' in triple:
                if triple[u'o'][u'type'] in [u'uri', u'bnode']:
                    new_node_uris.add(triple[u'o'][u'value'])

    if return_new_node_uris:
        return 0, rdf_graph, new_node_uris
    else:
        return 0, rdf_graph

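# Hedged usage sketch for sparql_query: the query binds ?s/?p/?o as the function expects,
# and the default endpoint (http://localhost:3030/ds/query) is assumed to be a running
# Fuseki-style server, so this only works against a live triple store.
query = """
SELECT ?s ?p ?o
WHERE { ?s ?p ?o }
LIMIT 100
"""
status, graph = sparql_query(query)
print(status, len(graph))  # 0 and the number of triples copied into the rdflib graph

# With return_new_node_uris=True the function also returns the set of URIs and bnode ids
# it saw in subject or object position.
status, graph, new_uris = sparql_query(query, return_new_node_uris=True)
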
def to_kr2rml(self, ont: Ontology, tbl: DataTable, fpath: Union[str, Path]):
    g = RDFGraph()
    km_dev = Namespace("http://isi.edu/integration/karma/dev#")
    g.namespace_manager.bind("km-dev", km_dev)
    kr2rml = BNode()

    g.add((kr2rml, RDF.type, km_dev.R2RMLMapping))
    g.add((kr2rml, km_dev.sourceName, Literal(tbl.id)))
    # timestamp and version, doesn't need to be precise
    g.add((kr2rml, km_dev.modelPublicationTime, Literal(1414133381264)))
    g.add((kr2rml, km_dev.modelVersion, Literal("1.7")))

    input_columns = []
    output_columns = []
    # mapping from Schema attribute path OR Command to KarmaColumns
    attr2hnodes: Dict[Union[str, PyTransformNewColumnCmd], List[Dict[str, str]]] = {}

    for attr_path in tbl.schema.get_attr_paths():
        input_columns.append([{"columnName": x} for x in attr_path.split(Schema.PATH_DELIMITER)])
        if tbl.schema.get_attr_type(attr_path) == Schema.LIST_VALUE:
            # default karma behaviour, you cannot set semantic type for higher level, but only "values"
            input_columns[-1].append({"columnName": "values"})
        output_columns.append(input_columns[-1])
        attr2hnodes[attr_path] = input_columns[-1]

    for cmd in self.commands:
        if isinstance(cmd, PyTransformNewColumnCmd):
            new_attr_path = cmd.input_attr_paths[0].split(Schema.PATH_DELIMITER)[:-1]
            new_attr_path.append(cmd.new_attr_name)
            new_attr_path = Schema.PATH_DELIMITER.join(new_attr_path)

            # when you create a new column from a list, karma convert to a list of objects
            # e.g: birth_death_date.values, create col death date from that,
            # Karma create => birth_death_date.death_date
            # that's why we have this code below
            new_hnode = attr2hnodes[cmd.input_attr_paths[0]][:-1]
            new_hnode.append({"columnName": cmd.new_attr_name})

            output_columns.append(new_hnode)
            attr2hnodes[cmd] = output_columns[-1]
            attr2hnodes[new_attr_path] = output_columns[-1]

    worksheet_history = []

    # re-arrange commands to fit the issue of node id = Concept2 (Karma will convert Concept2 to Concept1)
    commands = [cmd for cmd in self.commands if isinstance(cmd, PyTransformNewColumnCmd)]
    for cmd in sorted(
            [c for c in self.commands if isinstance(c, SetSemanticTypeCmd)],
            key=lambda c: c.node_id):
        commands.append(cmd)
    for cmd in sorted(
            [c for c in self.commands if isinstance(c, SetInternalLinkCmd)],
            key=lambda c: c.target_uri or c.source_uri or ""):
        commands.append(cmd)

    # sometime the model use incorrect node id like: node id = Concept7 (no Concept1..6),
    # will result as an error in Karma; need to re-arrange the node_id
    node_id_old2new: Dict[str, str] = {}
    node_id_domain_count: Dict[str, int] = {}
    for cmd in commands:
        if isinstance(cmd, PyTransformNewColumnCmd):
            pass
        elif isinstance(cmd, SetSemanticTypeCmd):
            if cmd.node_id not in node_id_old2new:
                node_id_domain_count[cmd.domain] = node_id_domain_count.get(cmd.domain, 0) + 1
                node_id_old2new[cmd.node_id] = f"{cmd.domain}{node_id_domain_count[cmd.domain]}"
        elif isinstance(cmd, SetInternalLinkCmd):
            if cmd.source_id not in node_id_old2new:
                assert cmd.source_uri is not None
                node_id_domain_count[cmd.source_uri] = node_id_domain_count.get(cmd.source_uri, 0) + 1
                node_id_old2new[cmd.source_id] = f"{cmd.source_uri}{node_id_domain_count[cmd.source_uri]}"
            if cmd.target_id not in node_id_old2new:
                assert cmd.target_uri is not None
                node_id_domain_count[cmd.target_uri] = node_id_domain_count.get(cmd.target_uri, 0) + 1
                node_id_old2new[cmd.target_id] = f"{cmd.target_uri}{node_id_domain_count[cmd.target_uri]}"

    for cmd in commands:
        if isinstance(cmd, PyTransformNewColumnCmd):
            pytransform_code = cmd.code
            # recover pytransform_code from our code
            pytransform_code = pytransform_code.replace("__return__ = ", "return ")
            for match in reversed(list(re.finditer(r"getValue\(([^)]+)\)", pytransform_code))):
                start, end = match.span(1)
                field = pytransform_code[start:end].replace("'", "").replace('"""', "").replace('"', '')
                # convert full name to last column name since Karma use last column name instead
                for input_attr_path in cmd.input_attr_paths:
                    if input_attr_path == field:
                        # TODO: will Karma always use last column name?
                        field = attr2hnodes[input_attr_path][-1]['columnName']
                        break
                else:
                    assert False, f"Cannot find any field {field} in the input columns"
                pytransform_code = pytransform_code[:start] + f'"{field}"' + pytransform_code[end:]

            worksheet_history.append({
                "tags": ["Transformation"],
                "commandName": "SubmitPythonTransformationCommand",
                "inputParameters": [
                    {"name": "hNodeId", "value": attr2hnodes[cmd.input_attr_paths[0]], "type": "hNodeId"},
                    {"name": "worksheetId", "value": "W", "type": "worksheetId"},
                    {"name": "selectionName", "value": "DEFAULT_TEST", "type": "other"},
                    {"name": "newColumnName", "value": cmd.new_attr_name, "type": "other"},
                    {"name": "transformationCode", "value": pytransform_code, "type": "other"},
                    {"name": "errorDefaultValue", "value": cmd.default_error_value, "type": "other"},
                    {
                        "name": "inputColumns",
                        "type": "hNodeIdList",
                        "value": ujson.dumps([{"value": attr2hnodes[iap]} for iap in cmd.input_attr_paths])
                    },
                    {
                        "name": "outputColumns",
                        "type": "hNodeIdList",
                        "value": ujson.dumps([{
                            "value": attr2hnodes[cmd]
                            if attr2hnodes[cmd][-1]['columnName'] != "values"
                            else attr2hnodes[cmd][:-1]
                        }])
                    },
                ]
            })
        elif isinstance(cmd, SetSemanticTypeCmd):
            if cmd.type != "karma:classLink":
                worksheet_history.append({
                    "commandName": "SetSemanticTypeCommand",
                    "tags": ["Modeling"],
                    "inputParameters": [
                        {"name": "hNodeId", "value": attr2hnodes[cmd.input_attr_path], "type": "hNodeId"},
                        {"name": "worksheetId", "value": "W", "type": "worksheetId"},
                        {"name": "selectionName", "value": "DEFAULT_TEST", "type": "other"},
                        {
                            "name": "SemanticTypesArray",
                            "type": "other",
                            "value": [{
                                "FullType": ont.full_uri(cmd.type),
                                "isPrimary": True,
                                "DomainLabel": ont.simplify_uri(node_id_old2new[cmd.node_id]),
                                "DomainId": ont.full_uri(node_id_old2new[cmd.node_id]),
                                "DomainUri": ont.full_uri(cmd.domain)
                            }]
                        },
                        {"name": "trainAndShowUpdates", "value": False, "type": "other"},
                        {"name": "rdfLiteralType", "value": "", "type": "other"},  # TODO: update correct RDF-Literal-Type
                        {
                            "name": "inputColumns",
                            "type": "hNodeIdList",
                            "value": ujson.dumps([{"value": attr2hnodes[cmd.input_attr_path]}])
                        },
                        {
                            "name": "outputColumns",
                            "type": "hNodeIdList",
                            "value": ujson.dumps([{"value": attr2hnodes[cmd.input_attr_path]}])
                        },
                    ]
                })
            else:
                worksheet_history.append({
                    "commandName": "SetMetaPropertyCommand",
                    "tags": ["Modeling"],
                    "inputParameters": [
                        {"name": "hNodeId", "value": attr2hnodes[cmd.input_attr_path], "type": "hNodeId"},
                        {"name": "worksheetId", "value": "W", "type": "worksheetId"},
                        {"name": "selectionName", "value": "DEFAULT_TEST", "type": "other"},
                        {"name": "metaPropertyName", "value": "isUriOfClass", "type": "other"},
                        {"name": "metaPropertyUri", "value": ont.full_uri(cmd.domain), "type": "other"},
                        {"name": "metaPropertyId", "value": ont.full_uri(node_id_old2new[cmd.node_id]), "type": "other"},
                        {
                            "name": "SemanticTypesArray",
                            "type": "other",
                            "value": [{
                                "FullType": ont.full_uri(cmd.type),
                                "isPrimary": True,
                                "DomainLabel": ont.simplify_uri(node_id_old2new[cmd.node_id]),
                                "DomainId": ont.full_uri(node_id_old2new[cmd.node_id]),
                                "DomainUri": ont.full_uri(cmd.domain)
                            }]
                        },
                        {"name": "trainAndShowUpdates", "value": False, "type": "other"},
                        {"name": "rdfLiteralType", "value": "", "type": "other"},  # TODO: update correct RDF-Literal-Type
                        {
                            "name": "inputColumns",
                            "type": "hNodeIdList",
                            "value": ujson.dumps([{"value": attr2hnodes[cmd.input_attr_path]}])
                        },
                        {
                            "name": "outputColumns",
                            "type": "hNodeIdList",
                            "value": ujson.dumps([{"value": attr2hnodes[cmd.input_attr_path]}])
                        },
                    ]
                })
        elif isinstance(cmd, SetInternalLinkCmd):
            # TODO: comment out because old KARMA doesn't recognize this!
            # if cmd.target_uri is not None or cmd.source_uri is not None:
            #     worksheet_history.append({
            #         "commandName": "AddLinkCommand",
            #         "tags": ["Modeling"],
            #         "inputParameters": [
            #             {"name": "worksheetId", "value": "W", "type": "worksheetId"},
            #             {
            #                 "name": "edge",
            #                 "type": "other",
            #                 "value": {
            #                     "edgeId": ont.full_uri(cmd.link_lbl),
            #                     "edgeTargetId": ont.full_uri(node_id_old2new[cmd.target_id]),
            #                     "edgeTargetUri": ont.full_uri(cmd.target_uri or cmd.target_id[:-1]),
            #                     "edgeSourceId": ont.full_uri(node_id_old2new[cmd.source_id]),
            #                     "edgeSourceUri": ont.full_uri(cmd.source_uri or cmd.source_id[:-1])
            #                 }
            #             },
            #             {"name": "inputColumns", "type": "hNodeIdList", "value": []},
            #             {"name": "outputColumns", "type": "hNodeIdList", "value": []}
            #         ]
            #     })
            # else:
            worksheet_history.append({
                "commandName": "ChangeInternalNodeLinksCommand",
                "tags": ["Modeling"],
                "inputParameters": [
                    {"name": "worksheetId", "value": "W", "type": "worksheetId"},
                    {
                        "name": "initialEdges",
                        "type": "other",
                        "value": [{
                            "edgeId": ont.full_uri(cmd.link_lbl),
                            "edgeTargetId": ont.full_uri(node_id_old2new[cmd.target_id]),
                            "edgeSourceId": ont.full_uri(node_id_old2new[cmd.source_id])
                        }]
                    },
                    {
                        "name": "newEdges",
                        "type": "other",
                        "value": [{
                            "edgeId": ont.full_uri(cmd.link_lbl),
                            "edgeTargetId": ont.full_uri(node_id_old2new[cmd.target_id]),
                            "edgeSourceId": ont.full_uri(node_id_old2new[cmd.source_id]),
                            "edgeTargetUri": ont.full_uri(cmd.target_uri or node_id_old2new[cmd.target_id][:-1]),
                            "edgeSourceUri": ont.full_uri(cmd.source_uri or node_id_old2new[cmd.source_id][:-1])
                        }]
                    },
                    {"name": "inputColumns", "type": "hNodeIdList", "value": []},
                    {"name": "outputColumns", "type": "hNodeIdList", "value": []},
                ]
            })

    g.add((kr2rml, km_dev.hasInputColumns, Literal(ujson.dumps(input_columns))))
    g.add((kr2rml, km_dev.hasOutputColumns, Literal(ujson.dumps(output_columns))))
    g.add((kr2rml, km_dev.hasModelLabel, Literal(tbl.id)))
    g.add((kr2rml, km_dev.hasBaseURI, Literal("http://localhost:8080/source/")))
    g.add((kr2rml, km_dev.hasWorksheetHistory, Literal(ujson.dumps(worksheet_history, indent=4))))

    g.serialize(str(fpath), format='n3')

def write_skos(self, directory):
    # parse the original v3 graph
    v3graph = RDFGraph()
    v3graph.parse(data=self.v3_skos)

    # create the namespace manager
    namespaces = (
        ("arches", ARCHES),
        ("skos", SKOS),
        ("dcterms", DCTERMS),
    )
    nsmanager = NamespaceManager(RDFGraph())
    for ns in namespaces:
        nsmanager.bind(ns[0], ns[1])

    # create the output graphs with the new namespace manager
    v4thesaurus = RDFGraph(namespace_manager=nsmanager)
    v4collections = RDFGraph(namespace_manager=nsmanager)

    # add the concept schemes to the thesaurus
    concept_schemes = [i for i in v3graph.triples((None, RDF.type, SKOS['ConceptScheme']))]
    for cs in concept_schemes:
        v4thesaurus.add(cs)

    # iterate the concepts and make collections for them.
    topconcepts = [i for i in v3graph.triples((None, SKOS['hasTopConcept'], None))]
    for tc in topconcepts:
        # get the top concept name as a Literal object
        tc_name_literal = v3graph.value(subject=tc[2], predicate=SKOS['prefLabel'])

        # get the value from the JSON-formatted Literal content.
        # if the Literal content is NOT JSON, then this reference data was
        # exported from v3 with the wrong command and will not work.
        try:
            tc_name = json.loads(tc_name_literal.value)['value']
            collection_id = self.new_or_existing_uuid(tc_name)
        except ValueError:
            docs = "https://arches.readthedocs.io/en/stable/v3-to-v4-migration/"
            print("ERROR: Incompatible SKOS. See {} for more information.".format(docs))
            exit()

        if self.verbose:
            children = [i for i in v3graph.triples((tc[2], SKOS['narrower'], None))]
            print("{}: {} immediate child concepts".format(tc_name, len(children)))
            print(" collection uuid: " + collection_id)

        # create a new collection for each top concept
        v4thesaurus.add(tc)
        v4collections.add((ARCHES[collection_id], RDF.type, SKOS['Collection']))

        # add the prefLabel for the collection, if it's not the r2r types collection
        # which already has a label in Arches by default.
        if tc_name != "Resource To Resource Relationship Types":
            simple_tc_name = Literal(tc_name, lang="en-US")
            v4collections.add((ARCHES[collection_id], SKOS['prefLabel'], simple_tc_name))

        # recursively add all of the concept children to the collection for this
        # top concept.
        v4collections = self.add_children_to_collection(v3graph, v4collections,
                                                        collection_id, tc[2])

    # add ALL concepts from the v3 graph to the thesaurus. this pulls along all
    # child/parent relationships into the thesaurus, as well as all extra info
    # for each concept, like sortorder, prefLabel, etc.
    for concept in v3graph.triples((None, RDF.type, SKOS['Concept'])):
        v4thesaurus.add(concept)

        # this is the extra info related to each concept, like prefLabel, sortorder, etc.
        for s, p, o in v3graph.triples((concept[0], None, None)):
            # skip the label of the resource to resource relationship type concept
            # as it's already in Arches and this would duplicate it.
            if s.endswith("000004") and p == SKOS['prefLabel']:
                continue
            v4thesaurus.add((s, p, o))

    # export the thesaurus and collections to predetermined locations within the
    # package file structure.
    thesaurus_file = os.path.join(directory, 'concepts', 'thesaurus.xml')
    if self.verbose:
        print("writing thesaurus to: " + thesaurus_file)
    v4thesaurus.serialize(destination=thesaurus_file, format="pretty-xml")

    collections_file = os.path.join(directory, 'collections', 'collections.xml')
    if self.verbose:
        print("writing collections to: " + collections_file)
    v4collections.serialize(destination=collections_file, format="pretty-xml")

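# Minimal illustration of the namespace-manager pattern used in write_skos: prefixes are
# bound once on a NamespaceManager and shared between output graphs so both serialize
# with the same prefixes. The ARCHES namespace URI here is an assumed placeholder.
from rdflib import Graph, Namespace
from rdflib.namespace import DCTERMS, SKOS, NamespaceManager

ARCHES = Namespace("http://www.archesproject.org/")  # assumed value, for illustration only
nsmanager = NamespaceManager(Graph())
for prefix, ns in (("arches", ARCHES), ("skos", SKOS), ("dcterms", DCTERMS)):
    nsmanager.bind(prefix, ns)

thesaurus = Graph(namespace_manager=nsmanager)
collections = Graph(namespace_manager=nsmanager)
assert dict(thesaurus.namespaces()) == dict(collections.namespaces())  # shared bindings
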
def parse(self, uri, fmt="ttl"):
    # parse the source into a temporary rdflib graph, then append every triple to self
    g = RDFGraph().parse(uri, format=fmt)
    for s, p, o in g.triples((None, None, None)):
        self.append((s, p, o))

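# The parse method above appends to self, so it presumably lives on a list-like class;
# this sketch hosts it in a hypothetical TripleList to show the intended use. The Turtle
# URL is a placeholder and would need to resolve for the call to succeed.
from rdflib import Graph as RDFGraph

class TripleList(list):
    def parse(self, uri, fmt="ttl"):
        g = RDFGraph().parse(uri, format=fmt)
        for s, p, o in g.triples((None, None, None)):
            self.append((s, p, o))

triples = TripleList()
triples.parse("http://example.org/data.ttl")  # hypothetical Turtle document
print(len(triples))                           # number of (s, p, o) tuples collected
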