Example #1
0
def rdf_file_to_rdflib(rdf_filename):
    """Parse an XML-formatted RDF file and return it as an RDFlib graph."""
    graph = RDFGraph()
    graph.parse(rdf_filename, format='xml')
    return graph
Example #2
0
def read_graph(in_files, file_format=None):
    """Parse every file in *in_files* into one combined RDF graph.

    When *file_format* is not supplied, the format of each file is
    guessed from its name via ``rdflib.util.guess_format``.
    """
    combined = RDFGraph()

    for path in in_files:
        fmt = file_format or rdflib.util.guess_format(path)
        combined.parse(path, format=fmt)

    return combined
Example #3
0
def get_ontology_namespaces():
    """Return a mapping of ontology namespace URI -> prefix.

    Starts from ``settings.ONTOLOGY_NAMESPACES`` and adds any namespace
    bound in the loaded ontology files that is not already present.

    Returns
    -------
    dict
        Namespace URI (str) keyed mapping to prefix (str).
    """
    # Copy so we do not mutate the settings object in place — the original
    # aliased settings.ONTOLOGY_NAMESPACES directly, so every call polluted
    # the global settings with newly discovered namespaces.
    ontology_namespaces = dict(settings.ONTOLOGY_NAMESPACES)
    g = RDFGraph()
    for ontology in models.Ontology.objects.all():
        g.parse(ontology.path.path)
    # g.namespaces() yields (prefix, uri) pairs; we key by URI.
    for prefix, uri in g.namespaces():
        if str(uri) not in ontology_namespaces:
            ontology_namespaces[str(uri)] = str(prefix)
    return ontology_namespaces
Example #4
0
def graph_to_rdf(
    graph: MultiDiGraph, path: Union[Path, str], verbose: bool = True
) -> Path:
    """Serialize *graph* to an XML-formatted RDF file.

    Parameters
    ----------
    graph : MultiDiGraph
        Graph to save.

    path : Union[Path, str]
        The path to the file where the graph will be saved.

    verbose : bool
        If true, a progress bar will be displayed.

    Examples
    --------
    >>> import cfpq_data
    >>> g = cfpq_data.graph_from_dataset("generations", verbose=False)
    >>> path = cfpq_data.graph_to_rdf(g, "test.xml", verbose=False)

    Returns
    -------
    path : Path
        Path to the RDF file where the graph will be saved.
    """
    rdf_graph = RDFGraph()

    edges = tqdm(
        graph.edges(data=True), disable=not verbose, desc="Generation..."
    )
    for u, v, labels in edges:
        # Reuse nodes that are already RDF terms; wrap anything else in a BNode.
        subj = u if isinstance(u, (BNode, URIRef, Literal)) else BNode(u)
        obj = v if isinstance(v, (BNode, URIRef, Literal)) else BNode(v)

        for label in labels.values():
            predicate = Literal(f"{label}", datatype=XSD.string)
            rdf_graph.add((subj, predicate, obj))

    destination = Path(path).resolve()
    rdf_graph.serialize(destination=str(destination), format="xml")

    return destination
    def prepare_export(self, namespaces, nodes):
        """
        return a graph with the desired node type for writing out to XML,
        with cleaned-up namespaces
        """

        output_graph = RDFGraph()

        # Plain loops instead of list comprehensions: both calls are executed
        # purely for their side effects, so building throwaway lists of None
        # was wasteful and unidiomatic.
        for prefix, uri in namespaces.items():
            output_graph.bind(prefix, uri)

        # NOTE(review): ``nsmap`` is not a documented rdflib Graph.parse()
        # keyword — presumably consumed by a custom parser plugin; confirm.
        for node in nodes:
            output_graph.parse(data=etree.tostring(node), nsmap=namespaces)

        return output_graph
Example #6
0
def graph_from_rdf(source: Union[Path, str], verbose: bool = True) -> MultiDiGraph:
    """Load a graph from an XML-formatted RDF file.

    Parameters
    ----------
    source : Union[Path, str]
        The path to the RDF file with which
        the graph will be created.

    verbose : bool
        If true, a progress bar will be displayed.

    Examples
    --------
    >>> import cfpq_data
    >>> generations = cfpq_data.graph_from_dataset("generations", verbose=False)
    >>> path = cfpq_data.graph_to_rdf(generations, "test.xml", verbose=False)
    >>> g = cfpq_data.graph_from_rdf(path, verbose=False)
    >>> g.number_of_nodes()
    129
    >>> g.number_of_edges()
    273

    Returns
    -------
    g : MultiDiGraph
        Loaded graph.
    """
    rdf_graph = RDFGraph()
    rdf_graph.parse(str(source), format="xml")

    result = MultiDiGraph()

    # Each RDF triple becomes one labeled edge (subject -> object).
    triples = tqdm(rdf_graph, disable=not verbose, desc="Loading...")
    for subj, pred, obj in triples:
        result.add_edge(subj, obj, label=pred)

    return result
Example #7
0
def sparql_query(query,
                 base_rdf_graph=None,
                 sparql_endpoint="http://localhost:3030/ds/query",
                 return_new_node_uris=False,
                 s_default=None,
                 p_default=None,
                 o_default=None):
    """Run a SPARQL query and collect its ?s/?p/?o bindings into a graph.

    Parameters
    ----------
    query : str
        SPARQL query returning (some subset of) ?s, ?p, ?o bindings.
    base_rdf_graph : rdflib.Graph, optional
        Graph to add triples into; a fresh graph is created when None.
    sparql_endpoint : str
        URL of the SPARQL endpoint to query.
    return_new_node_uris : bool
        When True, also return the set of URI/bnode values seen in the
        ?s and ?o positions.
    s_default, p_default, o_default : str, optional
        Fallback URIs used when a binding lacks ?s, ?p or ?o respectively.
        NOTE(review): if a binding omits a term and the matching default is
        None, rdflib.URIRef(None) will fail — presumably callers always
        supply the needed defaults; confirm.

    Returns
    -------
    (0, graph) or (0, graph, new_node_uris) depending on
    *return_new_node_uris*.
    """
    def to_rdflib_term(term_dict):
        # Convert one SPARQL-JSON result term into its rdflib equivalent.
        term_type = term_dict[u'type']
        if term_type == u'uri':
            return rdflib.URIRef(term_dict[u'value'])
        elif term_type == u'literal':
            if u'xml:lang' in term_dict:
                return rdflib.Literal(term_dict[u'value'],
                                      lang=term_dict[u'xml:lang'])
            else:
                return rdflib.Literal(term_dict[u'value'])
        elif term_type == u'typed-literal':
            return rdflib.Literal(term_dict[u'value'],
                                  datatype=term_dict[u'datatype'])
        elif term_type == u'bnode':
            return rdflib.BNode(term_dict[u'value'])
        else:
            # Fixed: was a Python 2 print statement, which is a SyntaxError
            # on Python 3 (the surrounding codebase uses f-strings).
            print("RDF term of unknown type:", term_dict)
            exit(1)

    sparql = SPARQLWrapper(sparql_endpoint)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    rdf_data = sparql.query().convert()
    if base_rdf_graph:
        rdf_graph = base_rdf_graph
    else:
        rdf_graph = RDFGraph()

    if return_new_node_uris:
        new_node_uris = set()

    for triple in rdf_data[u"results"][u"bindings"]:
        if u's' in triple:
            s = to_rdflib_term(triple[u's'])
        else:
            s = rdflib.URIRef(s_default)

        if u'p' in triple:
            p = to_rdflib_term(triple[u'p'])
        else:
            p = rdflib.URIRef(p_default)

        if u'o' in triple:
            o = to_rdflib_term(triple[u'o'])
        else:
            o = rdflib.URIRef(o_default)

        rdf_graph.add((s, p, o))

        if return_new_node_uris:
            # Only URIs and bnodes count as nodes; literals are excluded.
            if u's' in triple:
                new_node_uris.add(triple[u's'][u'value'])
            if u'o' in triple:
                if triple[u'o'][u'type'] in [u'uri', u'bnode']:
                    new_node_uris.add(triple[u'o'][u'value'])

    if return_new_node_uris:
        return 0, rdf_graph, new_node_uris
    else:
        return 0, rdf_graph
Example #8
0
    def to_kr2rml(self, ont: Ontology, tbl: DataTable, fpath: Union[str,
                                                                    Path]):
        """Serialize this model's commands as a Karma KR2RML mapping file.

        Builds an RDF graph describing the table's columns and a Karma
        "worksheet history" (PyTransform, SetSemanticType and
        SetInternalLink commands), then writes it to *fpath* in N3 format.

        Parameters: *ont* resolves URIs, *tbl* supplies the schema and id,
        *fpath* is the output path. Returns None; writes the file as a
        side effect.
        """
        g = RDFGraph()
        km_dev = Namespace("http://isi.edu/integration/karma/dev#")
        g.namespace_manager.bind("km-dev", km_dev)
        kr2rml = BNode()

        g.add((kr2rml, RDF.type, km_dev.R2RMLMapping))
        g.add((kr2rml, km_dev.sourceName, Literal(tbl.id)))
        # timestamp and version, doesn't need to be precise
        g.add((kr2rml, km_dev.modelPublicationTime, Literal(1414133381264)))
        g.add((kr2rml, km_dev.modelVersion, Literal("1.7")))

        input_columns = []
        output_columns = []
        # mapping from Schema attribute path OR Command to KarmaColumns
        attr2hnodes: Dict[Union[str, PyTransformNewColumnCmd],
                          List[Dict[str, str]]] = {}

        # Build one Karma column (list of {"columnName": ...} path segments)
        # per attribute path in the table schema.
        for attr_path in tbl.schema.get_attr_paths():
            input_columns.append([{
                "columnName": x
            } for x in attr_path.split(Schema.PATH_DELIMITER)])
            if tbl.schema.get_attr_type(attr_path) == Schema.LIST_VALUE:
                # default karma behaviour, you cannot set semantic type for higher level, but only "values"
                input_columns[-1].append({"columnName": "values"})
            output_columns.append(input_columns[-1])
            attr2hnodes[attr_path] = input_columns[-1]

        # Register the extra columns produced by PyTransform commands, keyed
        # both by the command object and by the derived attribute path.
        for cmd in self.commands:
            if isinstance(cmd, PyTransformNewColumnCmd):
                new_attr_path = cmd.input_attr_paths[0].split(
                    Schema.PATH_DELIMITER)[:-1]
                new_attr_path.append(cmd.new_attr_name)
                new_attr_path = Schema.PATH_DELIMITER.join(new_attr_path)

                # when you create a new column from a list, karma convert to a list of objects
                # e.g: birth_death_date.values, create col death date from that,
                # Karma create => birth_death_date.death_date
                # that's why we have this code below
                new_hnode = attr2hnodes[cmd.input_attr_paths[0]][:-1]
                new_hnode.append({"columnName": cmd.new_attr_name})

                output_columns.append(new_hnode)
                attr2hnodes[cmd] = output_columns[-1]
                attr2hnodes[new_attr_path] = output_columns[-1]

        worksheet_history = []
        # re-arrange commands to fit the issue of node id = Concept2 (Karma will convert Concept2 to Concept1)
        commands = [
            cmd for cmd in self.commands
            if isinstance(cmd, PyTransformNewColumnCmd)
        ]
        for cmd in sorted(
            [c for c in self.commands if isinstance(c, SetSemanticTypeCmd)],
                key=lambda c: c.node_id):
            commands.append(cmd)

        for cmd in sorted(
            [c for c in self.commands if isinstance(c, SetInternalLinkCmd)],
                key=lambda c: c.target_uri or c.source_uri or ""):
            commands.append(cmd)

        # sometime the model use incorrect node id like: node id = Concept7 (no Concept1..6), will result as an error in Karma
        # need to re-arrange the node_id
        node_id_old2new: Dict[str, str] = {}
        node_id_domain_count: Dict[str, int] = {}

        # First pass: renumber node ids per domain ("Concept1", "Concept2", ...)
        # in command order, so Karma sees a dense sequence.
        for cmd in commands:
            if isinstance(cmd, PyTransformNewColumnCmd):
                pass
            elif isinstance(cmd, SetSemanticTypeCmd):
                if cmd.node_id not in node_id_old2new:
                    node_id_domain_count[
                        cmd.domain] = node_id_domain_count.get(cmd.domain,
                                                               0) + 1
                    node_id_old2new[
                        cmd.
                        node_id] = f"{cmd.domain}{node_id_domain_count[cmd.domain]}"
            elif isinstance(cmd, SetInternalLinkCmd):
                if cmd.source_id not in node_id_old2new:
                    assert cmd.source_uri is not None
                    node_id_domain_count[
                        cmd.source_uri] = node_id_domain_count.get(
                            cmd.source_uri, 0) + 1
                    node_id_old2new[
                        cmd.
                        source_id] = f"{cmd.source_uri}{node_id_domain_count[cmd.source_uri]}"
                if cmd.target_id not in node_id_old2new:
                    assert cmd.target_uri is not None
                    node_id_domain_count[
                        cmd.target_uri] = node_id_domain_count.get(
                            cmd.target_uri, 0) + 1
                    node_id_old2new[
                        cmd.
                        target_id] = f"{cmd.target_uri}{node_id_domain_count[cmd.target_uri]}"

        # Second pass: emit one worksheet-history entry per command.
        for cmd in commands:
            if isinstance(cmd, PyTransformNewColumnCmd):
                pytransform_code = cmd.code
                # recover pytransform_code from our code
                pytransform_code = pytransform_code.replace(
                    "__return__ = ", "return ")
                # Rewrite getValue(<full path>) calls to use the last column
                # name, iterating matches in reverse so earlier spans stay valid.
                for match in reversed(
                        list(
                            re.finditer("getValue\(([^)]+)\)",
                                        pytransform_code))):
                    start, end = match.span(1)
                    field = pytransform_code[start:end].replace(
                        "'", "").replace('"""', "").replace('"', '')
                    # convert full name to last column name since Karma use last column name instead
                    for input_attr_path in cmd.input_attr_paths:
                        if input_attr_path == field:
                            # TODO: will Karma always use last column name?
                            field = attr2hnodes[input_attr_path][-1][
                                'columnName']
                            break
                    else:
                        assert False, f"Cannot find any field {field} in the input columns"
                    pytransform_code = pytransform_code[:
                                                        start] + f'"{field}"' + pytransform_code[
                                                            end:]

                worksheet_history.append({
                    "tags": ["Transformation"],
                    "commandName":
                    "SubmitPythonTransformationCommand",
                    "inputParameters": [{
                        "name":
                        "hNodeId",
                        "value":
                        attr2hnodes[cmd.input_attr_paths[0]],
                        "type":
                        "hNodeId"
                    }, {
                        "name": "worksheetId",
                        "value": "W",
                        "type": "worksheetId"
                    }, {
                        "name": "selectionName",
                        "value": "DEFAULT_TEST",
                        "type": "other"
                    }, {
                        "name": "newColumnName",
                        "value": cmd.new_attr_name,
                        "type": "other"
                    }, {
                        "name": "transformationCode",
                        "value": pytransform_code,
                        "type": "other"
                    }, {
                        "name": "errorDefaultValue",
                        "value": cmd.default_error_value,
                        "type": "other"
                    }, {
                        "name":
                        "inputColumns",
                        "type":
                        "hNodeIdList",
                        "value":
                        ujson.dumps([{
                            "value": attr2hnodes[iap]
                        } for iap in cmd.input_attr_paths])
                    }, {
                        "name":
                        "outputColumns",
                        "type":
                        "hNodeIdList",
                        "value":
                        ujson.dumps([{
                            "value":
                            attr2hnodes[cmd]
                            if attr2hnodes[cmd][-1]['columnName'] != "values"
                            else attr2hnodes[cmd][:-1]
                        }])
                    }]
                })
            elif isinstance(cmd, SetSemanticTypeCmd):
                if cmd.type != "karma:classLink":
                    worksheet_history.append({
                        "commandName":
                        "SetSemanticTypeCommand",
                        "tags": ["Modeling"],
                        "inputParameters": [
                            {
                                "name": "hNodeId",
                                "value": attr2hnodes[cmd.input_attr_path],
                                "type": "hNodeId"
                            },
                            {
                                "name": "worksheetId",
                                "value": "W",
                                "type": "worksheetId"
                            },
                            {
                                "name": "selectionName",
                                "value": "DEFAULT_TEST",
                                "type": "other"
                            },
                            {
                                "name":
                                "SemanticTypesArray",
                                "type":
                                "other",
                                "value": [{
                                    "FullType":
                                    ont.full_uri(cmd.type),
                                    "isPrimary":
                                    True,
                                    "DomainLabel":
                                    ont.simplify_uri(
                                        node_id_old2new[cmd.node_id]),
                                    "DomainId":
                                    ont.full_uri(node_id_old2new[cmd.node_id]),
                                    "DomainUri":
                                    ont.full_uri(cmd.domain)
                                }]
                            },
                            {
                                "name": "trainAndShowUpdates",
                                "value": False,
                                "type": "other"
                            },
                            {
                                "name": "rdfLiteralType",
                                "value": "",
                                "type": "other"
                            },  # TODO: update correct RDF-Literal-Type
                            {
                                "name":
                                "inputColumns",
                                "type":
                                "hNodeIdList",
                                "value":
                                ujson.dumps([{
                                    "value":
                                    attr2hnodes[cmd.input_attr_path]
                                }])
                            },
                            {
                                "name":
                                "outputColumns",
                                "type":
                                "hNodeIdList",
                                "value":
                                ujson.dumps([{
                                    "value":
                                    attr2hnodes[cmd.input_attr_path]
                                }])
                            }
                        ]
                    })
                else:
                    # karma:classLink marks a column as the URI of its class,
                    # which Karma models with SetMetaPropertyCommand instead.
                    worksheet_history.append({
                        "commandName":
                        "SetMetaPropertyCommand",
                        "tags": ["Modeling"],
                        "inputParameters": [
                            {
                                "name": "hNodeId",
                                "value": attr2hnodes[cmd.input_attr_path],
                                "type": "hNodeId"
                            },
                            {
                                "name": "worksheetId",
                                "value": "W",
                                "type": "worksheetId"
                            },
                            {
                                "name": "selectionName",
                                "value": "DEFAULT_TEST",
                                "type": "other"
                            },
                            {
                                "name": "metaPropertyName",
                                "value": "isUriOfClass",
                                "type": "other"
                            },
                            {
                                "name": "metaPropertyUri",
                                "value": ont.full_uri(cmd.domain),
                                "type": "other"
                            },
                            {
                                "name": "metaPropertyId",
                                "value":
                                ont.full_uri(node_id_old2new[cmd.node_id]),
                                "type": "other"
                            },
                            {
                                "name":
                                "SemanticTypesArray",
                                "type":
                                "other",
                                "value": [{
                                    "FullType":
                                    ont.full_uri(cmd.type),
                                    "isPrimary":
                                    True,
                                    "DomainLabel":
                                    ont.simplify_uri(
                                        node_id_old2new[cmd.node_id]),
                                    "DomainId":
                                    ont.full_uri(node_id_old2new[cmd.node_id]),
                                    "DomainUri":
                                    ont.full_uri(cmd.domain)
                                }]
                            },
                            {
                                "name": "trainAndShowUpdates",
                                "value": False,
                                "type": "other"
                            },
                            {
                                "name": "rdfLiteralType",
                                "value": "",
                                "type": "other"
                            },  # TODO: update correct RDF-Literal-Type
                            {
                                "name":
                                "inputColumns",
                                "type":
                                "hNodeIdList",
                                "value":
                                ujson.dumps([{
                                    "value":
                                    attr2hnodes[cmd.input_attr_path]
                                }])
                            },
                            {
                                "name":
                                "outputColumns",
                                "type":
                                "hNodeIdList",
                                "value":
                                ujson.dumps([{
                                    "value":
                                    attr2hnodes[cmd.input_attr_path]
                                }])
                            }
                        ]
                    })
            elif isinstance(cmd, SetInternalLinkCmd):
                # TODO: comment out because old KARMA doesn't recognize this!
                # if cmd.target_uri is not None or cmd.source_uri is not None:
                #     worksheet_history.append({
                #         "commandName": "AddLinkCommand",
                #         "tags": ["Modeling"],
                #         "inputParameters": [
                #             {"name": "worksheetId", "value": "W", "type": "worksheetId"},
                #             {
                #                 "name": "edge",
                #                 "type": "other",
                #                 "value": {
                #                     "edgeId": ont.full_uri(cmd.link_lbl),
                #                     "edgeTargetId": ont.full_uri(node_id_old2new[cmd.target_id]),
                #                     "edgeTargetUri": ont.full_uri(cmd.target_uri or cmd.target_id[:-1]),
                #                     "edgeSourceId": ont.full_uri(node_id_old2new[cmd.source_id]),
                #                     "edgeSourceUri": ont.full_uri(cmd.source_uri or cmd.source_id[:-1])
                #                 }
                #             },
                #             {"name": "inputColumns", "type": "hNodeIdList", "value": []},
                #             {"name": "outputColumns", "type": "hNodeIdList", "value": []}
                #         ]
                #     })
                # else:
                worksheet_history.append({
                    "commandName":
                    "ChangeInternalNodeLinksCommand",
                    "tags": ["Modeling"],
                    "inputParameters": [{
                        "name": "worksheetId",
                        "value": "W",
                        "type": "worksheetId"
                    }, {
                        "name":
                        "initialEdges",
                        "type":
                        "other",
                        "value": [{
                            "edgeId":
                            ont.full_uri(cmd.link_lbl),
                            "edgeTargetId":
                            ont.full_uri(node_id_old2new[cmd.target_id]),
                            "edgeSourceId":
                            ont.full_uri(node_id_old2new[cmd.source_id])
                        }]
                    }, {
                        "name":
                        "newEdges",
                        "type":
                        "other",
                        "value": [{
                            "edgeId":
                            ont.full_uri(cmd.link_lbl),
                            "edgeTargetId":
                            ont.full_uri(node_id_old2new[cmd.target_id]),
                            "edgeSourceId":
                            ont.full_uri(node_id_old2new[cmd.source_id]),
                            "edgeTargetUri":
                            ont.full_uri(
                                cmd.target_uri
                                or node_id_old2new[cmd.target_id][:-1]),
                            "edgeSourceUri":
                            ont.full_uri(
                                cmd.source_uri
                                or node_id_old2new[cmd.source_id][:-1])
                        }]
                    }, {
                        "name": "inputColumns",
                        "type": "hNodeIdList",
                        "value": []
                    }, {
                        "name": "outputColumns",
                        "type": "hNodeIdList",
                        "value": []
                    }]
                })

        # Attach the assembled column lists and history to the mapping node,
        # then write the graph to disk in N3 format.
        g.add((kr2rml, km_dev.hasInputColumns,
               Literal(ujson.dumps(input_columns))))
        g.add((kr2rml, km_dev.hasOutputColumns,
               Literal(ujson.dumps(output_columns))))
        g.add((kr2rml, km_dev.hasModelLabel, Literal(tbl.id)))
        g.add((kr2rml, km_dev.hasBaseURI,
               Literal("http://localhost:8080/source/")))
        g.add((kr2rml, km_dev.hasWorksheetHistory,
               Literal(ujson.dumps(worksheet_history, indent=4))))

        g.serialize(str(fpath), format='n3')
Example #9
0
    def write_skos(self, directory):
        """Convert the v3 SKOS graph into v4 thesaurus and collection files.

        Parses ``self.v3_skos``, builds two graphs (a thesaurus of all
        concepts/concept schemes, and a collections graph with one
        collection per top concept), and serializes them to
        ``<directory>/concepts/thesaurus.xml`` and
        ``<directory>/collections/collections.xml`` as pretty-printed XML.
        Calls ``exit()`` if the v3 prefLabel content is not JSON.
        """

        # parse the original v3 graph
        v3graph = RDFGraph()
        v3graph.parse(data=self.v3_skos)

        # create the namespace manager
        namespaces = (
            ("arches", ARCHES),
            ("skos", SKOS),
            ("dcterms", DCTERMS)
        )
        nsmanager = NamespaceManager(RDFGraph())
        for ns in namespaces:
            nsmanager.bind(ns[0], ns[1])

        # create the output graphs with the new namespace manager
        v4thesaurus = RDFGraph(namespace_manager=nsmanager)
        v4collections = RDFGraph(namespace_manager=nsmanager)

        # add the concept schemes to the thesaurus
        concept_schemes = [i for i in v3graph.triples((None, RDF.type, SKOS['ConceptScheme']))]
        for cs in concept_schemes:
            v4thesaurus.add(cs)

        # iterate the concepts and make collections for them.
        topconcepts = [i for i in v3graph.triples((None, SKOS['hasTopConcept'], None))]
        for tc in topconcepts:

            # get the top concept name and if convert it to a Literal object
            tc_name_literal = v3graph.value(subject=tc[2], predicate=SKOS['prefLabel'])

            # get the value from the JSON formatted Literal content
            # if the Literal content is NOT JSON, then this reference data was
            # exported from v3 with the wrong command and will not work.
            try:
                tc_name = json.loads(tc_name_literal.value)['value']
                collection_id = self.new_or_existing_uuid(tc_name)
            except ValueError:
                docs = "https://arches.readthedocs.io/en/stable/v3-to-v4-migration/"
                print("ERROR: Incompatible SKOS. See {} for more information.".format(docs))
                exit()

            if self.verbose:
                children = [i for i in v3graph.triples((tc[2], SKOS['narrower'], None))]
                print("{}: {} immediate child concepts".format(tc_name, len(children)))
                print("    collection uuid: "+collection_id)

            # create a new collection for each top concept
            v4thesaurus.add(tc)
            v4collections.add((ARCHES[collection_id], RDF.type, SKOS['Collection']))

            # add the preflabel for the collection, if it's not the r2r types collection
            # which already has a label in Arches by default.
            if tc_name != "Resource To Resource Relationship Types":
                simple_tc_name = Literal(tc_name, lang="en-US")
                v4collections.add((ARCHES[collection_id], SKOS['prefLabel'], simple_tc_name))

            # recursively add all of the concept children to the collection for this
            # top concept.
            v4collections = self.add_children_to_collection(v3graph, v4collections,
                                                            collection_id, tc[2])

        # add ALL concepts from the v3 graph to the thesaurus. this pulls along all
        # child/parent relationships into the thesaurus, as well as all extra info
        # for each concept, like sortorder, prefLabel, etc.
        for concept in v3graph.triples((None, RDF.type, SKOS['Concept'])):
            v4thesaurus.add(concept)

            # this is the extra info related to each concept, like prefLabel, sortorder, etc.
            for s, p, o in v3graph.triples((concept[0], None, None)):
                # skip the label of the resource to resource relationship type concept
                # as it's already in Arches and this would duplicate it.
                if s.endswith("000004") and p == SKOS['prefLabel']:
                    continue
                v4thesaurus.add((s, p, o))

        # export the thesaurus and collections to predetermined locations within the
        # package file structure.
        thesaurus_file = os.path.join(directory, 'concepts', 'thesaurus.xml')
        if self.verbose:
            print("writing thesaurus to: "+thesaurus_file)
        v4thesaurus.serialize(destination=thesaurus_file, format="pretty-xml")

        collections_file = os.path.join(directory, 'collections', 'collections.xml')
        if self.verbose:
            print("writing collections to: "+collections_file)
        v4collections.serialize(destination=collections_file, format="pretty-xml")
Example #10
0
 def parse(self, uri, fmt="ttl"):
     """Parse RDF from *uri* and append every (s, p, o) triple to self."""
     parsed = RDFGraph().parse(uri, format=fmt)
     for triple in parsed:
         self.append(triple)