コード例 #1
0
def add_ont_paths(graph: IntGraph, ont: Ontology, ont_graph: OntGraph) -> None:
    """Add to `graph` the links that the ontology graph allows between class nodes.

    For every ordered pair of distinct class nodes in `graph`, the matching
    nodes of `ont_graph` are looked up by label and the predicates possible
    between them are collected. Each predicate that is not yet a link from the
    first node to the second is added as a new link of unspecified type,
    tagged `Tag.ONT_GRAPH_SOURCE`.
    """
    for src in graph.iter_class_nodes():
        for dst in graph.iter_class_nodes():
            if src == dst:
                continue

            # the first ontology node carrying the same label stands for the class
            src_cls = next(ont_graph.iter_nodes_by_label(src.label))
            dst_cls = next(ont_graph.iter_nodes_by_label(dst.label))
            src_uri = ont.full_uri(src_cls.label.decode('utf-8'))
            dst_uri = ont.full_uri(dst_cls.label.decode('utf-8'))

            for predicate in ont_graph.get_possible_predicates(
                    src_uri, dst_uri):
                # link labels are stored as utf-8 bytes of the simplified URI
                label = ont.simplify_uri(predicate.uri).encode('utf-8')
                already_linked = any(
                    link.source_id == src.id and link.label == label
                    for link in dst.iter_incoming_links())
                if already_linked:
                    continue

                graph.real_add_new_link(
                    IntGraphLink({Tag.ONT_GRAPH_SOURCE}),
                    GraphLinkType.UNSPECIFIED, label, src.id, dst.id)
コード例 #2
0
    def to_kr2rml(self, ont: Ontology, tbl: DataTable,
                  fpath: Union[str, Path]):
        """Write this model to `fpath` as a Karma KR2RML mapping (N3 format).

        The mapping contains the input/output column descriptions derived from
        the schema of `tbl` and a worksheet history built from `self.commands`:
        python transformations first, then semantic types (ordered by node id),
        then internal links (ordered by target/source URI).

        Node ids are renumbered per domain (`<domain>1`, `<domain>2`, ...)
        because ids with gaps (e.g. `Concept7` without `Concept1..6`) result
        in an error in Karma.

        Args:
            ont: ontology used to expand and simplify URIs.
            tbl: table whose schema supplies the attribute paths and whose id
                is used as source name and model label.
            fpath: path of the file to write.

        Raises:
            AssertionError: if a `getValue(...)` argument in a transformation
                is not one of the command's input attribute paths, or if an
                internal link refers to a node that has not been seen before
                and carries no URI for it.
        """

        def worksheet_param() -> dict:
            return {"name": "worksheetId", "value": "W", "type": "worksheetId"}

        def selection_param() -> dict:
            return {
                "name": "selectionName",
                "value": "DEFAULT_TEST",
                "type": "other"
            }

        def column_list_param(name: str, hnodes: list) -> dict:
            # Karma expects the hNodeIdList value as a JSON string
            return {
                "name": name,
                "type": "hNodeIdList",
                "value": ujson.dumps([{"value": hnode} for hnode in hnodes])
            }

        g = RDFGraph()
        km_dev = Namespace("http://isi.edu/integration/karma/dev#")
        g.namespace_manager.bind("km-dev", km_dev)
        kr2rml = BNode()

        g.add((kr2rml, RDF.type, km_dev.R2RMLMapping))
        g.add((kr2rml, km_dev.sourceName, Literal(tbl.id)))
        # timestamp and version, doesn't need to be precise
        g.add((kr2rml, km_dev.modelPublicationTime, Literal(1414133381264)))
        g.add((kr2rml, km_dev.modelVersion, Literal("1.7")))

        input_columns = []
        output_columns = []
        # mapping from Schema attribute path OR Command to KarmaColumns
        attr2hnodes: Dict[Union[str, PyTransformNewColumnCmd],
                          List[Dict[str, str]]] = {}

        for attr_path in tbl.schema.get_attr_paths():
            hnode = [{
                "columnName": x
            } for x in attr_path.split(Schema.PATH_DELIMITER)]
            if tbl.schema.get_attr_type(attr_path) == Schema.LIST_VALUE:
                # default karma behaviour, you cannot set semantic type for
                # higher level, but only "values"
                hnode.append({"columnName": "values"})
            input_columns.append(hnode)
            output_columns.append(hnode)
            attr2hnodes[attr_path] = hnode

        for cmd in self.commands:
            if not isinstance(cmd, PyTransformNewColumnCmd):
                continue

            first_input = cmd.input_attr_paths[0]
            parent_path = first_input.split(Schema.PATH_DELIMITER)[:-1]
            new_attr_path = Schema.PATH_DELIMITER.join(
                parent_path + [cmd.new_attr_name])

            # when you create a new column from a list, karma converts it to
            # a list of objects, e.g: from birth_death_date.values, creating
            # col death_date makes Karma create birth_death_date.death_date;
            # that's why the last level is replaced here
            new_hnode = attr2hnodes[first_input][:-1]
            new_hnode.append({"columnName": cmd.new_attr_name})

            output_columns.append(new_hnode)
            attr2hnodes[cmd] = new_hnode
            attr2hnodes[new_attr_path] = new_hnode

        worksheet_history = []
        # re-arrange commands to fit the issue of node id = Concept2 (Karma
        # will convert Concept2 to Concept1)
        commands = [
            cmd for cmd in self.commands
            if isinstance(cmd, PyTransformNewColumnCmd)
        ]
        commands.extend(
            sorted((c for c in self.commands
                    if isinstance(c, SetSemanticTypeCmd)),
                   key=lambda c: c.node_id))
        commands.extend(
            sorted((c for c in self.commands
                    if isinstance(c, SetInternalLinkCmd)),
                   key=lambda c: c.target_uri or c.source_uri or ""))

        # sometimes the model uses incorrect node ids like: node id = Concept7
        # (no Concept1..6), which results in an error in Karma, so the node
        # ids are re-assigned per domain
        node_id_old2new: Dict[str, str] = {}
        node_id_domain_count: Dict[str, int] = {}
        # domain of each (old) node id; used as URI fallback for links so the
        # domain never has to be recovered by chopping the counter off the
        # new id (which breaks as soon as the counter has two digits)
        node_id_domain: Dict[str, str] = {}

        def register_node(old_id: str, domain: str) -> None:
            count = node_id_domain_count.get(domain, 0) + 1
            node_id_domain_count[domain] = count
            node_id_old2new[old_id] = f"{domain}{count}"
            node_id_domain[old_id] = domain

        for cmd in commands:
            if isinstance(cmd, SetSemanticTypeCmd):
                if cmd.node_id not in node_id_old2new:
                    register_node(cmd.node_id, cmd.domain)
            elif isinstance(cmd, SetInternalLinkCmd):
                if cmd.source_id not in node_id_old2new:
                    assert cmd.source_uri is not None
                    register_node(cmd.source_id, cmd.source_uri)
                if cmd.target_id not in node_id_old2new:
                    assert cmd.target_uri is not None
                    register_node(cmd.target_id, cmd.target_uri)

        for cmd in commands:
            if isinstance(cmd, PyTransformNewColumnCmd):
                # recover pytransform_code from our code
                pytransform_code = cmd.code.replace("__return__ = ",
                                                    "return ")
                # walk the matches backwards so that replacing one argument
                # does not shift the spans of the matches still to process
                for match in reversed(
                        list(
                            re.finditer(r"getValue\(([^)]+)\)",
                                        pytransform_code))):
                    start, end = match.span(1)
                    field = pytransform_code[start:end].replace(
                        "'", "").replace('"""', "").replace('"', '')
                    if field not in cmd.input_attr_paths:
                        raise AssertionError(
                            f"Cannot find any field {field} in the input columns"
                        )
                    # convert full name to last column name since Karma uses
                    # the last column name instead
                    # TODO: will Karma always use last column name?
                    field = attr2hnodes[field][-1]['columnName']
                    pytransform_code = (pytransform_code[:start] +
                                        f'"{field}"' +
                                        pytransform_code[end:])

                out_hnode = attr2hnodes[cmd]
                if out_hnode[-1]['columnName'] == "values":
                    out_hnode = out_hnode[:-1]

                worksheet_history.append({
                    "tags": ["Transformation"],
                    "commandName": "SubmitPythonTransformationCommand",
                    "inputParameters": [
                        {
                            "name": "hNodeId",
                            "value": attr2hnodes[cmd.input_attr_paths[0]],
                            "type": "hNodeId"
                        },
                        worksheet_param(),
                        selection_param(),
                        {
                            "name": "newColumnName",
                            "value": cmd.new_attr_name,
                            "type": "other"
                        },
                        {
                            "name": "transformationCode",
                            "value": pytransform_code,
                            "type": "other"
                        },
                        {
                            "name": "errorDefaultValue",
                            "value": cmd.default_error_value,
                            "type": "other"
                        },
                        column_list_param(
                            "inputColumns",
                            [attr2hnodes[iap]
                             for iap in cmd.input_attr_paths]),
                        column_list_param("outputColumns", [out_hnode]),
                    ]
                })
            elif isinstance(cmd, SetSemanticTypeCmd):
                hnode = attr2hnodes[cmd.input_attr_path]
                new_node_id = node_id_old2new[cmd.node_id]
                is_class_link = cmd.type == "karma:classLink"

                params = [
                    {
                        "name": "hNodeId",
                        "value": hnode,
                        "type": "hNodeId"
                    },
                    worksheet_param(),
                    selection_param(),
                ]
                if is_class_link:
                    # a class link is expressed as a meta property that marks
                    # the column as holding the URI of the class
                    params.extend([
                        {
                            "name": "metaPropertyName",
                            "value": "isUriOfClass",
                            "type": "other"
                        },
                        {
                            "name": "metaPropertyUri",
                            "value": ont.full_uri(cmd.domain),
                            "type": "other"
                        },
                        {
                            "name": "metaPropertyId",
                            "value": ont.full_uri(new_node_id),
                            "type": "other"
                        },
                    ])
                params.extend([
                    {
                        "name": "SemanticTypesArray",
                        "type": "other",
                        "value": [{
                            "FullType": ont.full_uri(cmd.type),
                            "isPrimary": True,
                            "DomainLabel": ont.simplify_uri(new_node_id),
                            "DomainId": ont.full_uri(new_node_id),
                            "DomainUri": ont.full_uri(cmd.domain)
                        }]
                    },
                    {
                        "name": "trainAndShowUpdates",
                        "value": False,
                        "type": "other"
                    },
                    {
                        "name": "rdfLiteralType",
                        "value": "",
                        "type": "other"
                    },  # TODO: update correct RDF-Literal-Type
                    column_list_param("inputColumns", [hnode]),
                    column_list_param("outputColumns", [hnode]),
                ])

                worksheet_history.append({
                    "commandName":
                    "SetMetaPropertyCommand"
                    if is_class_link else "SetSemanticTypeCommand",
                    "tags": ["Modeling"],
                    "inputParameters": params
                })
            elif isinstance(cmd, SetInternalLinkCmd):
                # NOTE: an "AddLinkCommand" entry is deliberately not emitted
                # here because old Karma doesn't recognize it; the link is
                # expressed with ChangeInternalNodeLinksCommand instead.
                edge = {
                    "edgeId": ont.full_uri(cmd.link_lbl),
                    "edgeTargetId":
                    ont.full_uri(node_id_old2new[cmd.target_id]),
                    "edgeSourceId":
                    ont.full_uri(node_id_old2new[cmd.source_id])
                }
                new_edge = dict(edge)
                # when the command has no URI, fall back to the domain the
                # node was registered with
                new_edge["edgeTargetUri"] = ont.full_uri(
                    cmd.target_uri or node_id_domain[cmd.target_id])
                new_edge["edgeSourceUri"] = ont.full_uri(
                    cmd.source_uri or node_id_domain[cmd.source_id])

                worksheet_history.append({
                    "commandName": "ChangeInternalNodeLinksCommand",
                    "tags": ["Modeling"],
                    "inputParameters": [
                        worksheet_param(),
                        {
                            "name": "initialEdges",
                            "type": "other",
                            "value": [edge]
                        },
                        {
                            "name": "newEdges",
                            "type": "other",
                            "value": [new_edge]
                        },
                        {
                            "name": "inputColumns",
                            "type": "hNodeIdList",
                            "value": []
                        },
                        {
                            "name": "outputColumns",
                            "type": "hNodeIdList",
                            "value": []
                        },
                    ]
                })

        g.add((kr2rml, km_dev.hasInputColumns,
               Literal(ujson.dumps(input_columns))))
        g.add((kr2rml, km_dev.hasOutputColumns,
               Literal(ujson.dumps(output_columns))))
        g.add((kr2rml, km_dev.hasModelLabel, Literal(tbl.id)))
        g.add((kr2rml, km_dev.hasBaseURI,
               Literal("http://localhost:8080/source/")))
        g.add((kr2rml, km_dev.hasWorksheetHistory,
               Literal(ujson.dumps(worksheet_history, indent=4))))

        g.serialize(str(fpath), format='n3')
コード例 #3
0
    def to_normalized_json_model(self, ont: Ontology = None) -> dict:
        """Dump the normalized/changed model back to the Karma model JSON format.

        Few changes:
            + All ids are converted from int to str so that they are compatible
              with the source_id of a link (str type due to split("---")).
            + LiteralNodes are converted to ColumnNodes (a LiteralNode is
              treated as a column that contains only one value) and a new
              column name is generated for each LiteralNode.

        Args:
            ont: optional ontology used to restore URIs from the simplified
                version (e.g: crm:E39_Actor) to the full version
                (http://www.cidoc-crm.org/cidoc-crm/E39_Actor). When omitted,
                a `UselessOntology` is used.

        Returns:
            A dict with the keys "id", "name", "description", "sourceColumns",
            "mappingToSourceColumns" and "graph" ("nodes" and "links").

        Raises:
            ValueError: if the type of a link cannot be determined.
        """
        if ont is None:
            ont = UselessOntology()

        def semantic_type(node_id, domain: str, type_: str, origin: str,
                          confidence_score) -> dict:
            # one entry of "userSemanticTypes" / "learnedSemanticTypes"
            return {
                "hNodeId": str(node_id),
                "domain": {
                    "uri": ont.full_uri(domain),
                    "rdfsLabel": None
                },
                "type": {
                    "uri": ont.full_uri(type_),
                    "rdfsLabel": None
                },
                "origin": origin,
                "confidenceScore": confidence_score
            }

        nodes = []
        links = []

        # add literal nodes to source_columns
        source_columns = [{
            "id": str(col.id),
            "hNodeId": str(col.h_node_id),
            "columnName": col.column_name
        } for col in self.source_columns]
        count = len(self.source_columns)
        for node in self.karma_graph.iter_data_nodes():
            if not node.is_literal_node:
                continue
            slug = node.label.decode('utf-8').lower().replace(" ", "-")
            source_columns.append({
                "id": str(node.id),
                "hNodeId": str(node.id),
                "columnName": f"A{count}__literal_val_{slug}"
            })
            count += 1
        colid2name: Dict[int, str] = {
            int(col['id']): col["columnName"]
            for col in source_columns
        }

        for node in self.karma_graph.iter_nodes():
            onode = {
                "id": str(node.id),
                "modelIds": None,
                "type":
                "InternalNode" if node.is_class_node() else "ColumnNode",
                "label": {
                    "uri": node.label.decode("utf-8")
                }
            }
            if node.is_data_node():
                onode["hNodeId"] = str(node.id)
                onode["columnName"] = colid2name[node.id]
                if node.literal_type is None:
                    onode["rdfLiteralType"] = None
                else:
                    onode["rdfLiteralType"] = {"uri": node.literal_type}

                if node.is_literal_node:
                    # a literal node has no semantic types of its own; derive
                    # one from its first incoming link and that link's source
                    parent_link = node.get_first_incoming_link()
                    onode["userSemanticTypes"] = [
                        semantic_type(
                            node.id,
                            parent_link.get_source_node().label.decode(
                                "utf-8"), parent_link.label.decode("utf-8"),
                            "User", 1.0)
                    ]
                    onode["learnedSemanticTypes"] = []
                else:
                    onode["userSemanticTypes"] = [
                        semantic_type(node.id, st.domain, st.type, st.origin,
                                      st.confidence_score)
                        for st in node.user_semantic_types
                    ]
                    onode["learnedSemanticTypes"] = [
                        semantic_type(node.id, st.domain, st.type,
                                      "AutoModel", st.confidence_score)
                        for st in node.learned_semantic_types
                    ]
            else:
                onode["label"]["uri"] = ont.full_uri(onode["label"]["uri"])

            nodes.append(onode)

        for link in self.karma_graph.iter_links():
            if link.type == GraphLinkType.OBJECT_PROPERTY:
                link_type = 'ObjectPropertyLink'
            elif link.type == GraphLinkType.DATA_PROPERTY:
                link_type = 'DataPropertyLink'
            elif link.label == b'karma:dev':
                # link labels are bytes (they are decoded below), so the
                # comparison has to be against a bytes literal
                # NOTE(review): confirm that 'karma:dev' is the intended label
                # of a class-instance link
                link_type = 'ClassInstanceLink'
            elif link.get_target_node().is_data_node():
                link_type = "DataPropertyLink"
            elif link.get_target_node().is_class_node():
                link_type = "ObjectPropertyLink"
            else:
                # never silently reuse the type of the previous link
                raise ValueError(
                    "Cannot determine the type of link %s---%s---%s" %
                    (link.source_id, link.label.decode("utf-8"),
                     link.target_id))

            link_label = link.label.decode("utf-8")
            links.append({
                "id": "%s---%s---%s" %
                (link.source_id, link_label, link.target_id),
                "weight": None,
                "type": link_type,
                "label": {
                    "uri": ont.full_uri(link_label)
                },
                "objectPropertyType": "Indirect",
                "status": "Normal",
                "keyInfo": "None",
                "modelIds": None
            })

        model_json = {
            "id": self.id,
            "name": self.id,
            "description": self.description,
            "sourceColumns": source_columns,
            "mappingToSourceColumns": [{
                "id": col["id"],
                "sourceColumnId": col["id"]
            } for col in source_columns],
            "graph": {
                "nodes": nodes,
                "links": links
            }
        }
        return model_json