Example 1
    def from_json(obj: dict, ont: Ontology) -> 'SSD':
        g = Graph(True, True, True)
        node2attr = {x['node']: x['attribute'] for x in obj['mappings']}
        idmap = {}
        raw_attributes = {}
        for raw_attr in obj['attributes']:
            assert len(raw_attr['columnIds']) == 1 \
                and raw_attr['columnIds'][0] == raw_attr['id']
            raw_attributes[raw_attr['id']] = raw_attr

        attrs = []
        for n in obj['semanticModel']['nodes']:
            if n['type'] == 'DataNode':
                node_type = GraphNodeType.DATA_NODE
                attr = raw_attributes[node2attr[n['id']]]
                n_lbl = attr['name']
                attrs.append(SSDAttribute(n['id'], n_lbl))
            else:
                node_type = GraphNodeType.CLASS_NODE
                n_lbl = n['prefix'] + n['label']
                n_lbl = ont.simplify_uri(n_lbl)

            idmap[n['id']] = g.add_new_node(node_type, n_lbl.encode()).id

        for e in obj['semanticModel']['links']:
            e_lbl = e['prefix'] + e['label']
            e_lbl = ont.simplify_uri(e_lbl)
            g.add_new_link(GraphLinkType.UNSPECIFIED, e_lbl.encode(),
                           idmap[e['source']], idmap[e['target']])

        return SSD(obj['name'], attrs, g, ont)
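A minimal sketch of the `obj` dict that `from_json` appears to expect, reconstructed only from the keys the method reads above; every id, name, and URI below is an illustrative assumption, not taken from a real dataset:

example_obj = {
    "name": "s01-example",
    "attributes": [
        # each attribute maps to exactly one column whose id equals the attribute id
        {"id": 7, "columnIds": [7], "name": "artist_name"},
    ],
    "mappings": [
        # DataNode id -> attribute id
        {"node": 2, "attribute": 7},
    ],
    "semanticModel": {
        "nodes": [
            {"id": 1, "type": "ClassNode",
             "prefix": "http://example.org/crm#", "label": "E39_Actor"},
            {"id": 2, "type": "DataNode"},
        ],
        "links": [
            {"source": 1, "target": 2,
             "prefix": "http://example.org/crm#", "label": "name"},
        ],
    },
}
# hypothetical call, assuming from_json is exposed as a staticmethod on SSD:
# ssd = SSD.from_json(example_obj, ont)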
Example 2
    def from_karma_model(
        node: dict, ont: Ontology, id2columns: Dict[str, 'KarmaSourceColumn']
    ) -> Tuple['KarmaGraphNode', int, bytes]:
        assert node['type'] in {'ColumnNode', 'InternalNode', 'LiteralNode'}, \
            "Unrecognized type: %s" % node['type']
        if node['type'] in {'ColumnNode', 'LiteralNode'}:
            node_type = GraphNodeType.DATA_NODE
        else:
            assert node['type'] == "InternalNode", node['type']
            node_type = GraphNodeType.CLASS_NODE
        is_literal_node = False

        if node_type == GraphNodeType.DATA_NODE:
            # IMPORTANT: this relates to SourceColumn::get_unique_column_name
            if node['type'] == 'LiteralNode':
                # try to make a short, readable label using a heuristic
                label = node['value']
                is_literal_node = True
            else:
                label = id2columns[node['id']].column_name
        else:
            label = ont.simplify_uri(node['label']['uri'])

        user_semantic_types = []
        if 'user_semantic_types' in node:
            for x in node['user_semantic_types']:
                x = _dict_camel_to_snake(x)
                x['domain'] = ont.simplify_uri(x['domain']['uri'])
                x['type'] = ont.simplify_uri(x['type']['uri'])
                user_semantic_types.append(KarmaSemanticType(**x))

        # data sources may contain duplicates, so filter out duplicated semantic types
        user_semantic_types = unique_values(user_semantic_types,
                                            key=lambda n: n.get_hashing_id())

        learned_semantic_types = []
        if node.get('learned_semantic_types') is not None:
            for x in node['learned_semantic_types']:
                x = _dict_camel_to_snake(x)
                x['domain'] = ont.simplify_uri(x['domain']['uri'])
                x['type'] = ont.simplify_uri(x['type']['uri'])
                x['h_node_id'] = node['h_node_id']
                learned_semantic_types.append(KarmaSemanticType(**x))

        # double check data
        assert node['model_ids'] is None
        if 'rdf_literal_type' not in node or node['rdf_literal_type'] is None:
            literal_type = None
        else:
            literal_type = node['rdf_literal_type']['uri']
        return (KarmaGraphNode(user_semantic_types, learned_semantic_types,
                               literal_type, is_literal_node),
                node_type, label.encode('utf-8'))
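A hedged sketch of the node dicts this method consumes, inferred from the keys it reads (the snake_case top-level keys suggest the raw Karma JSON has already been converted by something like `_dict_camel_to_snake`); all values are illustrative:

internal_node = {
    "id": "n1", "type": "InternalNode", "model_ids": None,
    "label": {"uri": "http://example.org/crm#E39_Actor"},
}
column_node = {
    "id": "n2", "type": "ColumnNode", "model_ids": None,
    "h_node_id": "n2", "rdf_literal_type": None,
    # entries are camelCase dicts that get snake_cased and URI-simplified
    # before becoming KarmaSemanticType objects
    "user_semantic_types": [],
    "learned_semantic_types": None,
}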
Example 3
def add_ont_paths(graph: IntGraph, ont: Ontology, ont_graph: OntGraph) -> None:
    for u in graph.iter_class_nodes():
        for v in graph.iter_class_nodes():
            if u == v:
                continue

            c1 = next(ont_graph.iter_nodes_by_label(u.label))
            c2 = next(ont_graph.iter_nodes_by_label(v.label))
            possible_predicates = ont_graph.get_possible_predicates(
                ont.full_uri(c1.label.decode('utf-8')),
                ont.full_uri(c2.label.decode('utf-8')))

            for p in possible_predicates:
                p_lbl = ont.simplify_uri(p.uri).encode('utf-8')
                e = next((e for e in v.iter_incoming_links()
                          if e.source_id == u.id and e.label == p_lbl), None)
                if e is None:
                    e = IntGraphLink({Tag.ONT_GRAPH_SOURCE})
                    graph.real_add_new_link(e, GraphLinkType.UNSPECIFIED,
                                            p_lbl, u.id, v.id)
Example 4
    def from_karma_model(
            link: dict,
            ont: Ontology) -> Tuple['KarmaGraphLink', int, bytes, int, int]:
        if link['type'] == 'ObjectPropertyLink':
            link_type = GraphLinkType.OBJECT_PROPERTY
        elif link['type'] == 'DataPropertyLink':
            link_type = GraphLinkType.DATA_PROPERTY
        else:
            assert link['type'] == 'ClassInstanceLink'
            link_type = GraphLinkType.URI_PROPERTY

        label = ont.simplify_uri(link['label']['uri']).encode('utf-8')
        return (KarmaGraphLink(link['weight']), link_type, label,
                link['source_id'], link['target_id'])
Example 5
def get_ontology(dataset: str) -> Ontology:
    """Get ontology of a given dataset"""
    global _data_io_vars
    if dataset not in _data_io_vars["ont"]:
        # load from cache if available; otherwise build it and cache it
        cache_file = get_cache_dir(dataset) / 'ont.pkl'
        cache_file.parent.mkdir(exist_ok=True, parents=True)
        if cache_file.exists():
            ont = deserialize(cache_file)
        else:
            ont = Ontology.from_dataset(dataset)
            serialize(ont, cache_file)
        _data_io_vars["ont"][dataset] = ont

    return _data_io_vars["ont"][dataset]
Example 6
def create_node_args(
        ont: Ontology, cls: OntoClass
) -> Optional[Tuple[int, bytes, str, Set[str], Set[str]]]:
    if not filter_uri(str(cls.uri)):
        return None
    if is_data_node(str(cls.uri)):
        node_type = GraphNodeType.DATA_NODE
    else:
        node_type = GraphNodeType.CLASS_NODE

    if cls.sparqlHelper is not None:
        parents = {
            str(x[0])
            for x in cls.sparqlHelper.getClassAllSupers(cls.uri)
        }
    else:
        parents = set()
    children = set()
    uri = str(cls.uri)
    return (node_type, ont.simplify_uri(uri).encode('utf-8'), uri, parents,
            children)
Example 7
def build_ont_graph(dataset: str) -> OntGraph:
    ont = Ontology.from_dataset(dataset)
    ont_graph: OntGraph = OntGraph(dataset)
    predicates: Dict[str, Predicate] = {}

    for ont_name, ont_conf in config.datasets[dataset].ontology.items():
        fpaths = []
        if 'fpath' in ont_conf:
            fpaths = [ont_conf.fpath]
        elif 'fpaths' in ont_conf:
            fpaths = ont_conf.fpaths

        for fpath in fpaths:
            g = ontospy.Ontospy(str(fpath.as_path()))
            is_rdf_type_reliable = False

            for cls in g.classes:
                add_node(ont, ont_graph, cls)

            for prop in g.properties:
                for rg in prop.ranges:
                    add_node(ont, ont_graph, rg)
                for domain in prop.domains:
                    add_node(ont, ont_graph, domain)

                try:
                    predicate = Predicate(str(prop.uri),
                                          [str(x.uri) for x in prop.domains],
                                          [str(x.uri) for x in prop.ranges],
                                          ont.simplify_uri(str(prop.rdftype)),
                                          False, {ont_name})

                    if str(prop.uri) in predicates:
                        predicates[str(prop.uri)].merge(predicate)
                    else:
                        predicates[str(prop.uri)] = predicate

                    if predicate.rdf_type in {
                            PredicateType.OWL_DATA_PROP,
                            PredicateType.OWL_OBJECT_PROP
                    }:
                        is_rdf_type_reliable = True
                except Exception:
                    print(ont_name, prop)
                    print(prop.__dict__)
                    raise

            for uri, predicate in predicates.items():
                if ont_name in predicate.defined_in_onts:
                    predicate.is_rdf_type_reliable = is_rdf_type_reliable

    ont_graph.set_predicates(list(predicates.values()))
    # update parent & children between nodes
    for node in ont_graph.iter_nodes():
        for node_uri in node.parents_uris.union(node.children_uris):
            if not ont_graph.has_node_with_uri(node_uri):
                # the node is referenced via subClassOf but was never defined
                ont_graph.add_new_node(
                    GraphNodeType.CLASS_NODE,
                    ont.simplify_uri(node_uri).encode('utf-8'), node_uri,
                    set(), set())

    for node in ont_graph.iter_nodes():
        for parent_uri in node.parents_uris:
            ont_graph.get_node_by_uri(parent_uri).children_uris.add(node.uri)
        for child_uri in node.children_uris:
            ont_graph.get_node_by_uri(child_uri).parents_uris.add(node.uri)
    return ont_graph
Example 8
    def to_kr2rml(self, ont: Ontology, tbl: DataTable,
                  fpath: Union[str, Path]):
        g = RDFGraph()
        km_dev = Namespace("http://isi.edu/integration/karma/dev#")
        g.namespace_manager.bind("km-dev", km_dev)
        kr2rml = BNode()

        g.add((kr2rml, RDF.type, km_dev.R2RMLMapping))
        g.add((kr2rml, km_dev.sourceName, Literal(tbl.id)))
        # timestamp and version, doesn't need to be precise
        g.add((kr2rml, km_dev.modelPublicationTime, Literal(1414133381264)))
        g.add((kr2rml, km_dev.modelVersion, Literal("1.7")))

        input_columns = []
        output_columns = []
        # mapping from Schema attribute path OR Command to KarmaColumns
        attr2hnodes: Dict[Union[str, PyTransformNewColumnCmd],
                          List[Dict[str, str]]] = {}

        for attr_path in tbl.schema.get_attr_paths():
            input_columns.append([{
                "columnName": x
            } for x in attr_path.split(Schema.PATH_DELIMITER)])
            if tbl.schema.get_attr_type(attr_path) == Schema.LIST_VALUE:
                # default Karma behaviour: you cannot set a semantic type at a higher level, only on "values"
                input_columns[-1].append({"columnName": "values"})
            output_columns.append(input_columns[-1])
            attr2hnodes[attr_path] = input_columns[-1]

        for cmd in self.commands:
            if isinstance(cmd, PyTransformNewColumnCmd):
                new_attr_path = cmd.input_attr_paths[0].split(
                    Schema.PATH_DELIMITER)[:-1]
                new_attr_path.append(cmd.new_attr_name)
                new_attr_path = Schema.PATH_DELIMITER.join(new_attr_path)

                # when you create a new column from a list, Karma converts it to a list of objects
                # e.g. from birth_death_date.values, creating column death_date
                # makes Karma produce birth_death_date.death_date;
                # that's why we need the code below
                new_hnode = attr2hnodes[cmd.input_attr_paths[0]][:-1]
                new_hnode.append({"columnName": cmd.new_attr_name})

                output_columns.append(new_hnode)
                attr2hnodes[cmd] = output_columns[-1]
                attr2hnodes[new_attr_path] = output_columns[-1]

        worksheet_history = []
        # re-arrange commands to work around the node-id issue, e.g. node id = Concept2 (Karma will convert Concept2 to Concept1)
        commands = [
            cmd for cmd in self.commands
            if isinstance(cmd, PyTransformNewColumnCmd)
        ]
        commands.extend(
            sorted([c for c in self.commands if isinstance(c, SetSemanticTypeCmd)],
                   key=lambda c: c.node_id))

        commands.extend(
            sorted([c for c in self.commands if isinstance(c, SetInternalLinkCmd)],
                   key=lambda c: c.target_uri or c.source_uri or ""))

        # sometimes the model uses an incorrect node id, e.g. node id = Concept7 with no Concept1..6,
        # which causes an error in Karma; we need to re-assign the node ids
        node_id_old2new: Dict[str, str] = {}
        node_id_domain_count: Dict[str, int] = {}

        for cmd in commands:
            if isinstance(cmd, PyTransformNewColumnCmd):
                pass
            elif isinstance(cmd, SetSemanticTypeCmd):
                if cmd.node_id not in node_id_old2new:
                    node_id_domain_count[cmd.domain] = \
                        node_id_domain_count.get(cmd.domain, 0) + 1
                    node_id_old2new[cmd.node_id] = \
                        f"{cmd.domain}{node_id_domain_count[cmd.domain]}"
            elif isinstance(cmd, SetInternalLinkCmd):
                if cmd.source_id not in node_id_old2new:
                    assert cmd.source_uri is not None
                    node_id_domain_count[cmd.source_uri] = \
                        node_id_domain_count.get(cmd.source_uri, 0) + 1
                    node_id_old2new[cmd.source_id] = \
                        f"{cmd.source_uri}{node_id_domain_count[cmd.source_uri]}"
                if cmd.target_id not in node_id_old2new:
                    assert cmd.target_uri is not None
                    node_id_domain_count[cmd.target_uri] = \
                        node_id_domain_count.get(cmd.target_uri, 0) + 1
                    node_id_old2new[cmd.target_id] = \
                        f"{cmd.target_uri}{node_id_domain_count[cmd.target_uri]}"

        for cmd in commands:
            if isinstance(cmd, PyTransformNewColumnCmd):
                pytransform_code = cmd.code
                # recover pytransform_code from our code
                pytransform_code = pytransform_code.replace(
                    "__return__ = ", "return ")
                for match in reversed(
                        list(re.finditer(r"getValue\(([^)]+)\)",
                                         pytransform_code))):
                    start, end = match.span(1)
                    field = pytransform_code[start:end].replace(
                        "'", "").replace('"""', "").replace('"', '')
                    # convert the full name to the last column name, since Karma uses only the last column name
                    for input_attr_path in cmd.input_attr_paths:
                        if input_attr_path == field:
                            # TODO: will Karma always use last column name?
                            field = attr2hnodes[input_attr_path][-1]['columnName']
                            break
                    else:
                        assert False, f"Cannot find any field {field} in the input columns"
                    pytransform_code = (pytransform_code[:start] + f'"{field}"'
                                        + pytransform_code[end:])

                worksheet_history.append({
                    "tags": ["Transformation"],
                    "commandName":
                    "SubmitPythonTransformationCommand",
                    "inputParameters": [{
                        "name":
                        "hNodeId",
                        "value":
                        attr2hnodes[cmd.input_attr_paths[0]],
                        "type":
                        "hNodeId"
                    }, {
                        "name": "worksheetId",
                        "value": "W",
                        "type": "worksheetId"
                    }, {
                        "name": "selectionName",
                        "value": "DEFAULT_TEST",
                        "type": "other"
                    }, {
                        "name": "newColumnName",
                        "value": cmd.new_attr_name,
                        "type": "other"
                    }, {
                        "name": "transformationCode",
                        "value": pytransform_code,
                        "type": "other"
                    }, {
                        "name": "errorDefaultValue",
                        "value": cmd.default_error_value,
                        "type": "other"
                    }, {
                        "name":
                        "inputColumns",
                        "type":
                        "hNodeIdList",
                        "value":
                        ujson.dumps([{
                            "value": attr2hnodes[iap]
                        } for iap in cmd.input_attr_paths])
                    }, {
                        "name":
                        "outputColumns",
                        "type":
                        "hNodeIdList",
                        "value":
                        ujson.dumps([{
                            "value":
                            attr2hnodes[cmd]
                            if attr2hnodes[cmd][-1]['columnName'] != "values"
                            else attr2hnodes[cmd][:-1]
                        }])
                    }]
                })
            elif isinstance(cmd, SetSemanticTypeCmd):
                if cmd.type != "karma:classLink":
                    worksheet_history.append({
                        "commandName":
                        "SetSemanticTypeCommand",
                        "tags": ["Modeling"],
                        "inputParameters": [
                            {
                                "name": "hNodeId",
                                "value": attr2hnodes[cmd.input_attr_path],
                                "type": "hNodeId"
                            },
                            {
                                "name": "worksheetId",
                                "value": "W",
                                "type": "worksheetId"
                            },
                            {
                                "name": "selectionName",
                                "value": "DEFAULT_TEST",
                                "type": "other"
                            },
                            {
                                "name":
                                "SemanticTypesArray",
                                "type":
                                "other",
                                "value": [{
                                    "FullType":
                                    ont.full_uri(cmd.type),
                                    "isPrimary":
                                    True,
                                    "DomainLabel":
                                    ont.simplify_uri(
                                        node_id_old2new[cmd.node_id]),
                                    "DomainId":
                                    ont.full_uri(node_id_old2new[cmd.node_id]),
                                    "DomainUri":
                                    ont.full_uri(cmd.domain)
                                }]
                            },
                            {
                                "name": "trainAndShowUpdates",
                                "value": False,
                                "type": "other"
                            },
                            {
                                "name": "rdfLiteralType",
                                "value": "",
                                "type": "other"
                            },  # TODO: update correct RDF-Literal-Type
                            {
                                "name":
                                "inputColumns",
                                "type":
                                "hNodeIdList",
                                "value":
                                ujson.dumps([{
                                    "value":
                                    attr2hnodes[cmd.input_attr_path]
                                }])
                            },
                            {
                                "name":
                                "outputColumns",
                                "type":
                                "hNodeIdList",
                                "value":
                                ujson.dumps([{
                                    "value":
                                    attr2hnodes[cmd.input_attr_path]
                                }])
                            }
                        ]
                    })
                else:
                    worksheet_history.append({
                        "commandName":
                        "SetMetaPropertyCommand",
                        "tags": ["Modeling"],
                        "inputParameters": [
                            {
                                "name": "hNodeId",
                                "value": attr2hnodes[cmd.input_attr_path],
                                "type": "hNodeId"
                            },
                            {
                                "name": "worksheetId",
                                "value": "W",
                                "type": "worksheetId"
                            },
                            {
                                "name": "selectionName",
                                "value": "DEFAULT_TEST",
                                "type": "other"
                            },
                            {
                                "name": "metaPropertyName",
                                "value": "isUriOfClass",
                                "type": "other"
                            },
                            {
                                "name": "metaPropertyUri",
                                "value": ont.full_uri(cmd.domain),
                                "type": "other"
                            },
                            {
                                "name": "metaPropertyId",
                                "value":
                                ont.full_uri(node_id_old2new[cmd.node_id]),
                                "type": "other"
                            },
                            {
                                "name":
                                "SemanticTypesArray",
                                "type":
                                "other",
                                "value": [{
                                    "FullType":
                                    ont.full_uri(cmd.type),
                                    "isPrimary":
                                    True,
                                    "DomainLabel":
                                    ont.simplify_uri(
                                        node_id_old2new[cmd.node_id]),
                                    "DomainId":
                                    ont.full_uri(node_id_old2new[cmd.node_id]),
                                    "DomainUri":
                                    ont.full_uri(cmd.domain)
                                }]
                            },
                            {
                                "name": "trainAndShowUpdates",
                                "value": False,
                                "type": "other"
                            },
                            {
                                "name": "rdfLiteralType",
                                "value": "",
                                "type": "other"
                            },  # TODO: update correct RDF-Literal-Type
                            {
                                "name":
                                "inputColumns",
                                "type":
                                "hNodeIdList",
                                "value":
                                ujson.dumps([{
                                    "value":
                                    attr2hnodes[cmd.input_attr_path]
                                }])
                            },
                            {
                                "name":
                                "outputColumns",
                                "type":
                                "hNodeIdList",
                                "value":
                                ujson.dumps([{
                                    "value":
                                    attr2hnodes[cmd.input_attr_path]
                                }])
                            }
                        ]
                    })
            elif isinstance(cmd, SetInternalLinkCmd):
                # TODO: commented out because old Karma doesn't recognize this command!
                # if cmd.target_uri is not None or cmd.source_uri is not None:
                #     worksheet_history.append({
                #         "commandName": "AddLinkCommand",
                #         "tags": ["Modeling"],
                #         "inputParameters": [
                #             {"name": "worksheetId", "value": "W", "type": "worksheetId"},
                #             {
                #                 "name": "edge",
                #                 "type": "other",
                #                 "value": {
                #                     "edgeId": ont.full_uri(cmd.link_lbl),
                #                     "edgeTargetId": ont.full_uri(node_id_old2new[cmd.target_id]),
                #                     "edgeTargetUri": ont.full_uri(cmd.target_uri or cmd.target_id[:-1]),
                #                     "edgeSourceId": ont.full_uri(node_id_old2new[cmd.source_id]),
                #                     "edgeSourceUri": ont.full_uri(cmd.source_uri or cmd.source_id[:-1])
                #                 }
                #             },
                #             {"name": "inputColumns", "type": "hNodeIdList", "value": []},
                #             {"name": "outputColumns", "type": "hNodeIdList", "value": []}
                #         ]
                #     })
                # else:
                worksheet_history.append({
                    "commandName":
                    "ChangeInternalNodeLinksCommand",
                    "tags": ["Modeling"],
                    "inputParameters": [{
                        "name": "worksheetId",
                        "value": "W",
                        "type": "worksheetId"
                    }, {
                        "name":
                        "initialEdges",
                        "type":
                        "other",
                        "value": [{
                            "edgeId":
                            ont.full_uri(cmd.link_lbl),
                            "edgeTargetId":
                            ont.full_uri(node_id_old2new[cmd.target_id]),
                            "edgeSourceId":
                            ont.full_uri(node_id_old2new[cmd.source_id])
                        }]
                    }, {
                        "name":
                        "newEdges",
                        "type":
                        "other",
                        "value": [{
                            "edgeId":
                            ont.full_uri(cmd.link_lbl),
                            "edgeTargetId":
                            ont.full_uri(node_id_old2new[cmd.target_id]),
                            "edgeSourceId":
                            ont.full_uri(node_id_old2new[cmd.source_id]),
                            "edgeTargetUri":
                            ont.full_uri(
                                cmd.target_uri
                                or node_id_old2new[cmd.target_id][:-1]),
                            "edgeSourceUri":
                            ont.full_uri(
                                cmd.source_uri
                                or node_id_old2new[cmd.source_id][:-1])
                        }]
                    }, {
                        "name": "inputColumns",
                        "type": "hNodeIdList",
                        "value": []
                    }, {
                        "name": "outputColumns",
                        "type": "hNodeIdList",
                        "value": []
                    }]
                })

        g.add((kr2rml, km_dev.hasInputColumns,
               Literal(ujson.dumps(input_columns))))
        g.add((kr2rml, km_dev.hasOutputColumns,
               Literal(ujson.dumps(output_columns))))
        g.add((kr2rml, km_dev.hasModelLabel, Literal(tbl.id)))
        g.add((kr2rml, km_dev.hasBaseURI,
               Literal("http://localhost:8080/source/")))
        g.add((kr2rml, km_dev.hasWorksheetHistory,
               Literal(ujson.dumps(worksheet_history, indent=4))))

        g.serialize(str(fpath), format='n3')
Example 9
    def __init__(self, ont: Ontology) -> None:
        self.choices = ont.get_classes()
        self.extended_choices = []
Example 10
def get_predicate_completer(ont: Ontology) -> 'StringCompleter':
    return StringCompleter(ont.get_predicates())
Example 11
    def __init__(self, ont: Ontology, tbl: DataTable,
                 kr2rml_file: Path) -> None:
        g = rdflib.Graph(store=IOMemory())
        g.parse(location=str(kr2rml_file), format="n3")

        worksheet_history = list(
            g.triples(
                (None,
                 URIRef(
                     "http://isi.edu/integration/karma/dev#hasWorksheetHistory"
                 ), None)))
        assert len(worksheet_history) == 1
        worksheet_history = ujson.loads(worksheet_history[0][-1])

        input_columns = list(
            g.triples((
                None,
                URIRef("http://isi.edu/integration/karma/dev#hasInputColumns"),
                None)))
        assert len(input_columns) == 1
        input_columns = ujson.loads(input_columns[0][-1])

        # construct a mapping from kr2rml attribute paths to tbl_attr_paths
        tbl_attr_paths = tbl.schema.get_attr_paths()
        n_attr_paths = len(tbl_attr_paths)
        tbl_attr_paths = {
            apath.replace("@", ""): apath
            for apath in tbl_attr_paths
        }
        assert len(tbl_attr_paths) == n_attr_paths

        start_idx = 0
        for i in range(len(input_columns[0])):
            cpath = Schema.PATH_DELIMITER.join(
                cname['columnName'] for cname in input_columns[0][i:])
            found_attr = False
            for attr_path in tbl_attr_paths:
                if (attr_path + Schema.PATH_DELIMITER).startswith(cpath):
                    found_attr = True
                    break
            if found_attr:
                start_idx = i
                break

        literal_nodes = {}
        col2col = {}
        for col in input_columns:
            attr_path = Schema.PATH_DELIMITER.join(
                cname['columnName'] for cname in col[start_idx:])
            if attr_path not in tbl_attr_paths:
                attr_path = Schema.PATH_DELIMITER.join(
                    cname['columnName'] for cname in col[start_idx:-1])
                if col[-1]['columnName'] == 'Values':
                    assert attr_path in tbl_attr_paths
                elif col[-1]['columnName'] == 'content':
                    attr_path += Schema.PATH_DELIMITER + "#text"
                    assert attr_path in tbl_attr_paths
                else:
                    raise ValueError(
                        f"Invalid column type: {col[-1]['columnName']}")

            col2col[Schema.PATH_DELIMITER.join(
                cname['columnName']
                for cname in col)] = tbl_attr_paths[attr_path]
        assert len(set(
            col2col.values())) == len(input_columns), "No duplication"

        # extracting commands
        commands = []
        for command in worksheet_history:
            if command['commandName'] == "SubmitPythonTransformationCommand":
                cmd_start_col = command['inputParameters'][0]
                cmd_input_parent_col = Schema.PATH_DELIMITER.join(
                    [col['columnName'] for col in cmd_start_col['value'][:-1]])
                cmd_input_col = command['inputParameters'][-2]
                cmd_output_col = command['inputParameters'][-1]

                if command['inputParameters'][-3]['name'] == 'isJSONOutput':
                    cmd_code = command['inputParameters'][-5]
                    default_error_value = command['inputParameters'][-4]
                    assert command['inputParameters'][-3]['value'] == "false"
                else:
                    default_error_value = command['inputParameters'][-3]
                    cmd_code = command['inputParameters'][-4]

                assert cmd_input_col['name'] == "inputColumns" and cmd_output_col[
                    "name"] == "outputColumns" and cmd_code[
                        'name'] == 'transformationCode' and default_error_value[
                            'name'] == 'errorDefaultValue'
                cmd_input_cols = [[
                    cname['columnName'] for cname in o['value']
                ] for o in ujson.loads(cmd_input_col['value'])]
                karma_input_attr_paths = [
                    col2col[Schema.PATH_DELIMITER.join(cmd_input_col)]
                    for cmd_input_col in cmd_input_cols
                ]

                # update col2col because of new columns
                new_attr_name = ujson.loads(
                    cmd_output_col['value'])[0]['value'][-1]['columnName']
                new_attr_path = new_attr_name if cmd_input_parent_col == "" else (
                    cmd_input_parent_col + Schema.PATH_DELIMITER +
                    new_attr_name)
                cmd_output_col = Schema.PATH_DELIMITER.join(
                    cname['columnName'] for cname in ujson.loads(
                        cmd_output_col['value'])[0]['value'])
                col2col[cmd_output_col] = new_attr_path

                cmd_code = cmd_code['value'].replace("return ",
                                                     "__return__ = ")
                input_attr_paths = []
                for match in reversed(
                        list(re.finditer(r"getValue\(([^)]+)\)", cmd_code))):
                    start, end = match.span(1)
                    field = cmd_code[start:end].replace("'", "").replace(
                        '"""', "").replace('"', '')
                    # it seems that Karma uses only the last column name; we need to recover
                    # the full name, trying the provided input columns first
                    for cmd_input_col, input_attr_path in zip(
                            cmd_input_cols, karma_input_attr_paths):
                        if field == cmd_input_col[-1]:
                            field = input_attr_path
                            break
                    else:
                        # otherwise construct from the start columns
                        full_field = field if cmd_input_parent_col == "" else (
                            cmd_input_parent_col + Schema.PATH_DELIMITER +
                            field)
                        field = col2col[full_field]
                    cmd_code = cmd_code[:start] + f'"{field}"' + cmd_code[end:]

                    input_attr_paths.append(field)

                default_error_value = default_error_value['value']
                commands.append(
                    PyTransformNewColumnCmd(input_attr_paths, new_attr_name,
                                            cmd_code, default_error_value))
            elif command["commandName"] == "SetSemanticTypeCommand" or command[
                    "commandName"] == "SetMetaPropertyCommand":
                cmd_input_col = command['inputParameters'][-2]
                if command["inputParameters"][-5][
                        'name'] == 'SemanticTypesArray':
                    cmd_stype = command['inputParameters'][-5]
                else:
                    cmd_stype = command['inputParameters'][-6]

                if cmd_stype['name'] == 'SemanticTypesArray':
                    assert cmd_input_col['name'] == "inputColumns" and len(
                        cmd_stype['value']
                    ) == 1 and cmd_stype['value'][0]['isPrimary']
                    cmd_input_col = col2col[Schema.PATH_DELIMITER.join(
                        cname['columnName'] for cname in ujson.loads(
                            cmd_input_col['value'])[0]['value'])]
                    cmd_stype = cmd_stype['value'][0]

                    commands.append(
                        SetSemanticTypeCmd(
                            cmd_input_col,
                            domain=ont.simplify_uri(cmd_stype['DomainUri']),
                            type=ont.simplify_uri(cmd_stype['FullType']),
                            node_id=ont.simplify_uri(
                                cmd_stype['DomainId'].replace(" (add)", ""))))
                else:
                    cmd_stype_domain = command['inputParameters'][-7]
                    cmd_stype_id = command['inputParameters'][-6]
                    assert cmd_input_col['name'] == "inputColumns" and cmd_stype_domain['name'] == 'metaPropertyUri' \
                           and cmd_stype_id['name'] == 'metaPropertyId'
                    cmd_input_col = col2col[Schema.PATH_DELIMITER.join(
                        cname['columnName'] for cname in ujson.loads(
                            cmd_input_col['value'])[0]['value'])]

                    commands.append(
                        SetSemanticTypeCmd(
                            cmd_input_col,
                            domain=ont.simplify_uri(cmd_stype_domain['value']),
                            type="karma:classLink",
                            node_id=ont.simplify_uri(cmd_stype_id['value'])))
            elif command['commandName'] == 'UnassignSemanticTypeCommand':
                cmd_input_col = command['inputParameters'][-2]
                assert cmd_input_col['name'] == "inputColumns"
                cmd_input_col = col2col[Schema.PATH_DELIMITER.join(
                    cname['columnName'] for cname in ujson.loads(
                        cmd_input_col['value'])[0]['value'])]

                delete_cmds = []
                for i, cmd in enumerate(commands):
                    if isinstance(cmd, SetSemanticTypeCmd) \
                            and cmd.input_attr_path == cmd_input_col:
                        delete_cmds.append(i)

                for i in reversed(delete_cmds):
                    commands.pop(i)
            elif command["commandName"] == "ChangeInternalNodeLinksCommand":
                cmd_edges = command['inputParameters'][-3]
                assert cmd_edges['name'] == 'newEdges'
                # cmd_initial_edges = command['inputParameters'][-4]
                # if cmd_initial_edges['name'] == 'initialEdges' and len(cmd_initial_edges['value']) > 0:
                #     delete_cmds = []
                #     for cmd_edge in cmd_initial_edges['value']:
                #         edge_lbl = ont.simplify_uri(cmd_edge['edgeId'])
                #         source_id = ont.simplify_uri(cmd_edge['edgeSourceId'])
                #
                #         if cmd_edge['edgeTargetId'] in literal_nodes:
                #             for i, cmd in enumerate(commands):
                #                 if isinstance(cmd, SetSemanticTypeCmd) and cmd.type == edge_lbl and cmd.node_id == source_id:
                #                         delete_cmds.append(i)
                #         else:
                #             target_id = ont.simplify_uri(cmd_edge['edgeTargetId'])
                #             for i, cmd in enumerate(commands):
                #                 if isinstance(cmd, SetInternalLinkCmd) and cmd.link_lbl == edge_lbl and cmd.target_id == target_id and cmd.source_id == source_id:
                #                     delete_cmds.append(i)
                #
                #     for idx in sorted(delete_cmds, reverse=True):
                #         commands.pop(idx)

                for cmd_edge in cmd_edges['value']:
                    source_uri = cmd_edge.get('edgeSourceUri', None)
                    target_uri = cmd_edge.get('edgeTargetUri', None)

                    if (source_uri is not None
                            and source_uri != cmd_edge['edgeSourceId']):
                        source_uri = ont.simplify_uri(source_uri)
                    else:
                        source_uri = None

                    if (target_uri is not None
                            and target_uri != cmd_edge['edgeTargetId']):
                        target_uri = ont.simplify_uri(target_uri)
                    else:
                        target_uri = None

                    if cmd_edge['edgeTargetId'] in literal_nodes:
                        # convert this command to SetSemanticType
                        commands.append(
                            SetSemanticTypeCmd(
                                literal_nodes[cmd_edge['edgeTargetId']],
                                domain=ont.simplify_uri(source_uri),
                                type=ont.simplify_uri(cmd_edge['edgeId']),
                                node_id=ont.simplify_uri(
                                    cmd_edge['edgeSourceId'])))
                    else:
                        commands.append(
                            SetInternalLinkCmd(
                                ont.simplify_uri(cmd_edge['edgeSourceId']),
                                ont.simplify_uri(cmd_edge['edgeTargetId']),
                                ont.simplify_uri(cmd_edge['edgeId']),
                                source_uri, target_uri))
            elif command['commandName'] == "AddLinkCommand":
                cmd_edges = command['inputParameters'][-3]
                assert cmd_edges['name'] == 'edge'
                cmd_edge = cmd_edges['value']
                source_uri = cmd_edge.get('edgeSourceUri', None)
                target_uri = cmd_edge.get('edgeTargetUri', None)
                if source_uri is not None:
                    source_uri = ont.simplify_uri(source_uri)
                else:
                    source_uri = None

                if cmd_edge['edgeTargetId'] in literal_nodes:
                    # convert this command to SetSemanticType
                    commands.append(
                        SetSemanticTypeCmd(
                            literal_nodes[cmd_edge['edgeTargetId']],
                            domain=ont.simplify_uri(source_uri),
                            type=ont.simplify_uri(cmd_edge['edgeId']),
                            node_id=ont.simplify_uri(
                                cmd_edge['edgeSourceId'])))
                else:
                    if target_uri is not None:
                        target_uri = ont.simplify_uri(target_uri)
                    else:
                        target_uri = None

                    commands.append(
                        SetInternalLinkCmd(
                            ont.simplify_uri(cmd_edge['edgeSourceId']),
                            ont.simplify_uri(cmd_edge['edgeTargetId']),
                            ont.simplify_uri(cmd_edge['edgeId']), source_uri,
                            target_uri))
            elif command['commandName'] == 'DeleteLinkCommand':
                cmd_edge = command['inputParameters'][-3]
                assert cmd_edge['name'] == 'edge'
                cmd_edge = cmd_edge['value']
                for i, cmd in enumerate(commands):
                    if isinstance(cmd, SetInternalLinkCmd):
                        if (cmd.source_id == cmd_edge['edgeSourceId']
                                and cmd.target_id == cmd_edge['edgeTargetId']
                                and cmd.link_lbl == ont.simplify_uri(
                                    cmd_edge['edgeId'])):
                            commands.pop(i)
                            break
            elif command["commandName"] == "AddLiteralNodeCommand":
                cmd_literal_value = command["inputParameters"][0]
                assert cmd_literal_value['name'] == 'literalValue'
                cmd_literal_value = cmd_literal_value['value']

                # literal values may be re-used; let the user fix it manually
                if cmd_literal_value.startswith("http"):
                    new_attr_path = f"literal:{ont.simplify_uri(cmd_literal_value)}"
                else:
                    new_attr_path = f"literal:{cmd_literal_value}"

                if cmd_literal_value + "1" not in literal_nodes:
                    new_attr_path += ":1"
                    literal_nodes[cmd_literal_value + "1"] = new_attr_path
                elif cmd_literal_value + "2" not in literal_nodes:
                    new_attr_path += ":2"
                    literal_nodes[cmd_literal_value + "2"] = new_attr_path
                elif cmd_literal_value + "3" not in literal_nodes:
                    new_attr_path += ":3"
                    literal_nodes[cmd_literal_value + "3"] = new_attr_path
                else:
                    assert False

                col2col[new_attr_path] = new_attr_path
                commands.append(
                    AddLiteralColumnCmd(new_attr_path, cmd_literal_value))
            elif command["commandName"] == "OperateSelectionCommand":
                # no way to see it in the KARMA UI
                continue
            elif command["commandName"] == "OrganizeColumnsCommand":
                continue
            elif command["commandName"] == "SetWorksheetPropertiesCommand":
                # this command doesn't affect the model
                continue
            # elif command["commandName"] == "UnfoldCommand":
            #     cmd_input_col = command["inputParameters"][-2]
            #     cmd_output_col = command["inputParameters"][-1]
            #     assert cmd_input_col['name'] == "inputColumns" and cmd_output_col['name'] == 'outputColumns'
            #     cmd_input_cols = [
            #         [cname['columnName'] for cname in o['value']] for o in ujson.loads(cmd_input_col['value'])
            #     ]
            #     input_attr_paths = [col2col[Schema.PATH_DELIMITER.join(cmd_input_col)] for cmd_input_col in cmd_input_cols]
            #     cmd_output_cols = [
            #         [cname['columnName'] for cname in o['value']] for o in ujson.loads(cmd_output_col['value'])
            #     ]
            #
            #     output_attr_paths = []
            #     # update columns mapping
            #     for cmd_output_col in cmd_output_cols:
            #         attr_path = Schema.PATH_DELIMITER.join(cmd_output_col[start_idx:])
            #         col2col[Schema.PATH_DELIMITER.join(cmd_output_col)] = attr_path
            #         output_attr_paths.append(attr_path)
            #
            #     commands.append(UnrollCmd(input_attr_paths, output_attr_paths))
            # elif command["commandName"] == "GlueCommand":
            #     cmd_input_col = command["inputParameters"][-2]
            #     cmd_output_col = command["inputParameters"][-1]
            else:
                assert False, "Source: %s. Doesn't handle command %s" % (
                    tbl.id, command["commandName"])

        # fix conflicting modeling commands
        conflicts = defaultdict(lambda: [])
        for i, cmd in enumerate(commands):
            if isinstance(cmd, SetSemanticTypeCmd):
                conflicts[cmd.input_attr_path].append((i, cmd))
            if isinstance(cmd, SetInternalLinkCmd):
                conflicts[(cmd.source_id, cmd.target_id)].append((i, cmd))

        delete_commands = []
        for cmds in conflicts.values():
            if len(cmds) > 1:
                display_warn = False
                for idx, cmd in cmds[1:]:
                    if cmd != cmds[0][1]:
                        if not display_warn:
                            display_warn = True
                            KR2RML.logger.warning(
                                "Table: %s. Conflict between command: \n\t+ %s \n\t+ %s",
                                tbl.id, cmds[0][1], cmd)
                        else:
                            print("\t+", cmd)

                # keep only the final command
                for idx, cmd in cmds[:-1]:
                    delete_commands.append(idx)

                if isinstance(cmds[0][1], SetInternalLinkCmd):
                    # need to update source_uri & target_uri first (for duplicate commands, source_uri, target_uri = None)
                    key = (cmds[-1][1].source_id, cmds[-1][1].link_lbl,
                           cmds[-1][1].target_id)
                    for idx, cmd in cmds[:-1]:
                        if (cmd.source_id, cmd.link_lbl, cmd.target_id) == key:
                            cmds[-1][1].source_uri = cmd.source_uri
                            cmds[-1][1].target_uri = cmd.target_uri
                            break

        delete_commands.sort(reverse=True)
        for idx in delete_commands:
            commands.pop(idx)

        super().__init__(commands)
Example 12
            semantic_models = []
            tables = []
            for i, raw_tbl in enumerate(raw_tables):
                r2rml_file = mapping_dir / f"{raw_tbl.id}-model.yml"
                tbl, sm = R2RML.load_from_file(r2rml_file).apply_build(raw_tbl)
                semantic_models.append(sm)
                tables.append(tbl)

            serializeJSON(semantic_models, cache_file)
            _data_io_vars["data_tables"][dataset] = tables

        _data_io_vars["semantic_models"][dataset] = semantic_models

    return _data_io_vars["semantic_models"][dataset]


if __name__ == '__main__':
    dataset = 'museum_crm'
    ont = Ontology.from_dataset(dataset)

    data_dir = Path(config.datasets[dataset].as_path())
    (data_dir / "models-viz").mkdir(exist_ok=True, parents=True)
    (data_dir / "tables-viz").mkdir(exist_ok=True, parents=True)

    for sm in get_semantic_models(dataset):
        sm.graph.render2pdf(data_dir / f"models-viz/{sm.id}.pdf")

    for tbl in get_data_tables(dataset):
        with open(data_dir / "tables-viz" / f"{tbl.id}.txt", "wb") as f:
            f.write(tbl.to_string().encode("utf-8"))
Example 13
    def to_normalized_json_model(self, ont: Ontology = None) -> dict:
        """Dump the normalized/changed model back to karma model JSON format

        Few changes:
            + All id are converted from int to str so that it's compatible with source_id of link (str type due to split("---")
            + LiteralNodes is converted to ColumnNode (we are going to treat LiteralNode as a column contains only one value)
                and an new column name will be generated for LiteralNodes

        An optional ontology to restore URI from simplified version (e.g: crm:E39_Actor)
        to full version (http://www.cidoc-crm.org/cidoc-crm/E39_Actor)
        """
        nodes = []
        links = []

        if ont is None:
            ont = UselessOntology()

        # add literal nodes to source_columns
        source_columns = [{
            "id": str(col.id),
            "hNodeId": str(col.h_node_id),
            "columnName": col.column_name
        } for col in self.source_columns]
        count = len(self.source_columns)
        for node in self.karma_graph.iter_data_nodes():
            if node.is_literal_node:
                source_columns.append({
                    "id":
                    str(node.id),
                    "hNodeId":
                    str(node.id),
                    "columnName":
                    "A%d__literal_val_%s" %
                    (count, node.label.decode('utf-8').lower().replace(
                        " ", "-"))
                })
                count += 1
        colid2name: Dict[int, str] = {
            int(col['id']): col["columnName"]
            for col in source_columns
        }

        for node in self.karma_graph.iter_nodes():
            onode = {
                "id": str(node.id),
                "modelIds": None,
                "type":
                "InternalNode" if node.is_class_node() else "ColumnNode",
                "label": {
                    "uri": node.label.decode("utf-8")
                }
            }
            if node.is_data_node():
                onode["hNodeId"] = str(node.id)
                onode["columnName"] = colid2name[node.id]
                if node.literal_type is None:
                    onode["rdfLiteralType"] = None
                else:
                    onode["rdfLiteralType"] = {"uri": node.literal_type}

                if node.is_literal_node:
                    parent_link = node.get_first_incoming_link()
                    onode["userSemanticTypes"] = [{
                        "hNodeId": str(node.id),
                        "domain": {
                            "uri":
                            ont.full_uri(
                                parent_link.get_source_node().label.decode(
                                    "utf-8")),
                            "rdfsLabel":
                            None
                        },
                        "type": {
                            "uri":
                            ont.full_uri(parent_link.label.decode("utf-8")),
                            "rdfsLabel": None
                        },
                        "origin": "User",
                        "confidenceScore": 1.0
                    }]
                    onode["learnedSemanticTypes"] = []
                else:
                    onode["userSemanticTypes"] = [{
                        "hNodeId":
                        str(node.id),
                        "domain": {
                            "uri": ont.full_uri(st.domain),
                            "rdfsLabel": None
                        },
                        "type": {
                            "uri": ont.full_uri(st.type),
                            "rdfsLabel": None
                        },
                        "origin":
                        st.origin,
                        "confidenceScore":
                        st.confidence_score
                    } for st in node.user_semantic_types]
                    onode["learnedSemanticTypes"] = [{
                        "hNodeId":
                        str(node.id),
                        "domain": {
                            "uri": ont.full_uri(st.domain),
                            "rdfsLabel": None
                        },
                        "type": {
                            "uri": ont.full_uri(st.type),
                            "rdfsLabel": None
                        },
                        "origin":
                        "AutoModel",
                        "confidenceScore":
                        st.confidence_score
                    } for st in node.learned_semantic_types]
            else:
                onode["label"]["uri"] = ont.full_uri(onode["label"]["uri"])

            nodes.append(onode)

        for link in self.karma_graph.iter_links():
            if link.type == GraphLinkType.OBJECT_PROPERTY:
                link_type = 'ObjectPropertyLink'
            elif link.type == GraphLinkType.DATA_PROPERTY:
                link_type = 'DataPropertyLink'
            elif link.label == b'karma:dev':
                link_type = 'ClassInstanceLink'
            elif link.get_target_node().is_data_node():
                link_type = "DataPropertyLink"
            elif link.get_target_node().is_class_node():
                link_type = "ObjectPropertyLink"

            olink = {
                "id":
                "%s---%s---%s" %
                (link.source_id, link.label.decode("utf-8"), link.target_id),
                "weight":
                None,
                "type":
                link_type,
                "label": {
                    "uri": ont.full_uri(link.label.decode("utf-8"))
                },
                "objectPropertyType":
                "Indirect",
                "status":
                "Normal",
                "keyInfo":
                "None",
                "modelIds":
                None
            }
            links.append(olink)

        model_json = {
            "id":
            self.id,
            "name":
            self.id,
            "description":
            self.description,
            "sourceColumns":
            source_columns,
            "mappingToSourceColumns": [{
                "id": col["id"],
                "sourceColumnId": col["id"]
            } for col in source_columns],
            "graph": {
                "nodes": nodes,
                "links": links
            }
        }
        return model_json