def to_graph(self) -> Graph:
    """Copy this object's nodes and links into a freshly built ``Graph``.

    The new graph is created with node-type, node-label and link-label
    indexing switched on, is given the current node/link counts as size
    estimates, and carries ``self.name``.

    Returns:
        The new graph. Link endpoints are passed through unchanged from
        ``source_id`` / ``target_id`` of each link.
    """
    graph = Graph(
        index_node_type=True,
        index_node_label=True,
        index_link_label=True,
        estimated_n_nodes=self.get_n_nodes(),
        estimated_n_links=self.get_n_links(),
        name=self.name,
    )

    # Nodes go in first so that the links added below can refer to them.
    for node in self.iter_nodes():
        graph.add_new_node(node.type, node.label)

    for link in self.iter_links():
        graph.add_new_link(link.type, link.label, link.source_id,
                           link.target_id)

    return graph
def from_json(obj: dict, ont: Ontology) -> 'SSD':
    """Build an ``SSD`` from its dictionary (parsed JSON) representation.

    Keys read from ``obj``: ``'mappings'`` (items with ``'node'`` and
    ``'attribute'``), ``'attributes'`` (items with ``'id'``, ``'name'`` and
    ``'columnIds'``), ``'semanticModel'`` (with ``'nodes'`` and ``'links'``)
    and ``'name'``.

    Args:
        obj: The dictionary to read.
        ont: Ontology used to shorten class and link URIs through
            ``simplify_uri``; it is also handed to the resulting ``SSD``.

    Returns:
        A new ``SSD`` holding one ``SSDAttribute`` per data node and the
        reconstructed graph.

    Raises:
        AssertionError: If an attribute does not list exactly one column id
            equal to its own id.
    """
    graph = Graph(True, True, True)

    # node id (as written in the JSON) -> id of the attribute mapped onto it
    attr_id_of_node = {}
    for mapping in obj['mappings']:
        attr_id_of_node[mapping['node']] = mapping['attribute']

    # node id (as written in the JSON) -> id handed out by the new graph
    graph_id_of = {}

    attr_by_id = {}
    for attr_json in obj['attributes']:
        column_ids = attr_json['columnIds']
        assert len(column_ids) == 1 and column_ids[0] == attr_json['id']
        attr_by_id[attr_json['id']] = attr_json

    ssd_attrs = []
    model = obj['semanticModel']
    for node_json in model['nodes']:
        json_id = node_json['id']
        if node_json['type'] == 'DataNode':
            # Data nodes are labelled with the name of their mapped attribute.
            kind = GraphNodeType.DATA_NODE
            label = attr_by_id[attr_id_of_node[json_id]]['name']
            ssd_attrs.append(SSDAttribute(json_id, label))
        else:
            # Every other node is a class node labelled by its simplified URI.
            kind = GraphNodeType.CLASS_NODE
            label = ont.simplify_uri(node_json['prefix'] + node_json['label'])
        graph_id_of[json_id] = graph.add_new_node(kind, label.encode()).id

    for link_json in model['links']:
        link_label = ont.simplify_uri(link_json['prefix'] + link_json['label'])
        graph.add_new_link(
            GraphLinkType.UNSPECIFIED,
            link_label.encode(),
            graph_id_of[link_json['source']],
            graph_id_of[link_json['target']],
        )

    return SSD(obj['name'], ssd_attrs, graph, ont)
def make_ssd(sm: SemanticModel, keys: Set[str], ont: Ontology) -> SSD:
    """Convert a ``SemanticModel`` into an ``SSD``.

    Args:
        sm: The semantic model to convert.
        keys: Set that must contain the label of every attribute of ``sm``.
        ont: Ontology forwarded to the resulting ``SSD``.

    Returns:
        An ``SSD`` with id ``sm.id``, one ``SSDAttribute`` per attribute of
        ``sm`` and a copy of ``sm.graph`` in which each data node is labelled
        with the encoded name of its attribute.

    Raises:
        AssertionError: If an attribute label is missing from ``keys``.
    """
    ssd_attr_by_id = {}
    for sm_attr in sm.attrs:
        # The label is used verbatim; an earlier variant replaced
        # Schema.PATH_DELIMITER with "." here.
        attr_name = sm_attr.label
        ssd_attr_by_id[sm_attr.id] = SSDAttribute(sm_attr.id, attr_name)
        assert attr_name in keys

    graph = Graph()
    for node in sm.graph.iter_nodes():
        node_label = (ssd_attr_by_id[node.id].name.encode()
                      if node.is_data_node() else node.label)
        graph.add_new_node(node.type, node_label)

    # NOTE(review): endpoints are reused as-is, which presumes the new graph
    # hands out the same node ids as sm.graph - confirm.
    for link in sm.graph.iter_links():
        graph.add_new_link(link.type, link.label, link.source_id,
                           link.target_id)

    return SSD(sm.id, list(ssd_attr_by_id.values()), graph, ont)
def mask_dnode(self, g: Graph) -> Graph:
    """Deprecated. Copy ``g`` with every non-class node relabelled ``b"DataNode"``.

    Class nodes keep their label; links are copied unchanged.

    Args:
        g: The graph to copy.

    Returns:
        A new graph whose node and link ids match those of ``g``.

    Raises:
        AssertionError: If the copy assigns a node or link an id different
            from the one it has in ``g``.
    """
    masked = Graph(True, True, True, g.get_n_nodes(), g.get_n_links())

    # The insertions are deliberately kept outside the ``assert`` statements:
    # asserts are removed when Python runs with -O, which would otherwise
    # skip the insertions themselves and return an empty graph.
    for node in g.iter_nodes():
        if node.type == GraphNodeType.CLASS_NODE:
            label = node.label
        else:
            label = b"DataNode"
        copied_node = masked.add_new_node(node.type, label)
        assert copied_node.id == node.id

    for link in g.iter_links():
        copied_link = masked.add_new_link(link.type, link.label,
                                          link.source_id, link.target_id)
        assert copied_link.id == link.id

    return masked
def clear_serene_footprint(self, remove_unknown: bool = True) -> 'SSD':
    """Strip the Serene helper nodes and links from ``self.graph`` in place.

    The node labelled ``b"serene:All"`` is always dropped. When
    ``remove_unknown`` is true, the node labelled ``b"serene:Unknown"`` and
    every node it links to are dropped as well. Links labelled
    ``b"serene:connect"`` are never copied; links labelled
    ``b"serene:unknown"`` are skipped only when ``remove_unknown`` is true.

    Args:
        remove_unknown: Whether to also remove the ``serene:Unknown`` node,
            its targets and the ``serene:unknown`` links.

    Returns:
        ``self``. ``self.graph`` is replaced by the cleaned graph unless no
        node had to be dropped, in which case it is left untouched.

    Raises:
        AssertionError: If a target of the ``serene:Unknown`` node is not a
            data node.
        KeyError: If a link that is kept refers to a dropped node.
    """
    cleaned = Graph(True, True, True)

    # Find the two marker nodes; if a label occurs several times, the last
    # occurrence is the one that is remembered.
    all_node = None
    unknown_node = None
    for node in self.graph.iter_nodes():
        if node.label == b"serene:All":
            all_node = node
        elif node.label == b"serene:Unknown":
            unknown_node = node

    dropped_ids = set()
    if all_node is not None:
        dropped_ids.add(all_node.id)
    if remove_unknown and unknown_node is not None:
        dropped_ids.add(unknown_node.id)
        for link in self.graph.iter_links():
            if link.source_id != unknown_node.id:
                continue
            assert link.get_target_node().is_data_node()
            dropped_ids.add(link.target_id)

    if not dropped_ids:
        # no serene footprint to remove
        return self

    # old node id -> id of its copy in the cleaned graph
    new_id_of = {}
    for node in self.graph.iter_nodes():
        if node.id not in dropped_ids:
            new_id_of[node.id] = cleaned.add_new_node(node.type,
                                                      node.label).id

    for link in self.graph.iter_links():
        is_connect = link.label == b"serene:connect"
        is_unknown = remove_unknown and link.label == b"serene:unknown"
        if is_connect or is_unknown:
            continue
        cleaned.add_new_link(link.type, link.label,
                             new_id_of[link.source_id],
                             new_id_of[link.target_id])

    self.graph = cleaned
    return self
def apply_cmds(self, tbl: DataTable) -> SemanticModel:
    """Run every command in ``self.commands`` on ``tbl`` and build its model.

    Data-shaping commands mutate ``tbl`` (its rows and its schema) in place;
    modelling commands add nodes and links to a graph that is returned inside
    the resulting ``SemanticModel``.

    Args:
        tbl: The table the commands are applied to. It is modified in place.

    Returns:
        ``SemanticModel(tbl.id, attrs, graph)`` where ``attrs`` holds one
        ``Attribute`` per ``SetSemanticTypeCmd`` that was processed.

    Raises:
        AssertionError: If an attribute path receives a semantic type twice,
            if the target of an internal link already has an incoming link,
            or if an attribute to unpack is not of type
            ``Schema.LIST_VALUE``.
        NotImplementedError: If a command of an unsupported class is met.
    """
    graph = Graph(
        index_node_type=True,
        index_node_label=True,
        index_link_label=True,
        name=tbl.id.encode("utf-8"),
    )
    attrs: List[Attribute] = []
    # command-level key (attribute path or node id) -> node id in `graph`
    node_ids: Dict[str, int] = {}

    for cmd in self.commands:
        if isinstance(cmd, PyTransformNewColumnCmd):
            # TODO: the new attribute path is derived from the first input
            # attribute path only. The output should be explicit, since the
            # first input path can differ; maybe it should be the deepest
            # attribute path.
            first_input_path = cmd.input_attr_paths[0]
            parent_parts = first_input_path.split(Schema.PATH_DELIMITER)[:-1]
            new_attr_path = Schema.PATH_DELIMITER.join(
                parent_parts + [cmd.new_attr_name])

            # A check that the new path is not already in the schema used to
            # live here and is currently disabled.
            # TODO: a list of input attribute paths is not handled properly
            # (only cmd.input_attr_paths[0] is consulted for the type).
            tbl.schema.add_new_attr_path(
                new_attr_path,
                tbl.schema.get_attr_type(first_input_path),
                cmd.input_attr_paths[-1],
            )
            self.pytransform(tbl, cmd)

        elif isinstance(cmd, SetSemanticTypeCmd):
            attr_path = cmd.input_attr_path
            data_label = attr_path.encode("utf-8")
            assert attr_path not in node_ids

            # The data node is created before its class node, which fixes the
            # order in which the graph hands out ids.
            data_node_id = graph.add_new_node(GraphNodeType.DATA_NODE,
                                              data_label).id
            node_ids[attr_path] = data_node_id
            if cmd.node_id not in node_ids:
                node_ids[cmd.node_id] = graph.add_new_node(
                    GraphNodeType.CLASS_NODE,
                    cmd.domain.encode("utf-8")).id

            attrs.append(Attribute(data_node_id, attr_path, []))
            graph.add_new_link(
                GraphLinkType.UNSPECIFIED,
                cmd.type.encode("utf-8"),
                node_ids[cmd.node_id],
                data_node_id,
            )

        elif isinstance(cmd, SetInternalLinkCmd):
            if cmd.source_id not in node_ids:
                node_ids[cmd.source_id] = graph.add_new_node(
                    GraphNodeType.CLASS_NODE,
                    cmd.source_uri.encode('utf-8')).id
            if cmd.target_id not in node_ids:
                node_ids[cmd.target_id] = graph.add_new_node(
                    GraphNodeType.CLASS_NODE,
                    cmd.target_uri.encode('utf-8')).id

            # The target must not have an incoming link yet.
            target_node = graph.get_node_by_id(node_ids[cmd.target_id])
            assert target_node.n_incoming_links == 0
            graph.add_new_link(
                GraphLinkType.UNSPECIFIED,
                cmd.link_lbl.encode("utf-8"),
                node_ids[cmd.source_id],
                node_ids[cmd.target_id],
            )

        elif isinstance(cmd, ZipAttributesCmd):
            for row in tbl.rows:
                cmd.zip_attributes(row)
            # TODO: rebuilding the whole schema is very expensive.
            tbl.rebuild_schema()

        elif isinstance(cmd, UnpackOneElementListCmd):
            current_type = tbl.schema.get_attr_type(cmd.input_attr)
            assert current_type == Schema.LIST_VALUE
            for row in tbl.rows:
                cmd.unpack(row)
            tbl.schema.update_attr_path(cmd.input_attr, Schema.SINGLE_VALUE)

        elif isinstance(cmd, AddLiteralColumnCmd):
            # The schema learns about the new column before rows are filled.
            tbl.schema.add_new_attr_path(cmd.input_attr_path,
                                         tbl.schema.SINGLE_VALUE)
            for row in tbl.rows:
                cmd.add_literal(row)

        elif isinstance(cmd, JoinListCmd):
            for row in tbl.rows:
                cmd.execute(row)
            tbl.schema.update_attr_path(cmd.input_attr_path,
                                        Schema.SINGLE_VALUE)

        else:
            raise NotImplementedError(cmd.__class__.__name__)

    return SemanticModel(tbl.id, attrs, graph)