def schema_neighbors(self, field: (str, str, str)) -> DRS:
    """
    Collect every attribute/field stored in the same relation as the given field.

    :param field: a (db_name, source_name, field_name) triple identifying the field
    :return: a DRS wrapping the hits of the field's relation; the provided
        field is recorded as the parameter of the OP.TABLE operation
    """
    db_name, source_name, field_name = field
    table_hits = self.__network.get_hits_from_table(source_name)
    # The origin hit stands for the input field itself and carries a score of 0.
    origin = Hit(
        id_from(db_name, source_name, field_name),
        db_name,
        source_name,
        field_name,
        0,
    )
    return DRS(list(table_hits), Operation(OP.TABLE, params=[origin]))
def test_union(self):
    """Union two DRS objects that share one hit, dump the provenance graph and expect 7 elements."""
    print(self._testMethodName)

    # First DRS: five hits spread over table_c, table_a and table_b.
    first_hits = [
        Hit(10, "dba", "table_c", "v", -1),
        Hit(0, "dba", "table_a", "a", -1),
        Hit(1, "dba", "table_a", "b", -1),
        Hit(2, "dba", "table_b", "c", -1),
        Hit(3, "dba", "table_b", "d", -1),
    ]
    drs1 = DRS(first_hits, Operation(OP.ORIGIN))

    # Second DRS: repeats the table_a.b hit and adds two hits from table_d.
    second_hits = [
        Hit(1, "dba", "table_a", "b", -1),
        Hit(16, "dba", "table_d", "a", -1),
        Hit(17, "dba", "table_d", "b", -1),
    ]
    drs2 = DRS(second_hits, Operation(OP.ORIGIN))

    merged = drs1.union(drs2)
    graph = merged.get_provenance().prov_graph()

    graph_nodes = graph.nodes()
    print("NODES")
    for node in graph_nodes:
        print(str(node))
    print(" ")

    graph_edges = graph.edges(keys=True)
    print("EDGES")
    for edge in graph_edges:
        print(str(edge))
    print(" ")

    count = len([element for element in merged])
    print("Len must be 7: " + str(count))
    self.assertTrue(count == 7)
def entity_search(self, kw: str, max_results=10) -> DRS:
    """
    Run a keyword search against the entities represented by the data.

    :param kw: the keyword to search for
    :param max_results: upper bound on the number of results requested
    :return: a DRS holding the matching hits, tagged with an
        OP.ENTITY_LOOKUP operation parameterized by the keyword
    """
    found = store_client.search_keywords(kw, KWType.KW_ENTITIES, max_results)
    # Materialize the search results before handing them to the DRS.
    return DRS(list(found), Operation(OP.ENTITY_LOOKUP, params=[kw]))
def traverse(self, a: DRS, primitives, max_hops) -> DRS:
    """
    Expand a DRS by repeatedly following neighbor relations.

    Each round looks up the neighbors of every element of the current fringe
    and unions them into the output; the accumulated output then becomes the
    fringe of the next round.

    :param a: the starting DRS; TABLE mode is not supported
    :param primitives: relation handed to the network's neighbors_id lookup
    :param max_hops: number of expansion rounds to perform
    :return: a DRS with the accumulated neighbors; an empty DRS when the
        input is in TABLE mode
    """
    o_drs = DRS([], Operation(OP.NONE))
    if a.mode == DRSMode.TABLE:
        print("ERROR: input mode TABLE not supported")
        # Hand back the empty DRS rather than a bare list so the result
        # always matches the declared return type.
        return o_drs
    fringe = [x for x in a]
    o_drs.absorb_provenance(a)
    while max_hops > 0:
        max_hops = max_hops - 1
        for h in fringe:
            hits_drs = self.__network.neighbors_id(h, primitives)
            o_drs = self.union(o_drs, hits_drs)
        fringe = [x for x in o_drs]  # grow the initial input
    return o_drs
def test_drs_table_iteration(self):
    """Smoke test: iterating a DRS after switching it to table mode must not raise."""
    print(self._testMethodName)
    sample_hits = [
        Hit(0, "dba", "table_a", "a", -1),
        Hit(1, "dba", "table_a", "b", -1),
        Hit(2, "dba", "table_b", "c", -1),
        Hit(3, "dba", "table_b", "d", -1),
    ]
    table_drs = DRS(sample_hits, Operation(OP.ORIGIN))
    table_drs.set_table_mode()
    for element in table_drs:
        print(str(element))
    # Reaching this point without an exception is the actual check.
    self.assertTrue(True)
def md_neighbors_id(self, hit: Hit, md_neighbors: MRS, relation: Relation) -> DRS:
    """
    Build a DRS with the nodes linked to ``hit`` through metadata relations.

    :param hit: the node of interest, either a Hit or its node id as a string
    :param md_neighbors: iterable of metadata hits exposing ``source`` and
        ``target`` node ids
    :param relation: relation used to select the provenance operation
    :return: a DRS of neighbor Hits (all scored 1.0) whose operation is
        parameterized by the input ``hit``
    """
    if isinstance(hit, Hit):
        nid = str(hit.nid)
    if isinstance(hit, str):
        nid = hit
    nid = str(nid)
    score = 1.0  # TODO: return more meaningful score results
    data = []
    # The loop variable must not be called ``hit``: that would overwrite the
    # input node, which is still needed for the provenance parameters below.
    for md_hit in md_neighbors:
        # The neighbor is whichever end of the relation is not the input node.
        k = md_hit.target if md_hit.target != nid else md_hit.source
        db_name, source_name, field_name, _ = self.__id_names[k]
        data.append(Hit(k, db_name, source_name, field_name, score))
    op = self.get_op_from_relation(relation)
    return DRS(data, Operation(op, params=[hit]))
def get_table_neighbors(hit, relation, paths):
    """
    List the fields of every not-yet-visited table that neighbors ``hit``.

    ``self``, ``api`` and ``check_membership`` are not parameters; they are
    resolved from the surrounding scope.

    :param hit: the field whose neighbors are explored
    :param relation: relation used for the neighbor lookup
    :param paths: partial paths, used to skip tables that were already reached
    :return: list of (field, neighbor) tuples, where ``neighbor`` is the
        direct neighbor through which the field's table was reached
    """
    # Keep only neighbors that live in a different table than the input hit
    # and re-wrap them so that their provenance points back at the hit.
    foreign = [cand for cand in self.neighbors_id(hit, relation)
               if cand.source_name != hit.source_name]
    neighbor_op = self.get_op_from_relation(relation)
    rewritten = DRS(foreign, Operation(neighbor_op, params=[hit]))

    # FIXME: filter out already seen nodes here
    pairs = []
    for neighbor in rewritten:
        if check_membership(neighbor, paths):
            continue
        table_fields = api.drs_from_table_hit(neighbor)
        pairs.extend((field, neighbor) for field in table_fields)
    return pairs
def paths(self, drs_a: DRS, drs_b: DRS = None, relation=Relation.PKFK, max_hops=2) -> DRS:
    """
    Is there a transitive relationship between any element in a with any
    element in b? This function finds the answer constrained on the relation
    (singular for now) that is passed as a parameter. If b is not passed,
    the search looks for paths between the elements of a.

    :param drs_a: the source elements (anything _general_to_drs accepts)
    :param drs_b: the target elements; defaults to ``drs_a`` when omitted
    :param relation: the relation that paths are allowed to follow
    :param max_hops: hop limit forwarded to the network path search
    :return: a DRS that absorbed the result of every pairwise path search
    """
    drs_a = self._general_to_drs(drs_a)
    # Fall back to a when b is omitted, so paths are searched within a itself.
    drs_b = drs_a if drs_b is None else self._general_to_drs(drs_b)
    self._assert_same_mode(drs_a, drs_b)

    # absorb the provenance of both a and b
    o_drs = DRS([], Operation(OP.NONE))
    o_drs.absorb_provenance(drs_a)
    if drs_b != drs_a:
        o_drs.absorb_provenance(drs_b)

    for h1, h2 in itertools.product(drs_a, drs_b):
        # there are different network operations for table and field mode
        if drs_a.mode == DRSMode.FIELDS:
            res_drs = self._network.find_path_hit(
                h1, h2, relation, max_hops=max_hops)
        else:
            res_drs = self._network.find_path_table(
                h1, h2, relation, self, max_hops=max_hops)
        o_drs = o_drs.absorb(res_drs)

    return o_drs
def neighbors_id(self, hit: Hit, relation: Relation) -> DRS:
    """
    Look up the graph neighbors of a node that are linked through ``relation``.

    :param hit: a Hit, or a node id given as a string
    :param relation: only edges annotated with this relation are kept
    :return: a DRS of neighbor Hits scored with the edge's 'score' value; its
        operation is derived from the relation and parameterized by ``hit``
    """
    if isinstance(hit, Hit):
        nid = str(hit.nid)
    if isinstance(hit, str):
        nid = hit
    nid = str(nid)

    found = []
    for neighbor_id, edge_info in self.__G[nid].items():
        if relation not in edge_info:
            continue  # edge exists, but not for the requested relation
        edge_score = edge_info[relation]['score']
        (db_name, source_name, field_name, data_type) = self.__id_names[neighbor_id]
        found.append(Hit(neighbor_id, db_name, source_name, field_name, edge_score))

    relation_op = self.get_op_from_relation(relation)
    return DRS(found, Operation(relation_op, params=[hit]))
def similar_content_to(self, i_drs: DRS) -> DRS:
    """
    Return a DRS with the fields related to the input's fields through
    Relation.CONTENT_SIM.

    :param i_drs: the input DRS; when in TABLE mode it is switched to fields
        mode and extended with the fields of each hit's table
    :return: a DRS that carries the input's provenance and absorbed the
        CONTENT_SIM neighbors of every input element
    """
    result = DRS([], Operation(OP.NONE))
    result = result.absorb_provenance(i_drs)

    if i_drs.mode == DRSMode.TABLE:
        i_drs.set_fields_mode()
        for table_hit in i_drs:
            table_fields = self.drs_from_table_hit(table_hit)
            i_drs = i_drs.absorb(table_fields)

    for field_hit in i_drs:
        similar = self.__network.neighbors_id(field_hit, Relation.CONTENT_SIM)
        result = result.absorb(similar)
    return result
def search(self, kw: str, kw_type: KWType, max_results=10) -> DRS:
    """
    Run a keyword search through the store client.

    :param kw: the keyword to search for
    :param kw_type: the context on which to search; forwarded to the store
        client as ``elasticfieldname``
    :param max_results: maximum number of results to request
    :return: a DRS with the matching hits, tagged with an OP.KW_LOOKUP
        operation parameterized by the keyword
    """
    matches = self._store_client.search_keywords(
        keywords=kw,
        elasticfieldname=kw_type,
        max_hits=max_results,
    )
    # Materialize the search results before handing them to the DRS.
    return DRS(list(matches), Operation(OP.KW_LOOKUP, params=[kw]))
def pkfk_of(self, i_drs: DRS) -> DRS:
    """
    Return a DRS with the fields related to the input's fields through
    Relation.PKFK.

    :param i_drs: the input DRS; when in TABLE mode it is switched to fields
        mode and extended with the fields of each hit's table
    :return: a DRS that carries the input's provenance and absorbed the
        PKFK neighbors of every input element
    """
    # The output starts empty and only inherits the provenance of the input.
    result = DRS([], Operation(OP.NONE))
    result = result.absorb_provenance(i_drs)

    if i_drs.mode == DRSMode.TABLE:
        i_drs.set_fields_mode()
        for table_hit in i_drs:
            table_fields = self.drs_from_table_hit(table_hit)
            i_drs = i_drs.absorb(table_fields)

    for field_hit in i_drs:
        linked = self.__network.neighbors_id(field_hit, Relation.PKFK)
        result = result.absorb(linked)
    return result
def __traverse(self, a: DRS, primitive, max_hops=2) -> DRS:
    """
    Expand an initial DRS round by round along a single primitive.

    Every round collects the neighbors of all fringe elements into the
    result; the result then serves as the fringe of the following round.

    :param a: a nid, node, tuple, or DRS (converted with _general_to_drs)
    :param primitive: the relation to follow between nodes
    :param max_hops: maximum number of rounds on the graph
    :return: a DRS with the accumulated neighbors
    :raises ValueError: if the converted input is in TABLE mode
    """
    a = self._general_to_drs(a)
    reached = DRS([], Operation(OP.NONE))
    if a.mode == DRSMode.TABLE:
        raise ValueError('input mode DRSMode.TABLE not supported')

    frontier = a
    reached.absorb_provenance(a)
    hops_left = max_hops
    while hops_left > 0:
        hops_left -= 1
        for node in frontier:
            neighbors = self._network.neighbors_id(node, primitive)
            reached = self.union(reached, neighbors)
        # The next round expands from everything gathered so far.
        frontier = reached
    return reached
def paths_between(self, a: DRS, b: DRS, primitives, max_hops=2) -> DRS:
    """
    Is there a transitive relationship between any element in a with any
    element in b? This function finds the answer constrained on the primitive
    (singular for now) that is passed as a parameter.

    Pairs made of the same element on both sides are skipped; every other
    pair is searched.

    :param a: DRS with the source elements
    :param b: DRS with the target elements; must be in the same mode as ``a``
    :param primitives: the relation that paths are allowed to follow
    :param max_hops: hop limit forwarded to the network path search
    :return: a DRS that carries the provenance of a and b and absorbed the
        result of every pairwise path search
    """
    assert (a.mode == b.mode)
    o_drs = DRS([], Operation(OP.NONE))
    o_drs.absorb_provenance(a)
    o_drs.absorb_provenance(b)
    if a.mode == DRSMode.FIELDS:
        for h1 in a:  # h1 is a Hit
            for h2 in b:  # h2 is a Hit
                if h1 == h2:
                    # Same source and target field: skip only this pair so
                    # the remaining combinations are still searched.
                    continue
                res_drs = self.__network.find_path_hit(
                    h1, h2, primitives, max_hops=max_hops)
                o_drs = o_drs.absorb(res_drs)
    elif a.mode == DRSMode.TABLE:
        for h1 in a:  # h1 is a table: str
            for h2 in b:  # h2 is a table: str
                if h1 == h2:
                    # Same source and target table: skip only this pair.
                    continue
                res_drs = self.__network.find_path_table(
                    h1, h2, primitives, self, max_hops=max_hops)
                o_drs = o_drs.absorb(res_drs)
    return o_drs
def test_creation_initial_provenance(self):
    """Smoke test: build a DRS from a CONTENT_SIM operation and dump its provenance graph."""
    print(self._testMethodName)

    # Hit used as the parameter of the operation that produced the DRS.
    origin = Hit(10, "dba", "table_c", "v", -1)
    members = [
        Hit(0, "dba", "table_a", "a", -1),
        Hit(1, "dba", "table_a", "b", -1),
        Hit(2, "dba", "table_b", "c", -1),
        Hit(3, "dba", "table_b", "d", -1),
    ]
    drs = DRS(members, Operation(OP.CONTENT_SIM, params=[origin]))

    graph = drs.get_provenance().prov_graph()

    graph_nodes = graph.nodes()
    print("NODES")
    for node in graph_nodes:
        print(str(node))
    print(" ")

    graph_edges = graph.edges(keys=True)
    print("EDGES")
    for edge in graph_edges:
        print(str(edge))
    print(" ")

    # Reaching this point without an exception is the actual check.
    self.assertTrue(True)
def paths(self, a: DRS, primitives) -> DRS:
    """
    Is there any transitive relationship between any two elements in a?
    This function finds the answer constrained on the primitive (singular
    for now) passed as parameter.

    An element is never paired with itself, in either mode.

    :param a: DRS whose elements are searched pairwise
    :param primitives: the relation that paths are allowed to follow
    :return: a DRS that carries the provenance of a and absorbed the result
        of every pairwise path search
    """
    o_drs = DRS([], Operation(OP.NONE))
    o_drs = o_drs.absorb_provenance(a)
    if a.mode == DRSMode.FIELDS:
        for h1 in a:  # h1 is a Hit
            for h2 in a:  # h2 is a Hit
                if h1 == h2:
                    continue
                res_drs = self.__network.find_path_hit(h1, h2, primitives)
                o_drs = o_drs.absorb(res_drs)
    elif a.mode == DRSMode.TABLE:
        for h1 in a:  # h1 is a table: str
            for h2 in a:  # h2 is a table: str
                if h1 == h2:
                    # Mirror the FIELDS branch: a table is not searched
                    # against itself.
                    continue
                res_drs = self.__network.find_path_table(
                    h1, h2, primitives, self)
                o_drs = o_drs.absorb(res_drs)
    return o_drs
def find_path_table(self, source: str, target: str, relation, api, max_hops=3):
    """
    Search for paths that connect the fields of table ``source`` with the
    fields of table ``target`` and record them as provenance of a DRS.

    :param source: source table, resolved to its fields with api.drs_from_table
    :param target: target table, resolved to its fields with api.drs_from_table
    :param relation: relation handed to self.neighbors_id when expanding a field
    :param api: object providing drs_from_table and drs_from_table_hit
    :param max_hops: number of expansion levels allowed before the search stops
    :return: the DRS obtained by assembling the provenance of every found path
    """

    def assemble_table_path_provenance(o_drs, paths, relation):
        # Turns every found path (a list of (field, sibling) tuples) into
        # provenance absorbed by o_drs.
        # NOTE(review): every hop is recorded as OP.PKFK; the relation
        # argument is not read inside this helper.
        for path in paths:
            src, src_sibling = path[0]
            assert (src_sibling is None)  # sibling of source should be None, as source is an origin
            tgt, tgt_sibling = path[-1]
            origin = DRS([src], Operation(OP.ORIGIN))
            o_drs.absorb_provenance(origin)
            prev_c = src
            # Intermediate steps: link the previous field to the sibling it
            # joined with, then link that sibling to the field c of its table.
            for c, sibling in path[1:-1]:
                nxt = DRS([sibling], Operation(OP.PKFK, params=[prev_c]))
                o_drs.absorb_provenance(nxt)
                if c.nid != sibling.nid:  # avoid loop on head nodes of the graph
                    linker = DRS([c], Operation(OP.TABLE, params=[sibling]))
                    o_drs.absorb_provenance(linker)
                prev_c = c
            sink = DRS([tgt_sibling], Operation(OP.PKFK, params=[prev_c]))
            # NOTE(review): a path of length one keeps the None sibling the
            # candidates are seeded with, so tgt_sibling.nid looks like it
            # would fail in that case - confirm.
            if tgt.nid != tgt_sibling.nid:
                o_drs = o_drs.absorb_provenance(sink)
                linker = DRS([tgt], Operation(OP.TABLE, params=[tgt_sibling]))
                o_drs.absorb(linker)
            else:
                o_drs = o_drs.absorb(sink)
        return o_drs

    def check_membership(c, paths):
        # True when the table of c (its source_name) already appears in any
        # of the given partial paths.
        for p in paths:
            for (s, sibling) in p:
                if c.source_name == s.source_name:
                    return True
        return False

    def append_to_paths(paths, c):
        # Returns copies of all paths, each extended with c; the input paths
        # are left untouched.
        new_paths = []
        for p in paths:
            new_path = []
            new_path.extend(p)
            new_path.append(c)
            new_paths.append(new_path)
        return new_paths

    def get_table_neighbors(hit, relation, paths):
        # For every neighbor of hit whose table is not yet on the paths,
        # lists all fields of that neighbor's table.
        results = []
        direct_neighbors = self.neighbors_id(hit, relation)

        # FIXME: filter out already seen nodes here
        for n in direct_neighbors:
            if not check_membership(n, paths):
                t_neighbors = api.drs_from_table_hit(n)
                # Each field x is paired with n, the neighbor through which
                # its table was reached.
                results.extend([(x, n) for x in t_neighbors])
        return results

    def dfs_explore(sources, targets, max_hops, paths):
        # Recursive exploration that appends successful paths to the
        # enclosing found_paths list. The recursive call's return value is
        # not used, and the function returns None after the last loop.

        # Check if sources have reached targets
        for (s, sibling) in sources:
            if s in targets:
                # Append successful paths to found_paths
                next_paths = append_to_paths(paths, (s, sibling))
                found_paths.extend(next_paths)
                return True

        # Check if no more hops are allowed:
        if max_hops == 0:
            return False  # not found path

        # Get next set of candidates and keep exploration
        for (s, sibling) in sources:
            next_candidates = get_table_neighbors(s, relation, paths)  # updated paths to test membership
            # recursive on new candidates, one fewer hop and updated paths
            if len(next_candidates) == 0:
                continue
            next_paths = append_to_paths(paths, (s, sibling))
            dfs_explore(next_candidates, targets, max_hops - 1, next_paths)

    o_drs = DRS([], Operation(OP.NONE))  # Carrier of provenance

    # TODO: same src == trg, etc

    src_drs = api.drs_from_table(source)
    trg_drs = api.drs_from_table(target)

    found_paths = []  # filled by dfs_explore through its closure
    candidates = [(x, None) for x in src_drs]  # tuple carrying candidate and same-table attribute
    paths = [[]]  # to carry partial paths

    dfs_explore(candidates, [x for x in trg_drs], max_hops, paths)

    # Prints every found path to stdout.
    for p in found_paths:
        print(p)

    o_drs = assemble_table_path_provenance(o_drs, found_paths, relation)

    return o_drs
def drs_from_table_hit(self, hit: Hit) -> DRS:
    """
    Build a DRS with every hit of the table that ``hit`` belongs to.

    :param hit: a Hit whose source_name names the table to expand
    :return: a DRS of the table's hits, tagged with an OP.TABLE operation
        parameterized by ``hit``
    """
    # TODO: migrated from old ddapi as there's no good swap
    table_hits = self._network.get_hits_from_table(hit.source_name)
    return DRS(list(table_hits), Operation(OP.TABLE, params=[hit]))
def find_path_table(self, source: str, target: str, relation, api, max_hops=3):
    """
    Search for paths that connect the elements of ``source`` with the
    elements of ``target`` and record them as provenance of a DRS.

    :param source: source input, converted with api.make_drs
    :param target: target input, converted with api.make_drs
    :param relation: relation handed to self.neighbors_id when expanding a field
    :param api: object providing make_drs and drs_from_table_hit
    :param max_hops: number of expansion levels allowed before the search stops
    :return: the DRS obtained by assembling the provenance of every found path
    """

    def assemble_table_path_provenance(o_drs, paths, relation):
        # Turns every found path (a list of (field, sibling) tuples) into
        # provenance absorbed by o_drs.
        # NOTE(review): every hop is recorded as OP.PKFK; the relation
        # argument is not read inside this helper.
        for path in paths:
            src, src_sibling = path[0]
            assert (src_sibling is None)  # sibling of source should be None, as source is an origin
            tgt, tgt_sibling = path[-1]
            origin = DRS([src], Operation(OP.ORIGIN))
            o_drs.absorb_provenance(origin)
            prev_c = src
            # Intermediate steps: link the previous field to the sibling it
            # joined with, then link that sibling to the field c of its table.
            for c, sibling in path[1:-1]:
                nxt = DRS([sibling], Operation(OP.PKFK, params=[prev_c]))
                o_drs.absorb_provenance(nxt)
                if c.nid != sibling.nid:  # avoid loop on head nodes of the graph
                    linker = DRS([c], Operation(OP.TABLE, params=[sibling]))
                    o_drs.absorb_provenance(linker)
                prev_c = c
            sink = DRS([tgt_sibling], Operation(OP.PKFK, params=[prev_c]))

            # The join path at the target has None sibling
            if tgt is not None and tgt_sibling is not None and tgt.nid != tgt_sibling.nid:
                o_drs = o_drs.absorb_provenance(sink)
                linker = DRS([tgt], Operation(OP.TABLE, params=[tgt_sibling]))
                o_drs.absorb(linker)
            else:
                o_drs = o_drs.absorb(sink)
        return o_drs

    def check_membership(c, paths):
        # True when the table of c (its source_name) already appears in any
        # of the given partial paths.
        for p in paths:
            for (s, sibling) in p:
                if c.source_name == s.source_name:
                    return True
        return False

    def append_to_paths(paths, c):
        # Returns copies of all paths, each extended with c; the input paths
        # are left untouched.
        new_paths = []
        for p in paths:
            new_path = []
            new_path.extend(p)
            new_path.append(c)
            new_paths.append(new_path)
        return new_paths

    def get_table_neighbors(hit, relation, paths):
        # For every neighbor of hit that lives in another table not yet on
        # the paths, lists all fields of that neighbor's table.
        results = []
        direct_neighbors = self.neighbors_id(hit, relation)

        # Rewriting results - filtering out results that are in the same table as the input. Rewriting prov
        direct_neighbors_list = [neigh for neigh in direct_neighbors if neigh.source_name != hit.source_name]
        op = self.get_op_from_relation(relation)
        direct_neighbors = DRS(direct_neighbors_list, Operation(op, params=[hit]))

        # FIXME: filter out already seen nodes here
        for n in direct_neighbors:
            if not check_membership(n, paths):
                t_neighbors = api.drs_from_table_hit(n)  # Brought old API
                # t_neighbors = api.make_drs(n)  # XXX: this won't take all table neighbors, only the input one
                # Each field x is paired with n, the neighbor through which
                # its table was reached.
                results.extend([(x, n) for x in t_neighbors])
        return results

    def dfs_explore(sources, targets, max_hops, paths):
        # Recursive exploration that appends successful paths to the
        # enclosing found_paths list. The recursive call's return value is
        # not used, and the function returns None after the last loop.

        # Check if sources have reached targets
        for (s, sibling) in sources:
            if s in targets:
                # Append successful paths to found_paths
                # T1.A join T2.B, and T2.C may join with other tables T3.D
                # get_table_neighbors returns next_candidates (s, sibling) (C,B)
                # in case T2 is the target add to the path (sibling, sibling)
                # Otherwise (C,B)
                if s.source_name == targets[0].source_name:
                    next_paths = append_to_paths(paths, (sibling, sibling))
                else:
                    next_paths = append_to_paths(paths, (s, sibling))
                found_paths.extend(next_paths)
                return True

        # Check if no more hops are allowed:
        if max_hops == 0:
            return False  # not found path

        # Get next set of candidates and keep exploration
        for (s, sibling) in sources:
            next_candidates = get_table_neighbors(s, relation, paths)  # updated paths to test membership
            # recursive on new candidates, one fewer hop and updated paths
            if len(next_candidates) == 0:
                continue
            next_paths = append_to_paths(paths, (s, sibling))
            dfs_explore(next_candidates, targets, max_hops - 1, next_paths)

    o_drs = DRS([], Operation(OP.NONE))  # Carrier of provenance

    # TODO: same src == trg, etc

    # Previously resolved with api.drs_from_table(source) / api.drs_from_table(target).
    src_drs = api.make_drs(source)
    trg_drs = api.make_drs(target)

    found_paths = []  # filled by dfs_explore through its closure
    candidates = [(x, None) for x in src_drs]  # tuple carrying candidate and same-table attribute
    paths = [[]]  # to carry partial paths

    dfs_explore(candidates, [x for x in trg_drs], max_hops, paths)

    o_drs = assemble_table_path_provenance(o_drs, found_paths, relation)

    return o_drs
def find_path_hit(self, source, target, relation, max_hops=5):
    """
    Search for a path from ``source`` to ``target`` following ``relation``
    and record it as provenance of a DRS.

    :param source: the node where the exploration starts
    :param target: the node to reach
    :param relation: relation handed to self.neighbors_id when expanding a node
    :param max_hops: maximum exploration depth; the value given by the caller
        is honored
    :return: a DRS carrying the provenance of the found path, or an empty
        DRS when no path is found within ``max_hops``
    """

    def assemble_field_path_provenance(o_drs, path, relation):
        # Chains the nodes of path as provenance: origin -> ... -> sink.
        # NOTE(review): every hop is recorded as OP.PKFK; the relation
        # argument is not read inside this helper.
        src = path[0]
        tgt = path[-1]
        origin = DRS([src], Operation(OP.ORIGIN))
        o_drs.absorb_provenance(origin)
        prev_c = src
        for c in path[1:-1]:
            nxt = DRS([c], Operation(OP.PKFK, params=[prev_c]))
            o_drs.absorb_provenance(nxt)
            prev_c = c
        sink = DRS([tgt], Operation(OP.PKFK, params=[prev_c]))
        o_drs = o_drs.absorb(sink)
        return o_drs

    def deep_explore(candidates, target_group, already_visited, path, max_hops):
        """
        Recursively depth-first explore the graph, checking if candidates
        are in target_group. On success the nodes of the path are inserted
        at the front of ``path`` while the recursion unwinds.
        Returns True when a target was reached, False otherwise.
        """
        # A non-positive budget means no further exploration is allowed.
        if max_hops <= 0:
            return False

        # first check membership
        for c in candidates:
            if c in target_group:
                path.insert(0, c)
                return True

        # if not, then we explore these individually
        for c in candidates:
            if c in already_visited:
                continue  # next candidate
            already_visited.append(c)  # add candidate to set of already visited

            next_level_candidates = [x for x in self.neighbors_id(c, relation)]  # get next set of candidates
            if not next_level_candidates:
                continue

            # reduce one level depth and go ahead
            success = deep_explore(
                next_level_candidates, target_group, already_visited, path, max_hops - 1)
            if success:
                path.insert(0, c)
                return True
        return False  # if all nodes were already visited

    # TODO: same src == trg, etc

    path = []
    if deep_explore([source], [target], [], path, max_hops):
        o_drs = DRS([], Operation(OP.NONE))  # Carrier of provenance
        return assemble_field_path_provenance(o_drs, path, relation)
    return DRS([], Operation(OP.NONE))
def drs_from_hit(self, hit: Hit) -> DRS:
    """
    Wrap a single hit into a DRS.

    :param hit: the Hit to wrap
    :return: a DRS containing only ``hit``, tagged with an OP.ORIGIN operation
    """
    return DRS([hit], Operation(OP.ORIGIN))
def drs_from_hits(self, hits: [Hit]) -> DRS:
    """
    Wrap a collection of hits into a DRS.

    :param hits: the hits to wrap; passed to the DRS as given
    :return: a DRS over ``hits``, tagged with an OP.ORIGIN operation
    """
    origin_op = Operation(OP.ORIGIN)
    return DRS(hits, origin_op)
def drs_from_table_hit(self, hit: Hit) -> DRS:
    """
    Build a DRS with every hit of the table that ``hit`` belongs to.

    :param hit: a Hit whose source_name names the table to expand
    :return: a DRS of the table's hits, tagged with an OP.TABLE operation
        parameterized by ``hit``
    """
    table_hits = self.__network.get_hits_from_table(hit.source_name)
    return DRS(list(table_hits), Operation(OP.TABLE, params=[hit]))