예제 #1
0
 def schema_neighbors(self, field: (str, str, str)) -> DRS:
     """
     Collect the fields stored in the same relation as the given field.

     :param field: a (db_name, source_name, field_name) triple
     :return: a DRS built from the hits the network reports for ``source_name``;
         its TABLE operation is parameterised with a Hit describing ``field``
     """
     db_name, source_name, field_name = field
     table_hits = self.__network.get_hits_from_table(source_name)
     # Hit for the input field (score 0); it is only used as the provenance parameter
     origin = Hit(id_from(db_name, source_name, field_name),
                  db_name, source_name, field_name, 0)
     return DRS(list(table_hits), Operation(OP.TABLE, params=[origin]))
예제 #2
0
    def test_union(self):
        # Union of two ORIGIN DRSs that share one hit (nid 1, table_a.b):
        # the 5 + 3 input hits are expected to yield 7 elements.
        print(self._testMethodName)

        # DRS 1
        h0 = Hit(10, "dba", "table_c", "v", -1)

        h1 = Hit(0, "dba", "table_a", "a", -1)
        h2 = Hit(1, "dba", "table_a", "b", -1)
        h3 = Hit(2, "dba", "table_b", "c", -1)
        h4 = Hit(3, "dba", "table_b", "d", -1)
        drs1 = DRS([h0, h1, h2, h3, h4], Operation(OP.ORIGIN))

        # DRS 2
        h5 = Hit(1, "dba", "table_a", "b", -1)

        h6 = Hit(16, "dba", "table_d", "a", -1)
        h7 = Hit(17, "dba", "table_d", "b", -1)
        drs2 = DRS([h5, h6, h7], Operation(OP.ORIGIN))

        drs = drs1.union(drs2)

        # Dump the provenance graph of the union for manual inspection
        prov_graph = drs.get_provenance().prov_graph()
        nodes = prov_graph.nodes()
        print("NODES")
        for n in nodes:
            print(str(n))
        print(" ")
        edges = prov_graph.edges(keys=True)
        print("EDGES")
        for e in edges:
            print(str(e))
        print(" ")

        data = [x for x in drs]
        ld = len(data)

        print("Len must be 7: " + str(ld))

        # assertEqual reports the actual length on failure, unlike assertTrue(ld == 7)
        self.assertEqual(ld, 7)
예제 #3
0
 def entity_search(self, kw: str, max_results=10) -> DRS:
     """
     Run a keyword lookup restricted to entities (``KWType.KW_ENTITIES``).

     :param kw: the keyword to look up
     :param max_results: forwarded to the store client as the result limit
     :return: a DRS holding the hits, tagged with an ENTITY_LOOKUP operation
         parameterised by ``kw``
     """
     found = store_client.search_keywords(kw, KWType.KW_ENTITIES, max_results)
     # drain the store client's result into a list before wrapping it
     materialized = list(found)
     return DRS(materialized, Operation(OP.ENTITY_LOOKUP, params=[kw]))
예제 #4
0
 def traverse(self, a: DRS, primitives, max_hops) -> DRS:
     """
     Expand the input by repeatedly following ``primitives`` neighbours.

     Each round asks the network for the neighbours of every element in the
     current fringe and unions them into the output; the next fringe is the
     content of the output accumulated so far.

     :param a: the starting DRS; TABLE mode is not supported
     :param primitives: relation handed to the network's ``neighbors_id``
     :param max_hops: number of expansion rounds to run
     :return: the accumulated DRS; an empty DRS when ``a`` is in TABLE mode
     """
     o_drs = DRS([], Operation(OP.NONE))
     if a.mode == DRSMode.TABLE:
         print("ERROR: input mode TABLE not supported")
         # Return an empty DRS (not a bare list) so the declared return type
         # holds and callers can keep using DRS methods on the result.
         return o_drs
     fringe = list(a)
     o_drs.absorb_provenance(a)
     while max_hops > 0:
         max_hops = max_hops - 1
         for h in fringe:
             hits_drs = self.__network.neighbors_id(h, primitives)
             o_drs = self.union(o_drs, hits_drs)
         # snapshot the output so the next round iterates a stable list
         fringe = list(o_drs)
     return o_drs
예제 #5
0
    def test_drs_table_iteration(self):
        # Smoke test: iterating a DRS after set_table_mode() must not raise.
        print(self._testMethodName)

        first = Hit(0, "dba", "table_a", "a", -1)
        second = Hit(1, "dba", "table_a", "b", -1)
        third = Hit(2, "dba", "table_b", "c", -1)
        fourth = Hit(3, "dba", "table_b", "d", -1)
        origin_op = Operation(OP.ORIGIN)
        drs = DRS([first, second, third, fourth], origin_op)
        drs.set_table_mode()

        for element in drs:
            print(str(element))

        # nothing is checked beyond reaching this line
        self.assertTrue(True)
예제 #6
0
 def md_neighbors_id(self, hit: Hit, md_neighbors: MRS, relation: Relation) -> DRS:
     """
     Wrap the metadata neighbours of ``hit`` into a DRS.

     :param hit: the origin node, either a Hit or its node id as a str
     :param md_neighbors: iterable of elements exposing ``source`` and
         ``target`` node ids
     :param relation: relation used to pick the provenance operation
     :return: a DRS with one Hit (score 1.0) per metadata neighbour, whose
         operation is parameterised with the origin ``hit``
     :raises TypeError: if ``hit`` is neither a Hit nor a str
     """
     if isinstance(hit, Hit):
         nid = str(hit.nid)
     elif isinstance(hit, str):
         nid = hit
     else:
         # previously this fell through and failed later with an unbound ``nid``
         raise TypeError("hit must be a Hit or a str node id, got "
                         + type(hit).__name__)
     data = []
     score = 1.0  # TODO: return more meaningful score results
     # The loop variable must not be called ``hit``: that shadowed the parameter
     # and made the provenance below point at the last neighbour, not the origin.
     for md_hit in md_neighbors:
         # take whichever endpoint of the metadata edge is not the origin node
         k = md_hit.target if md_hit.target != nid else md_hit.source
         (db_name, source_name, field_name, data_type) = self.__id_names[k]
         data.append(Hit(k, db_name, source_name, field_name, score))
     op = self.get_op_from_relation(relation)
     o_drs = DRS(data, Operation(op, params=[hit]))
     return o_drs
예제 #7
0
        def get_table_neighbors(hit, relation, paths):
            # Returns (field, linking_neighbor) pairs: for every neighbour of `hit`
            # that lives in a different table not yet present in `paths`, all the
            # fields of that neighbour's table paired with the neighbour itself.
            pairs = []

            # Keep only neighbours outside the input's own table, then rebuild the
            # DRS so its provenance is rewritten over the filtered list.
            other_table_hits = [cand for cand in self.neighbors_id(hit, relation)
                                if cand.source_name != hit.source_name]
            relation_op = self.get_op_from_relation(relation)
            filtered = DRS(other_table_hits, Operation(relation_op, params=[hit]))

            # FIXME: filter out already seen nodes here
            for neighbor in filtered:
                if check_membership(neighbor, paths):
                    continue
                table_fields = api.drs_from_table_hit(neighbor)
                # the neighbour is carried along as the sibling of each field
                pairs.extend([(table_field, neighbor) for table_field in table_fields])
            return pairs
예제 #8
0
    def paths(self,
              drs_a: DRS,
              drs_b: DRS,
              relation=Relation.PKFK,
              max_hops=2) -> DRS:
        """
        Search for transitive ``relation`` paths from the elements of
        ``drs_a`` to the elements of ``drs_b``.

        Every (element of a, element of b) pair is handed to the network's
        path finder; the finder used depends on the mode of ``drs_a``.

        :param drs_a: origin side, converted with ``_general_to_drs``
        :param drs_b: target side, converted with ``_general_to_drs``
        :param relation: the (single) relation the paths are constrained to
        :param max_hops: forwarded to the network path finder
        :return: a DRS that absorbed the provenance of the inputs and the
            result of every pairwise search
        """
        origin_drs = self._general_to_drs(drs_a)
        target_drs = self._general_to_drs(drs_b)

        self._assert_same_mode(origin_drs, target_drs)

        # start from an empty carrier and take over the provenance of the inputs
        # (the target's only when it differs from the origin)
        result = DRS([], Operation(OP.NONE))
        result.absorb_provenance(origin_drs)
        if target_drs != origin_drs:
            result.absorb_provenance(target_drs)

        for start, end in itertools.product(origin_drs, target_drs):
            # field mode and table mode use different network operations
            if origin_drs.mode == DRSMode.FIELDS:
                found = self._network.find_path_hit(
                    start, end, relation, max_hops=max_hops)
            else:
                found = self._network.find_path_table(
                    start, end, relation, self, max_hops=max_hops)
            result = result.absorb(found)

        return result
예제 #9
0
 def neighbors_id(self, hit: Hit, relation: Relation) -> DRS:
     """
     Return the graph neighbours of ``hit`` that are linked through ``relation``.

     :param hit: the origin node, either a Hit or its node id as a str
     :param relation: key looked up in each neighbour's edge data
     :return: a DRS with one Hit per matching neighbour, scored with the
         ``'score'`` stored on the edge; its operation is derived from
         ``relation`` and parameterised with ``hit``
     :raises TypeError: if ``hit`` is neither a Hit nor a str
     """
     if isinstance(hit, Hit):
         nid = str(hit.nid)
     elif isinstance(hit, str):
         nid = hit
     else:
         # previously this fell through and failed with an unbound ``nid``
         raise TypeError("hit must be a Hit or a str node id, got "
                         + type(hit).__name__)
     data = []
     neighbours = self.__G[nid]
     for k, v in neighbours.items():
         # v holds the edge data between nid and k; skip other relations
         if relation in v:
             score = v[relation]['score']
             (db_name, source_name, field_name, data_type) = self.__id_names[k]
             data.append(Hit(k, db_name, source_name, field_name, score))
     op = self.get_op_from_relation(relation)
     return DRS(data, Operation(op, params=[hit]))
예제 #10
0
 def similar_content_to(self, i_drs: DRS) -> DRS:
     """
     Find the CONTENT_SIM neighbours of the elements of the input.

     A TABLE-mode input is first switched to fields mode (on the input object
     itself) and absorbs, for each of its elements, the DRS returned by
     ``drs_from_table_hit``.

     :param i_drs: the input DRS
     :return: a DRS that absorbed the provenance of the input and the
         neighbours found for each of its elements
     """
     result = DRS([], Operation(OP.NONE)).absorb_provenance(i_drs)
     if i_drs.mode == DRSMode.TABLE:
         i_drs.set_fields_mode()
         for table_hit in i_drs:
             i_drs = i_drs.absorb(self.drs_from_table_hit(table_hit))
     for field_hit in i_drs:
         similar = self.__network.neighbors_id(field_hit, Relation.CONTENT_SIM)
         result = result.absorb(similar)
     return result
예제 #11
0
    def search(self, kw: str, kw_type: KWType, max_results=10) -> DRS:
        """
        Run a keyword search through the store client.

        :param kw: the keyword to search for
        :param kw_type: forwarded as ``elasticfieldname``; selects the context
            in which matches are looked for
        :param max_results: forwarded as ``max_hits``
        :return: a DRS of the hits, tagged with a KW_LOOKUP operation
            parameterised by ``kw``
        """
        found = self._store_client.search_keywords(
            keywords=kw, elasticfieldname=kw_type, max_hits=max_results)

        # drain the store client's result into a list before wrapping it
        materialized = list(found)
        return DRS(materialized, Operation(OP.KW_LOOKUP, params=[kw]))
예제 #12
0
 def pkfk_of(self, i_drs: DRS) -> DRS:
     """
     Find the PKFK neighbours of the elements of the input.

     A TABLE-mode input is first switched to fields mode (on the input object
     itself) and absorbs, for each of its elements, the DRS returned by
     ``drs_from_table_hit``.

     :param i_drs: the input DRS
     :return: a DRS that absorbed the provenance of the input and the
         neighbours found for each of its elements
     """
     # provenance is propagated by absorbing it into an empty carrier
     result = DRS([], Operation(OP.NONE)).absorb_provenance(i_drs)
     if i_drs.mode == DRSMode.TABLE:
         i_drs.set_fields_mode()
         for table_hit in i_drs:
             i_drs = i_drs.absorb(self.drs_from_table_hit(table_hit))
     for field_hit in i_drs:
         linked = self.__network.neighbors_id(field_hit, Relation.PKFK)
         result = result.absorb(linked)
     return result
예제 #13
0
    def __traverse(self, a: DRS, primitive, max_hops=2) -> DRS:
        """
        Expand an initial DRS round by round along ``primitive`` neighbours.

        In each round the neighbours of every element of the current frontier
        are unioned into the output, which then becomes the next frontier.

        :param a: anything ``_general_to_drs`` accepts
        :param primitive: relation handed to the network's ``neighbors_id``
        :param max_hops: number of rounds to run
        :return: the accumulated DRS
        :raises ValueError: if the converted input is in TABLE mode
        """
        start = self._general_to_drs(a)

        reached = DRS([], Operation(OP.NONE))

        if start.mode == DRSMode.TABLE:
            raise ValueError('input mode DRSMode.TABLE not supported')

        frontier = start
        reached.absorb_provenance(start)
        rounds_left = max_hops
        while rounds_left > 0:
            rounds_left = rounds_left - 1
            for node in frontier:
                step = self._network.neighbors_id(node, primitive)
                reached = self.union(reached, step)
            frontier = reached  # next round starts from everything gathered so far
        return reached
예제 #14
0
 def paths_between(self, a: DRS, b: DRS, primitives, max_hops=2) -> DRS:
     """
     Search for transitive paths from the elements of ``a`` to the elements
     of ``b``, constrained to the given primitive (singular for now).

     Pairs made of the same element on both sides are skipped; every other
     pair is handed to the network path finder matching the inputs' mode.

     :param a: origin DRS
     :param b: target DRS; must be in the same mode as ``a``
     :param primitives: relation forwarded to the path finder
     :param max_hops: forwarded to the path finder
     :return: a DRS that absorbed the provenance of both inputs and the
         result of every pairwise search
     """
     assert (a.mode == b.mode)
     o_drs = DRS([], Operation(OP.NONE))
     o_drs.absorb_provenance(a)
     o_drs.absorb_provenance(b)
     if a.mode == DRSMode.FIELDS:
         for h1 in a:  # h1 is a Hit
             for h2 in b:  # h2 is a Hit
                 if h1 == h2:
                     # Same source and target field: nothing to search for this
                     # pair. Skip it rather than returning, which used to drop
                     # every remaining pair depending on iteration order.
                     continue
                 res_drs = self.__network.find_path_hit(h1,
                                                        h2,
                                                        primitives,
                                                        max_hops=max_hops)
                 o_drs = o_drs.absorb(res_drs)
     elif a.mode == DRSMode.TABLE:
         for h1 in a:  # h1 is a table: str
             for h2 in b:  # h2 is a table: str
                 if h1 == h2:
                     # same source and target table: skip this pair only
                     continue
                 res_drs = self.__network.find_path_table(h1,
                                                          h2,
                                                          primitives,
                                                          self,
                                                          max_hops=max_hops)
                 o_drs = o_drs.absorb(res_drs)
     return o_drs
예제 #15
0
    def test_creation_initial_provenance(self):
        # Smoke test: build a DRS from a CONTENT_SIM operation and dump the
        # provenance graph it starts with; no value is asserted.
        print(self._testMethodName)

        origin = Hit(10, "dba", "table_c", "v", -1)

        similar_a = Hit(0, "dba", "table_a", "a", -1)
        similar_b = Hit(1, "dba", "table_a", "b", -1)
        similar_c = Hit(2, "dba", "table_b", "c", -1)
        similar_d = Hit(3, "dba", "table_b", "d", -1)
        creation_op = Operation(OP.CONTENT_SIM, params=[origin])
        drs = DRS([similar_a, similar_b, similar_c, similar_d], creation_op)

        graph = drs.get_provenance().prov_graph()
        graph_nodes = graph.nodes()
        print("NODES")
        for node in graph_nodes:
            print(str(node))
        print(" ")
        graph_edges = graph.edges(keys=True)
        print("EDGES")
        for edge in graph_edges:
            print(str(edge))
        print(" ")

        self.assertTrue(True)
예제 #16
0
 def paths(self, a: DRS, primitives) -> DRS:
     """
     Search for transitive paths between the elements of a single DRS,
     constrained to the given primitive (singular for now).

     In FIELDS mode a pair made of the same element twice is skipped; in
     TABLE mode every ordered pair is searched.

     :param a: the DRS whose elements are paired with each other
     :param primitives: relation forwarded to the network path finder
     :return: a DRS that absorbed the provenance of ``a`` and the result of
         every pairwise search
     """
     result = DRS([], Operation(OP.NONE)).absorb_provenance(a)
     if a.mode == DRSMode.FIELDS:
         for first in a:  # Hit
             for second in a:  # Hit
                 if first == second:
                     continue
                 found = self.__network.find_path_hit(first, second, primitives)
                 result = result.absorb(found)
     elif a.mode == DRSMode.TABLE:
         for first in a:  # table: str
             for second in a:  # table: str
                 found = self.__network.find_path_table(first, second,
                                                        primitives, self)
                 result = result.absorb(found)
     return result
예제 #17
0
    def find_path_table(self, source: str, target: str, relation, api, max_hops=3):
        """
        Look for paths linking table ``source`` to table ``target`` and return
        them encoded as the provenance of a DRS.

        :param source: origin table, resolved with ``api.drs_from_table``
        :param target: destination table, resolved with ``api.drs_from_table``
        :param relation: relation handed to ``neighbors_id`` when expanding a node
        :param api: object providing ``drs_from_table`` and ``drs_from_table_hit``
        :param max_hops: maximum recursion depth of the exploration
        :return: the DRS produced by ``assemble_table_path_provenance``
        """

        def assemble_table_path_provenance(o_drs, paths, relation):
            """
            Fold every path into ``o_drs`` and return it.

            Each path is a list of (node, sibling) tuples. ``relation`` is not
            used here: the link operations are hard-coded to OP.PKFK / OP.TABLE.
            """

            for path in paths:
                src, src_sibling = path[0]
                assert (src_sibling is None)  # sibling of source should be None, as source is an origin
                tgt, tgt_sibling = path[-1]
                origin = DRS([src], Operation(OP.ORIGIN))
                o_drs.absorb_provenance(origin)
                prev_c = src
                # intermediate steps: previous node -> sibling, then sibling -> node of the same table
                for c, sibling in path[1:-1]:
                    nxt = DRS([sibling], Operation(OP.PKFK, params=[prev_c]))
                    o_drs.absorb_provenance(nxt)
                    if c.nid != sibling.nid:  # avoid loop on head nodes of the graph
                        linker = DRS([c], Operation(OP.TABLE, params=[sibling]))
                        o_drs.absorb_provenance(linker)
                    prev_c = c
                sink = DRS([tgt_sibling], Operation(OP.PKFK, params=[prev_c]))

                # NOTE(review): a source that is itself a target yields a one-element
                # path whose sibling is None (see `candidates` below), in which case
                # `tgt_sibling.nid` would raise AttributeError - confirm.
                if tgt.nid != tgt_sibling.nid:
                    o_drs = o_drs.absorb_provenance(sink)
                    linker = DRS([tgt], Operation(OP.TABLE, params=[tgt_sibling]))
                    # NOTE(review): the result of this absorb is discarded, unlike the
                    # other branch - presumably absorb works in place; confirm.
                    o_drs.absorb(linker)
                else:
                    o_drs = o_drs.absorb(sink)
            return o_drs

        def check_membership(c, paths):
            """Return True if any node already on any path has the same source_name as ``c``."""
            for p in paths:
                for (s, sibling) in p:
                    if c.source_name == s.source_name:
                        return True
            return False

        def append_to_paths(paths, c):
            """Return copies of every path in ``paths`` with ``c`` appended; the inputs are left untouched."""
            new_paths = []
            for p in paths:
                new_path = []
                new_path.extend(p)
                new_path.append(c)
                new_paths.append(new_path)
            return new_paths

        def get_table_neighbors(hit, relation, paths):
            """
            Return (field, neighbor) tuples: for each neighbour of ``hit`` whose
            table is not yet on ``paths``, every hit of ``api.drs_from_table_hit``
            paired with that neighbour.
            """
            results = []
            direct_neighbors = self.neighbors_id(hit, relation)
            # FIXME: filter out already seen nodes here
            for n in direct_neighbors:
                if not check_membership(n, paths):
                    t_neighbors = api.drs_from_table_hit(n)
                    results.extend([(x, n) for x in t_neighbors])
            return results  # the neighbour n is carried as the sibling of each x

        def dfs_explore(sources, targets, max_hops, paths):
            """
            Recursively expand ``sources`` until one of them is in ``targets``.

            Completed paths are appended to ``found_paths`` from the enclosing
            scope. Returns True on a match and False when no hops are left;
            after recursing it falls through and returns None. Neither call
            site in this function uses the return value.
            """

            # Check if sources have reached targets
            for (s, sibling) in sources:
                if s in targets:
                    # Append successful paths to found_paths
                    next_paths = append_to_paths(paths, (s, sibling))
                    found_paths.extend(next_paths)
                    # only the first matching source of this level is recorded
                    return True

            # Check if no more hops are allowed:
            if max_hops == 0:
                return False  # not found path

            # Get next set of candidates and keep exploration
            for (s, sibling) in sources:
                next_candidates = get_table_neighbors(s, relation, paths)  # updated paths to test membership
                # recursive on new candidates, one fewer hop and updated paths
                if len(next_candidates) == 0:
                    continue
                next_paths = append_to_paths(paths, (s, sibling))
                dfs_explore(next_candidates, targets, max_hops - 1, next_paths)

        o_drs = DRS([], Operation(OP.NONE))  # Carrier of provenance

        # TODO: same src == trg, etc

        src_drs = api.drs_from_table(source)
        trg_drs = api.drs_from_table(target)

        found_paths = []  # filled by dfs_explore through its closure
        candidates = [(x, None) for x in src_drs]  # tuple carrying candidate and same-table attribute

        paths = [[]]  # to carry partial paths

        dfs_explore(candidates, [x for x in trg_drs], max_hops, paths)

        # debug output: every found path is written to stdout
        for p in found_paths:
            print(p)

        o_drs = assemble_table_path_provenance(o_drs, found_paths, relation)

        return o_drs
예제 #18
0
 def drs_from_table_hit(self, hit: Hit) -> DRS:
     """
     Build a DRS from the hits the network reports for the table of ``hit``.

     :param hit: a Hit whose ``source_name`` names the table
     :return: a DRS tagged with a TABLE operation parameterised by ``hit``
     """
     # TODO: migrated from old ddapi as there's no good swap
     table_hits = self._network.get_hits_from_table(hit.source_name)
     return DRS(list(table_hits), Operation(OP.TABLE, params=[hit]))
예제 #19
0
    def find_path_table(self, source: str, target: str, relation, api, max_hops=3):
        """
        Look for paths linking table ``source`` to table ``target`` and return
        them encoded as the provenance of a DRS.

        :param source: origin table, resolved with ``api.make_drs``
        :param target: destination table, resolved with ``api.make_drs``
        :param relation: relation handed to ``neighbors_id`` when expanding a node
        :param api: object providing ``make_drs`` and ``drs_from_table_hit``
        :param max_hops: maximum recursion depth of the exploration
        :return: the DRS produced by ``assemble_table_path_provenance``
        """

        def assemble_table_path_provenance(o_drs, paths, relation):
            """
            Fold every path into ``o_drs`` and return it.

            Each path is a list of (node, sibling) tuples. ``relation`` is not
            used here: the link operations are hard-coded to OP.PKFK / OP.TABLE.
            """

            for path in paths:
                src, src_sibling = path[0]
                assert (src_sibling is None)  # sibling of source should be None, as source is an origin
                tgt, tgt_sibling = path[-1]
                origin = DRS([src], Operation(OP.ORIGIN))
                o_drs.absorb_provenance(origin)
                prev_c = src
                # intermediate steps: previous node -> sibling, then sibling -> node of the same table
                for c, sibling in path[1:-1]:
                    nxt = DRS([sibling], Operation(OP.PKFK, params=[prev_c]))
                    o_drs.absorb_provenance(nxt)
                    if c.nid != sibling.nid:  # avoid loop on head nodes of the graph
                        linker = DRS([c], Operation(OP.TABLE, params=[sibling]))
                        o_drs.absorb_provenance(linker)
                    prev_c = c
                sink = DRS([tgt_sibling], Operation(OP.PKFK, params=[prev_c]))

                # The last element of a path may carry None as node and/or sibling
                # (a first-level match has sibling None); guard before reading .nid.
                if tgt is not None and tgt_sibling is not None and tgt.nid != tgt_sibling.nid:
                    o_drs = o_drs.absorb_provenance(sink)
                    linker = DRS([tgt], Operation(OP.TABLE, params=[tgt_sibling]))
                    # NOTE(review): the result of this absorb is discarded, unlike the
                    # other branch - presumably absorb works in place; confirm.
                    o_drs.absorb(linker)
                else:
                    o_drs = o_drs.absorb(sink)
            return o_drs

        def check_membership(c, paths):
            """Return True if any node already on any path has the same source_name as ``c``."""
            for p in paths:
                for (s, sibling) in p:
                    if c.source_name == s.source_name:
                        return True
            return False

        def append_to_paths(paths, c):
            """Return copies of every path in ``paths`` with ``c`` appended; the inputs are left untouched."""
            new_paths = []
            for p in paths:
                new_path = []
                new_path.extend(p)
                new_path.append(c)
                new_paths.append(new_path)
            return new_paths

        def get_table_neighbors(hit, relation, paths):
            """
            Return (field, neighbor) tuples: for each neighbour of ``hit`` in a
            different table that is not yet on ``paths``, every hit of
            ``api.drs_from_table_hit`` paired with that neighbour.
            """
            results = []
            direct_neighbors = self.neighbors_id(hit, relation)

            # Rewriting results - filtering out results that are in the same table as the input. Rewriting prov
            direct_neighbors_list = [neigh for neigh in direct_neighbors if neigh.source_name != hit.source_name]
            op = self.get_op_from_relation(relation)
            direct_neighbors = DRS(direct_neighbors_list, Operation(op, params=[hit]))

            # FIXME: filter out already seen nodes here
            for n in direct_neighbors:
                if not check_membership(n, paths):
                    t_neighbors = api.drs_from_table_hit(n)  # Brought old API
                    # t_neighbors = api.make_drs(n)  # XXX: this won't take all table neighbors, only the input one
                    results.extend([(x, n) for x in t_neighbors])
            return results  # the neighbour n is carried as the sibling of each x

        def dfs_explore(sources, targets, max_hops, paths):
            """
            Recursively expand ``sources`` until one of them is in ``targets``.

            Completed paths are appended to ``found_paths`` from the enclosing
            scope. Returns True on a match and False when no hops are left;
            after recursing it falls through and returns None. Neither call
            site in this function uses the return value.
            """

            # Check if sources have reached targets
            for (s, sibling) in sources:
                if s in targets:
                    # Append successful paths to found_paths
                    # T1.A join T2.B, and T2.C may join with other tables T3.D
                    # get_table_neighbors returns next_candidates (s, sibling) (C,B)
                    # in case T2 is the target add to the path (sibling, sibling)
                    # Otherwise (C,B)
                    if s.source_name == targets[0].source_name:
                        next_paths = append_to_paths(paths, (sibling, sibling))
                    else:
                        next_paths = append_to_paths(paths, (s, sibling))
                    found_paths.extend(next_paths)
                    # only the first matching source of this level is recorded
                    return True

            # Check if no more hops are allowed:
            if max_hops == 0:
                return False  # not found path

            # Get next set of candidates and keep exploration
            for (s, sibling) in sources:
                next_candidates = get_table_neighbors(s, relation, paths)  # updated paths to test membership
                # recursive on new candidates, one fewer hop and updated paths
                if len(next_candidates) == 0:
                    continue
                next_paths = append_to_paths(paths, (s, sibling))
                dfs_explore(next_candidates, targets, max_hops - 1, next_paths)

        o_drs = DRS([], Operation(OP.NONE))  # Carrier of provenance

        # TODO: same src == trg, etc

        # src_drs = api.drs_from_table(source)
        # trg_drs = api.drs_from_table(target)
        src_drs = api.make_drs(source)
        trg_drs = api.make_drs(target)

        found_paths = []  # filled by dfs_explore through its closure
        candidates = [(x, None) for x in src_drs]  # tuple carrying candidate and same-table attribute

        paths = [[]]  # to carry partial paths

        dfs_explore(candidates, [x for x in trg_drs], max_hops, paths)

        # for p in found_paths:
        #     print(p)

        o_drs = assemble_table_path_provenance(o_drs, found_paths, relation)

        return o_drs
예제 #20
0
    def find_path_hit(self, source, target, relation, max_hops=5):
        """
        Search depth-first for a path from ``source`` to ``target`` along
        ``relation`` and return it encoded as the provenance of a DRS.

        :param source: the node the search starts from
        :param target: the node to reach
        :param relation: relation handed to ``neighbors_id`` when expanding a node
        :param max_hops: depth budget of the search; the start node itself
            consumes one level
        :return: a DRS carrying the path when one is found, otherwise an
            empty DRS
        """

        def assemble_field_path_provenance(o_drs, path, relation):
            """
            Chain the nodes of ``path`` into ``o_drs`` and return it.

            NOTE(review): ``relation`` is not used; every link is recorded as
            OP.PKFK - confirm that is intended for other relations.
            """
            src = path[0]
            tgt = path[-1]
            origin = DRS([src], Operation(OP.ORIGIN))
            o_drs.absorb_provenance(origin)
            prev_c = src
            for c in path[1:-1]:
                nxt = DRS([c], Operation(OP.PKFK, params=[prev_c]))
                o_drs.absorb_provenance(nxt)
                prev_c = c
            sink = DRS([tgt], Operation(OP.PKFK, params=[prev_c]))
            o_drs = o_drs.absorb(sink)
            return o_drs

        def deep_explore(candidates, target_group, already_visited, path, max_hops):
            """
            Recursively depth-first explore the graph, checking if candidates
            are in target_group.

            On success the nodes of the path are inserted at the front of
            ``path`` while the recursion unwinds. Returns a boolean.
            """
            if max_hops == 0:
                return False

            # first check membership
            for c in candidates:
                if c in target_group:
                    path.insert(0, c)
                    return True

            # if not, then we explore these individually
            for c in candidates:
                if c in already_visited:
                    continue  # next candidate
                already_visited.append(c)  # add candidate to set of already visited

                next_level_candidates = list(self.neighbors_id(c, relation))  # get next set of candidates
                if not next_level_candidates:
                    continue
                # reduce one level depth and go ahead
                if deep_explore(next_level_candidates, target_group, already_visited, path, max_hops - 1):
                    path.insert(0, c)
                    return True
            return False  # if all nodes were already visited

        # The caller's ``max_hops`` is honoured; it used to be overwritten with
        # a hard-coded 5 here, which made the parameter a no-op.

        o_drs = DRS([], Operation(OP.NONE))  # Carrier of provenance

        # TODO: same src == trg, etc

        path = []

        if deep_explore([source], [target], [], path, max_hops):
            o_drs = assemble_field_path_provenance(o_drs, path, relation)
        # when nothing was found o_drs is still the empty carrier
        return o_drs
예제 #21
0
 def drs_from_hit(self, hit: Hit) -> DRS:
     """Wrap a single Hit in a DRS tagged with the ORIGIN operation."""
     return DRS([hit], Operation(OP.ORIGIN))
예제 #22
0
 def drs_from_hits(self, hits: [Hit]) -> DRS:
     """Wrap the given hits, passed through unchanged, in a DRS tagged with the ORIGIN operation."""
     origin_op = Operation(OP.ORIGIN)
     return DRS(hits, origin_op)
예제 #23
0
 def drs_from_table_hit(self, hit: Hit) -> DRS:
     """
     Build a DRS from the hits the network reports for the table of ``hit``.

     :param hit: a Hit whose ``source_name`` names the table
     :return: a DRS tagged with a TABLE operation parameterised by ``hit``
     """
     table_hits = self.__network.get_hits_from_table(hit.source_name)
     return DRS(list(table_hits), Operation(OP.TABLE, params=[hit]))