コード例 #1
0
ファイル: fieldnetwork.py プロジェクト: delftdata/valentine
        def assemble_table_path_provenance(o_drs, paths, relation):
            """
            Fold the provenance of every table-level join path into o_drs.

            Every element of a path is unpacked as a (column, sibling) pair;
            the first pair is treated as the origin and the last as the target.
            :param o_drs: DRS that accumulates the provenance
            :param paths: iterable of paths, each an indexable sequence of pairs
            :param relation: not used by this function
            :return: the DRS returned by the last absorb call, or o_drs itself
                when paths is empty
            """
            for hops in paths:
                first_col, first_sibling = hops[0]
                # the source is an origin, so it must not carry a sibling
                assert (first_sibling is None)
                last_col, last_sibling = hops[-1]

                o_drs.absorb_provenance(DRS([first_col], Operation(OP.ORIGIN)))

                previous = first_col
                for col, col_sibling in hops[1:-1]:
                    step = DRS([col_sibling],
                               Operation(OP.PKFK, params=[previous]))
                    o_drs.absorb_provenance(step)
                    # head nodes of the graph would otherwise link to themselves
                    if col.nid != col_sibling.nid:
                        table_link = DRS([col],
                                         Operation(OP.TABLE,
                                                   params=[col_sibling]))
                        o_drs.absorb_provenance(table_link)
                    previous = col

                sink = DRS([last_sibling],
                           Operation(OP.PKFK, params=[previous]))

                # the join path at the target may have a None sibling
                needs_table_link = (last_col is not None
                                    and last_sibling is not None
                                    and last_col.nid != last_sibling.nid)
                if not needs_table_link:
                    o_drs = o_drs.absorb(sink)
                    continue

                o_drs = o_drs.absorb_provenance(sink)
                target_link = DRS([last_col],
                                  Operation(OP.TABLE, params=[last_sibling]))
                o_drs.absorb(target_link)
            return o_drs
コード例 #2
0
 def drs_expand_to_table(self, drs: DRS) -> DRS:
     """
     Expand every hit of the input into the hits of its table.
     :param drs: the DRS whose hits are expanded
     :return: a DRS that absorbed, for each input hit, a table-level DRS
         built from get_hits_from_table(hit.source_name)
     """
     expanded = DRS([], Operation(OP.NONE))
     for hit in drs:
         table_hits = self.__network.get_hits_from_table(hit.source_name)
         table_drs = DRS(list(table_hits), Operation(OP.TABLE, params=[hit]))
         expanded.absorb(table_drs)
     return expanded
コード例 #3
0
ファイル: fieldnetwork.py プロジェクト: delftdata/valentine
 def assemble_field_path_provenance(o_drs, path, relation):
     """
     Fold the provenance of one field-level path into o_drs.
     :param o_drs: DRS that accumulates the provenance
     :param path: indexable sequence of nodes; first is the origin, last the target
     :param relation: not used by this function
     :return: the result of absorbing the target node into o_drs
     """
     first, last = path[0], path[-1]
     o_drs.absorb_provenance(DRS([first], Operation(OP.ORIGIN)))

     # chain each intermediate node to the one visited just before it
     previous = first
     for node in path[1:-1]:
         step = DRS([node], Operation(OP.PKFK, params=[previous]))
         o_drs.absorb_provenance(step)
         previous = node

     sink = DRS([last], Operation(OP.PKFK, params=[previous]))
     return o_drs.absorb(sink)
コード例 #4
0
 def schema_neighbors_of(self, i_drs: DRS) -> DRS:
     """
     Collect, for every hit of the input, the hits of the table it belongs to.
     :param i_drs: the input DRS; when it is in TABLE mode it is switched to
         fields mode and extended with the fields of each of its hits
     :return: a DRS carrying the input's provenance plus the table hits
     """
     result = DRS([], Operation(OP.NONE)).absorb_provenance(i_drs)

     if i_drs.mode == DRSMode.TABLE:
         i_drs.set_fields_mode()
         for table_hit in i_drs:
             i_drs = i_drs.absorb(self.drs_from_table_hit(table_hit))

     for field_hit in i_drs:
         same_table = self.__network.get_hits_from_table(
             field_hit.source_name)
         table_drs = DRS(list(same_table),
                         Operation(OP.TABLE, params=[field_hit]))
         result = result.absorb(table_drs)
     return result
コード例 #5
0
    def __neighbor_search(self, input_data, relation: Relation):
        """
        Find the neighbors of the input that are linked by the given relation.
        :param input_data: anything accepted by _general_to_drs
        :param relation: relation to follow; when relation.from_metadata() is
            true the neighbors are looked up through md_search
        :return: a DRS with the input's provenance and the neighbor hits
        """
        # normalise whatever was passed in to a DRS
        i_drs = self._general_to_drs(input_data)

        # output starts empty and inherits the provenance of the input
        o_drs = DRS([], Operation(OP.NONE)).absorb_provenance(i_drs)

        # table-mode input is handed to _general_to_field_drs; its return
        # value is not used here
        if i_drs.mode == DRSMode.TABLE:
            self._general_to_field_drs(i_drs)

        if relation.from_metadata():
            md_relation = self._relation_to_mdrelation(relation)
            for hit in i_drs:
                md_hits = self.md_search(hit, md_relation)
                o_drs = o_drs.absorb(
                    self._network.md_neighbors_id(hit, md_hits, relation))
            return o_drs

        for hit in i_drs:
            o_drs = o_drs.absorb(self._network.neighbors_id(hit, relation))
        return o_drs
コード例 #6
0
    def make_drs(self, general_input):
        """
        Make a DRS from general_input.

        general_input may be a single value accepted by _general_to_drs or a
        list of such values; a list is converted element by element and the
        results are merged with union.
        :param general_input: a single input or a list of inputs
        :return: the resulting DRS, or None after printing a usage message
            when the conversion raises an Exception
        """
        try:
            # If this is a list of inputs, condense it into a single drs
            if isinstance(general_input, list):
                converted = [self._general_to_drs(x) for x in general_input]

                combined_drs = DRS([], Operation(OP.NONE))
                for drs in converted:
                    combined_drs = self.union(combined_drs, drs)
                general_input = combined_drs

            # else, just convert it to a DRS
            return self._general_to_drs(general_input)
        except Exception:
            # Only ordinary errors are reported as a usage hint;
            # KeyboardInterrupt and SystemExit must keep propagating.
            msg = (
                '--- Error ---'
                '\nThis function returns domain result set from the '
                'supplied input'
                '\nusage:\n\tmake_drs( table name/hit id | [table name/hit '
                'id, drs/hit/string/int] )'
                '\ne.g.:\n\tmake_drs(1600820766)')
            print(msg)
            return None
コード例 #7
0
 def _drs_from_table_hit_lean_no_provenance(self, hit: Hit) -> DRS:
     """
     Build a lean DRS with every hit of the table that `hit` belongs to.
     :param hit: Hit whose source_name selects the table
     :return: a DRS constructed with lean_drs=True
     """
     # TODO: migrated from old ddapi as there's no good swap
     table_hits = self._network.get_hits_from_table(hit.source_name)
     return DRS(list(table_hits),
                Operation(OP.TABLE, params=[hit]),
                lean_drs=True)
コード例 #8
0
    def _general_to_drs(self, general_input) -> DRS:
        """
        Convert None, an nid, a table name, a node tuple, a Hit or a DRS
        into a DRS.

        The conversions run in a fixed order and feed into each other: an
        integer-like value becomes a Hit, a plain tuple becomes a Hit, and a
        Hit finally becomes a DRS.
        :param general_input: the value to convert
        :return: DRS
        :raises ValueError: when no conversion produced a DRS
        """
        # a DRS needs no work at all, so check for it before anything else
        if isinstance(general_input, DRS):
            return general_input

        candidate = general_input

        # None stands for an empty result set
        if candidate is None:
            candidate = DRS(data=[], operation=Operation(OP.NONE))

        # ints, or strings that represent integers, are node ids
        if self._represents_int(candidate):
            candidate = self._nid_to_hit(candidate)

        # any remaining string is taken as a table name
        if isinstance(candidate, str):
            table_hits = self._network.get_hits_from_table(candidate)
            candidate = DRS(list(table_hits), Operation(OP.ORIGIN))

        # tuples that are not Hits are node tuples
        is_plain_tuple = (isinstance(candidate, tuple)
                          and not isinstance(candidate, Hit))
        if is_plain_tuple:
            candidate = self._node_to_hit(candidate)

        if isinstance(candidate, Hit):
            field = candidate.field_name
            # a Hit without a field name stands for its whole table, so all
            # hits of that table have to be fetched
            whole_table = field == '' or field is None
            if whole_table:
                candidate = self._hit_to_drs(candidate, table_mode=True)
            else:
                candidate = self._hit_to_drs(candidate)

        if not isinstance(candidate, DRS):
            raise ValueError(
                'Input is not None, an integer, field tuple, Hit, or DRS')
        return candidate
コード例 #9
0
    def _hit_to_drs(self, hit: Hit, table_mode=False) -> DRS:
        """
        Wrap a Hit in a DRS.
        :param hit: Hit
        :param table_mode: when true, the DRS holds every hit of the table
            named by hit.source_name and is switched to table mode
        :return: DRS
        """
        if not table_mode:
            return DRS([hit], Operation(OP.ORIGIN))

        table_hits = self._network.get_hits_from_table(hit.source_name)
        table_drs = DRS(list(table_hits), Operation(OP.TABLE, params=[hit]))
        table_drs.set_table_mode()
        return table_drs
コード例 #10
0
 def drs_from_table(self, source: str) -> DRS:
     """
     Build a DRS holding every hit the network reports for a table.
     :param source: string with the name of the table
     :return: a DRS created with an OP.ORIGIN operation
     """
     table_hits = self.__network.get_hits_from_table(source)
     return DRS(list(table_hits), Operation(OP.ORIGIN))
コード例 #11
0
    def exact_search(self, kw: str, kw_type: KWType, max_results=10):
        """
        Look up a keyword through the store client's exact_search_keywords.
        :param kw: the keyword to look up
        :param kw_type: forwarded as elasticfieldname
        :param max_results: forwarded as max_hits
        :return: a DRS with the matches and an OP.KW_LOOKUP operation
        """
        matches = self._store_client.exact_search_keywords(
            keywords=kw, elasticfieldname=kw_type, max_hits=max_results)

        # the matches may be lazy, so they are materialised into a list
        return DRS(list(matches), Operation(OP.KW_LOOKUP, params=[kw]))
コード例 #12
0
 def table_names_search(self, kws: [str], max_results=10) -> DRS:
     """
     Run table_name_search for every keyword and merge the results.
     :param kws: collection (iterable) of keywords (strings)
     :param max_results: forwarded to each table_name_search call
     :return: a DRS that absorbed the result of every individual search
     """
     merged = DRS([], Operation(OP.NONE))
     for keyword in kws:
         merged = merged.absorb(
             self.table_name_search(keyword, max_results=max_results))
     return merged
コード例 #13
0
 def keyword_search(self, kw: str, max_results=10) -> DRS:
     """
     Search the store for a keyword using KWType.KW_CONTENT.
     :param kw: the keyword to search
     :param max_results: the maximum number of results to return
     :return: a DRS with the matches and an OP.KW_LOOKUP operation
     """
     matches = store_client.search_keywords(kw, KWType.KW_CONTENT,
                                            max_results)
     # the matches may be lazy, so they are materialised into a list
     return DRS(list(matches), Operation(OP.KW_LOOKUP, params=[kw]))
コード例 #14
0
 def schema_name_search(self, kw: str, max_results=10) -> DRS:
     """
     Search the store for a keyword using KWType.KW_SCHEMA.
     :param kw: the keyword to search
     :param max_results: the maximum number of results to return
     :return: a DRS with the matches and an OP.SCHNAME_LOOKUP operation
     """
     matches = store_client.search_keywords(kw, KWType.KW_SCHEMA,
                                            max_results)
     # the matches may be lazy, so they are materialised into a list
     return DRS(list(matches), Operation(OP.SCHNAME_LOOKUP, params=[kw]))
コード例 #15
0
 def schema_neighbors(self, field: (str, str, str)) -> DRS:
     """
     Return the hits of the table that the provided field belongs to.
     :param field: a (db_name, source_name, field_name) triple
     :return: a DRS whose OP.TABLE operation is parameterised with a Hit
         built from the provided field (score 0)
     """
     db_name, source_name, field_name = field
     table_hits = self.__network.get_hits_from_table(source_name)

     # the field that was asked about is recorded as the origin
     nid = id_from(db_name, source_name, field_name)
     origin_hit = Hit(nid, db_name, source_name, field_name, 0)

     return DRS(list(table_hits), Operation(OP.TABLE, params=[origin_hit]))
コード例 #16
0
 def entity_search(self, kw: str, max_results=10) -> DRS:
     """
     Search the store for a keyword using KWType.KW_ENTITIES.
     :param kw: the keyword to search
     :param max_results: the maximum number of results to return
     :return: a DRS with the matches and an OP.ENTITY_LOOKUP operation
     """
     matches = store_client.search_keywords(kw, KWType.KW_ENTITIES,
                                            max_results)
     # the matches may be lazy, so they are materialised into a list
     return DRS(list(matches), Operation(OP.ENTITY_LOOKUP, params=[kw]))
コード例 #17
0
    def paths(self,
              drs_a: DRS,
              drs_b: DRS,
              relation=Relation.PKFK,
              max_hops=2,
              lean_search=False) -> DRS:
        """
        Search for paths between every element of a and every element of b.

        Both inputs are first normalised with _general_to_drs and must be
        in the same mode. Field mode uses find_path_hit, any other mode uses
        find_path_table.
        :param drs_a: start points, anything accepted by _general_to_drs
        :param drs_b: end points, anything accepted by _general_to_drs
        :param relation: the relation the paths are constrained to
        :param max_hops: forwarded to the network path search
        :param lean_search: forwarded to find_path_table only
        :return: a DRS with the provenance of both inputs and all path results
        """
        drs_a = self._general_to_drs(drs_a)
        drs_b = self._general_to_drs(drs_b)
        self._assert_same_mode(drs_a, drs_b)

        # the output inherits the provenance of a, and of b when it differs
        o_drs = DRS([], Operation(OP.NONE))
        o_drs.absorb_provenance(drs_a)
        if drs_b != drs_a:
            o_drs.absorb_provenance(drs_b)

        for start, end in itertools.product(drs_a, drs_b):
            # table and field mode use different network operations
            if drs_a.mode == DRSMode.FIELDS:
                found = self._network.find_path_hit(
                    start, end, relation, max_hops=max_hops)
            else:
                found = self._network.find_path_table(
                    start, end, relation, self,
                    max_hops=max_hops, lean_search=lean_search)
            o_drs = o_drs.absorb(found)

        return o_drs
コード例 #18
0
 def traverse(self, a: DRS, primitives, max_hops) -> DRS:
     """
     Expand the input through neighbors_id for max_hops rounds.
     :param a: the starting DRS; TABLE mode is not supported
     :param primitives: forwarded to neighbors_id as the relation
     :param max_hops: number of expansion rounds
     :return: a DRS with the provenance of a and the union of all neighbors
         found; for TABLE mode an error is printed and [] is returned
     """
     o_drs = DRS([], Operation(OP.NONE))
     if a.mode == DRSMode.TABLE:
         print("ERROR: input mode TABLE not supported")
         # NOTE(review): this is a plain list although the signature says DRS
         return []

     frontier = list(a)
     o_drs.absorb_provenance(a)
     while max_hops > 0:
         max_hops -= 1
         for node in frontier:
             o_drs = self.union(
                 o_drs, self.__network.neighbors_id(node, primitives))
         # the next round starts from everything collected so far
         frontier = list(o_drs)
     return o_drs
コード例 #19
0
ファイル: fieldnetwork.py プロジェクト: delftdata/valentine
 def md_neighbors_id(self, hit: Hit, md_neighbors: MRS, relation: Relation) -> DRS:
     """
     Turn metadata neighbors of a node into a DRS of Hits.
     :param hit: the node the neighbors belong to, a Hit or its nid as a string
     :param md_neighbors: iterable of records exposing `source` and `target`
     :param relation: relation used to choose the operation of the result
     :return: a DRS of neighbor Hits whose operation is parameterised with
         the input hit
     :raises TypeError: when hit is neither a Hit nor a string
     """
     if isinstance(hit, Hit):
         nid = str(hit.nid)
     elif isinstance(hit, str):
         nid = hit
     else:
         # previously this fell through to an unbound local variable
         raise TypeError('hit must be a Hit or a str nid')

     data = []
     score = 1.0  # TODO: return more meaningful score results
     # The loop variable must not be called `hit`: that would overwrite the
     # parameter and record the last neighbor as the provenance parameter.
     for md_hit in md_neighbors:
         # pick whichever end of the record is not the input node
         k = md_hit.target if md_hit.target != nid else md_hit.source
         (db_name, source_name, field_name, data_type) = self.__id_names[k]
         data.append(Hit(k, db_name, source_name, field_name, score))
     op = self.get_op_from_relation(relation)
     o_drs = DRS(data, Operation(op, params=[hit]))
     return o_drs
コード例 #20
0
ファイル: fieldnetwork.py プロジェクト: delftdata/valentine
 def neighbors_id(self, hit: Hit, relation: Relation) -> DRS:
     """
     Return the graph neighbors of a node that are linked by `relation`.
     :param hit: the node to start from, a Hit or its nid as a string
     :param relation: only edges carrying this relation are followed
     :return: a DRS of neighbor Hits, each scored with the edge's 'score',
         whose operation is parameterised with the input hit
     :raises TypeError: when hit is neither a Hit nor a string
     """
     if isinstance(hit, Hit):
         nid = str(hit.nid)
     elif isinstance(hit, str):
         nid = hit
     else:
         # previously this fell through to an unbound local variable
         raise TypeError('hit must be a Hit or a str nid')

     data = []
     neighbours = self.__G[nid]
     for k, v in neighbours.items():
         if relation in v:
             score = v[relation]['score']
             (db_name, source_name, field_name, data_type) = self.__id_names[k]
             data.append(Hit(k, db_name, source_name, field_name, score))
     op = self.get_op_from_relation(relation)
     o_drs = DRS(data, Operation(op, params=[hit]))
     return o_drs
コード例 #21
0
 def similar_content_to(self, i_drs: DRS) -> DRS:
     """
     Collect the Relation.CONTENT_SIM neighbors of every hit of the input.
     :param i_drs: the input DRS; when it is in TABLE mode it is switched to
         fields mode and extended with the fields of each of its hits
     :return: a DRS with the input's provenance and the neighbor hits
     """
     result = DRS([], Operation(OP.NONE)).absorb_provenance(i_drs)

     if i_drs.mode == DRSMode.TABLE:
         i_drs.set_fields_mode()
         for table_hit in i_drs:
             i_drs = i_drs.absorb(self.drs_from_table_hit(table_hit))

     for field_hit in i_drs:
         similar = self.__network.neighbors_id(field_hit,
                                               Relation.CONTENT_SIM)
         result = result.absorb(similar)
     return result
コード例 #22
0
ファイル: fieldnetwork.py プロジェクト: delftdata/valentine
        def get_table_neighbors(hit, relation, paths):
            """
            List (field, neighbor) pairs for the neighboring tables of hit.

            Neighbors in the same table as hit are dropped; every remaining
            neighbor n that check_membership does not find in paths is expanded
            to the hits of its table, each paired with n.
            :param hit: the Hit whose neighbors are explored
            :param relation: the relation that neighbors must satisfy
            :param paths: passed to check_membership to skip known neighbors
            :return: a list of (x, n) tuples
            """
            neighbors = self.neighbors_id(hit, relation)

            # keep only neighbors from other tables and rebuild the provenance
            other_table_hits = [
                cand for cand in neighbors
                if cand.source_name != hit.source_name
            ]
            op = self.get_op_from_relation(relation)
            neighbors = DRS(other_table_hits, Operation(op, params=[hit]))

            results = []
            # FIXME: filter out already seen nodes here
            for n in neighbors:
                if check_membership(n, paths):
                    continue
                if lean_search:
                    table_fields = api._drs_from_table_hit_lean_no_provenance(n)
                else:
                    table_fields = api.drs_from_table_hit(n)  # Brought old API
                # the neighbor n is stored as the sibling of each field x
                results.extend((x, n) for x in table_fields)
            return results
コード例 #23
0
    def search(self, kw: str, kw_type: KWType, max_results=10) -> DRS:
        """
        Look up a keyword through the store client's search_keywords.

        :param kw: the keyword to search
        :param kw_type: the context type on which to search, forwarded as
            elasticfieldname
        :param max_results: maximum number of results to return
        :return: a DRS with the matches and an OP.KW_LOOKUP operation
        """
        matches = self._store_client.search_keywords(
            keywords=kw, elasticfieldname=kw_type, max_hits=max_results)

        # the matches may be lazy, so they are materialised into a list
        return DRS(list(matches), Operation(OP.KW_LOOKUP, params=[kw]))
コード例 #24
0
 def pkfk_of(self, i_drs: DRS) -> DRS:
     """
     Collect the Relation.PKFK neighbors of every hit of the input.
     :param i_drs: the input DRS; when it is in TABLE mode it is switched to
         fields mode and extended with the fields of each of its hits
     :return: a DRS with the input's provenance and the neighbor hits
     """
     # the output inherits the provenance of the input up front
     result = DRS([], Operation(OP.NONE)).absorb_provenance(i_drs)

     if i_drs.mode == DRSMode.TABLE:
         i_drs.set_fields_mode()
         for table_hit in i_drs:
             i_drs = i_drs.absorb(self.drs_from_table_hit(table_hit))

     for field_hit in i_drs:
         linked = self.__network.neighbors_id(field_hit, Relation.PKFK)
         result = result.absorb(linked)
     return result
コード例 #25
0
    def __traverse(self, a: DRS, primitive, max_hops=2) -> DRS:
        """
        Expand an initial DRS through neighbors_id for max_hops rounds.
        :param a: anything accepted by _general_to_drs
        :param primitive: forwarded to neighbors_id as the relation
        :param max_hops: maximum number of rounds on the graph
        :return: a DRS with the provenance of a and the union of all neighbors
        :raises ValueError: when the converted input is in TABLE mode
        """
        a = self._general_to_drs(a)
        o_drs = DRS([], Operation(OP.NONE))

        if a.mode == DRSMode.TABLE:
            raise ValueError('input mode DRSMode.TABLE not supported')

        o_drs.absorb_provenance(a)
        frontier = a
        hops_left = max_hops
        while hops_left > 0:
            hops_left -= 1
            for node in frontier:
                o_drs = self.union(
                    o_drs, self._network.neighbors_id(node, primitive))
            # the next round starts from everything collected so far
            frontier = o_drs
        return o_drs
コード例 #26
0
 def paths(self, a: DRS, primitives) -> DRS:
     """
     Search for paths between every pair of elements of a.

     Field mode uses find_path_hit and skips pairs of equal hits; table mode
     uses find_path_table on every pair. Any other mode yields no paths.
     :param a: the DRS whose elements are paired with each other
     :param primitives: the relation the paths are constrained to
     :return: a DRS with the provenance of a and all path results
     """
     o_drs = DRS([], Operation(OP.NONE)).absorb_provenance(a)

     if a.mode == DRSMode.FIELDS:
         table_level = False
     elif a.mode == DRSMode.TABLE:
         table_level = True
     else:
         return o_drs

     for start in a:
         for end in a:
             if table_level:
                 found = self.__network.find_path_table(
                     start, end, primitives, self)
             else:
                 if start == end:
                     continue
                 found = self.__network.find_path_hit(start, end, primitives)
             o_drs = o_drs.absorb(found)
     return o_drs
コード例 #27
0
 def paths_between(self, a: DRS, b: DRS, primitives, max_hops=2) -> DRS:
     """
     Search for paths from every element of a to every element of b.

     Field mode uses find_path_hit, table mode uses find_path_table. Any
     other mode yields no paths.
     :param a: DRS with the start elements
     :param b: DRS with the end elements; must be in the same mode as a
     :param primitives: the relation the paths are constrained to
     :param max_hops: forwarded to the network path search
     :return: a DRS with the provenance of a and b and the path results
         gathered before the first pair of equal elements, if any
     """
     assert (a.mode == b.mode)
     o_drs = DRS([], Operation(OP.NONE))
     o_drs.absorb_provenance(a)
     o_drs.absorb_provenance(b)

     if a.mode == DRSMode.FIELDS:
         table_level = False
     elif a.mode == DRSMode.TABLE:
         table_level = True
     else:
         return o_drs

     for start in a:
         for end in b:
             if start == end:
                 # NOTE(review): the whole search stops at the first pair
                 # with the same source and target; confirm this is intended
                 return o_drs
             if table_level:
                 found = self.__network.find_path_table(
                     start, end, primitives, self, max_hops=max_hops)
             else:
                 found = self.__network.find_path_hit(
                     start, end, primitives, max_hops=max_hops)
             o_drs = o_drs.absorb(found)
     return o_drs
コード例 #28
0
 def drs_from_hit(self, hit: Hit) -> DRS:
     """
     Wrap a single Hit in a DRS.
     :param hit: the Hit to wrap
     :return: a DRS created with an OP.ORIGIN operation
     """
     return DRS([hit], Operation(OP.ORIGIN))
コード例 #29
0
 def drs_from_table_hit(self, hit: Hit) -> DRS:
     """
     Build a DRS with every hit of the table that `hit` belongs to.
     :param hit: Hit whose source_name selects the table
     :return: a DRS whose OP.TABLE operation is parameterised with hit
     """
     table_hits = self.__network.get_hits_from_table(hit.source_name)
     return DRS(list(table_hits), Operation(OP.TABLE, params=[hit]))
コード例 #30
0
 def drs_from_hits(self, hits: [Hit]) -> DRS:
     """
     Wrap a collection of Hits in a DRS.
     :param hits: the Hits, passed to the DRS as given
     :return: a DRS created with an OP.ORIGIN operation
     """
     return DRS(hits, Operation(OP.ORIGIN))