Ejemplo n.º 1
0
    def _block_candset_part(self, candset, ltable, rtable, fk_ltable,
                            fk_rtable, l_key, r_key, l_block_attr,
                            r_block_attr):
        # 1. create dummy column names to contain the values pulled from ltable and rtable
        # based on the fk's

        if isinstance(candset, pd.DataFrame) and len(candset):

            l_proj_attrs = (get_lattrs_to_project)(l_key, l_block_attr)
            r_proj_attrs = (get_rattrs_to_project)(r_key, r_block_attr)

            ltable = (lproj_df)(ltable, l_proj_attrs)
            rtable = (rproj_df)(rtable, r_proj_attrs)

            l_prefix, r_prefix = '__blk_a_', '__blk_b_'

            # add attrs
            cdf = add_attributes(candset, ltable, rtable, fk_ltable, fk_rtable,
                                 l_key, r_key, [l_block_attr], [r_block_attr],
                                 l_prefix, r_prefix)
            l_chk, r_chk = l_prefix + l_block_attr, r_prefix + r_block_attr

            res = candset[cdf[l_chk] == cdf[r_chk]]

            if not isinstance(res, pd.DataFrame):
                print('Returning {0}'.format(res))

            return res
        else:
            return candset
Ejemplo n.º 2
0
    def _block_table_part(self, ltable, rtable, l_key, r_key, l_block_attr,
                          r_block_attr,
                          tokenizer, threshold, rem_stop_words, l_output_attrs,
                          r_output_attrs, l_output_prefix, r_output_prefix):
        """Build a candidate set by probing right-table tokens against an
        inverted index built from the left table.

        Both tables are projected, rows whose blocking attribute is null are
        dropped, and the remaining blocking values are preprocessed and
        tokenized. The pair ids produced by ``self._probe`` become a
        two-column DataFrame (prefixed key names) that is then passed through
        ``add_attributes`` with the requested output attributes.

        Returns:
            Whatever ``add_attributes`` returns for the probed pairs.
        """
        left_cols = get_lattrs_to_project(l_key, l_block_attr, l_output_attrs)
        right_cols = get_rattrs_to_project(r_key, r_block_attr, r_output_attrs)

        left_proj = lproj_df(ltable, left_cols)
        right_proj = rproj_df(rtable, right_cols)

        # Rows without a blocking value cannot take part in the overlap test.
        left_rows = left_proj[~left_proj[l_block_attr].isnull()]
        right_rows = right_proj[~right_proj[r_block_attr].isnull()]

        # Index side: the left table's tokens feed the inverted index.
        left_tokens = tokenize_strings(
            self._preprocess_table(left_rows, l_key, l_block_attr,
                                   rem_stop_words),
            tokenizer)
        index = build_inv_index([left_tokens])

        # Probe side: the right table's tokens are looked up in the index.
        right_tokens = tokenize_strings(
            self._preprocess_table(right_rows, r_key, r_block_attr,
                                   rem_stop_words),
            tokenizer)
        probed = self._probe(right_tokens, index, threshold)

        fk_left = l_output_prefix + l_key
        fk_right = r_output_prefix + r_key
        pairs = pd.DataFrame(probed.get_pairids(),
                             columns=[fk_left, fk_right])
        result = add_attributes(pairs, left_rows, right_rows, fk_left,
                                fk_right, l_key, r_key, l_output_attrs,
                                r_output_attrs, l_output_prefix,
                                r_output_prefix)

        if not isinstance(result, pd.DataFrame):
            print('Returning {0}'.format(result))
        return result
Ejemplo n.º 3
0
    def _block_table_part(self, ltable, rtable, l_key, r_key, l_block_attr,
                          r_block_attr, l_out_attrs, r_out_attrs, l_prefix,
                          r_prefix):
        """Build a candidate set by equi-joining the tables on the blocking
        attributes.

        The projected tables are merged on ``l_block_attr`` ==
        ``r_block_attr``; only the two key columns of the join are kept and
        renamed to ``l_prefix + l_key`` / ``r_prefix + r_key`` before
        ``add_attributes`` is called with the unprojected tables.

        Returns:
            Whatever ``add_attributes`` returns for the joined key pairs.
        """
        left_cols = get_lattrs_to_project(l_key, l_block_attr, l_out_attrs)
        right_cols = get_rattrs_to_project(r_key, r_block_attr, r_out_attrs)

        left_proj = lproj_df(ltable, left_cols)
        right_proj = rproj_df(rtable, right_cols)

        joined = left_proj.merge(right_proj, left_on=l_block_attr,
                                 right_on=r_block_attr)

        # NOTE(review): the '_x' / '_y' names assume merge suffixed the key
        # columns, which pandas only does for column names present in both
        # frames - confirm that l_key and r_key always share a name here.
        key_pairs = candproj_df(joined, [l_key + '_x', r_key + '_y'])

        # Give the foreign-key columns their prefixed names.
        fk_left = l_prefix + l_key
        fk_right = r_prefix + r_key
        key_pairs = rename_cols(key_pairs, [fk_left, fk_right])

        result = add_attributes(key_pairs, ltable, rtable, fk_left, fk_right,
                                l_key, r_key, l_out_attrs, r_out_attrs,
                                l_prefix, r_prefix)

        if not isinstance(result, pd.DataFrame):
            print('Returning {0}'.format(result))
        return result
Ejemplo n.º 4
0
    def _block_tables_part(self, ltable, rtable, l_key, r_key, l_output_attrs,
                           r_output_attrs, l_output_prefix, r_output_prefix):
        """Block two tables with the configured rules and attach the output
        attributes.

        A filter-based pass (``_block_tables_with_filters``) runs first on the
        unprojected tables. If it yields no candidate set, the unfiltered
        path is used instead; if it does and more than one rule is
        configured, the candidate set is additionally run through
        ``_block_candset_excluding_rule`` with the rule already applied.

        Returns:
            Whatever ``add_attributes`` returns for the final candidate set.
        """
        fk_left = l_output_prefix + l_key
        fk_right = r_output_prefix + r_key

        # The filter pass sees the original (unprojected) tables.
        pairs, used_rule = self._block_tables_with_filters(
            ltable, rtable, l_key, r_key)

        # NOTE: self.ltable_attrs / self.rtable_attrs can be None; the
        # projection helpers are called with them as-is.
        left_cols = get_lattrs_to_project(l_key, self.ltable_attrs,
                                          l_output_attrs)
        right_cols = get_rattrs_to_project(r_key, self.rtable_attrs,
                                           r_output_attrs)
        ltable = lproj_df(ltable, left_cols)
        rtable = rproj_df(rtable, right_cols)

        if pairs is None:
            # No filter-based result: fall back to the unfiltered path.
            pairs = self._block_tables_without_filters(
                ltable, rtable, l_key, r_key, fk_left, fk_right)
        elif len(self.rules) > 1:
            # More than one rule is configured: run the candidate set through
            # the remaining-rules step, passing along the rule already used.
            pairs = self._block_candset_excluding_rule(
                pairs, ltable, rtable, fk_left, fk_right, l_key, r_key,
                used_rule)

        return add_attributes(pairs, ltable, rtable, fk_left, fk_right,
                              l_key, r_key, l_output_attrs, r_output_attrs,
                              l_output_prefix, r_output_prefix)
Ejemplo n.º 5
0
    def _block_tables_part(self, ltable, rtable, l_key, r_key, l_output_attrs,
                           r_output_attrs, l_output_prefix, r_output_prefix):
        """Block two tables by calling ``self.black_box_function`` on every
        left/right row pair.

        Each projected table is indexed by its key and converted to a
        ``{key: row-dict}`` mapping. A pair is kept when the black-box
        function returns a falsy value for it. The surviving key pairs form a
        two-column DataFrame that is passed through ``add_attributes`` with
        the unprojected tables.

        Returns:
            Whatever ``add_attributes`` returns for the surviving pairs.
        """
        left_cols = get_lattrs_to_project(l_key, self.ltable_attrs,
                                          l_output_attrs)
        right_cols = get_rattrs_to_project(r_key, self.rtable_attrs,
                                           r_output_attrs)

        left_proj = lproj_df(ltable, left_cols)
        right_proj = rproj_df(rtable, right_cols)

        # Index by key (keeping the key column) so the dict keys are the ids.
        left_proj.set_index(l_key, inplace=True, drop=False)
        right_proj.set_index(r_key, inplace=True, drop=False)

        left_rows = left_proj.T.to_dict()
        right_rows = right_proj.T.to_dict()

        # Full cross product; a falsy black-box result means "keep the pair".
        surviving = [
            [left_id, right_id]
            for left_id, left_row in left_rows.items()
            for right_id, right_row in right_rows.items()
            if not self.black_box_function(left_row, right_row)
        ]

        fk_left = l_output_prefix + l_key
        fk_right = r_output_prefix + r_key
        pairs = pd.DataFrame(surviving, columns=[fk_left, fk_right])
        return add_attributes(pairs, ltable, rtable, fk_left, fk_right,
                              l_key, r_key, l_output_attrs, r_output_attrs,
                              l_output_prefix, r_output_prefix)
Ejemplo n.º 6
0
def block_candset_part(candset, ltable, rtable, fk_ltable, fk_rtable, l_key,
                       r_key, l_block_attr, r_block_attr):
    """Keep the candidate pairs whose blocking attributes are equal.

    ``add_attributes`` is called with temporary prefixes for the two blocking
    attributes; its result is read back under ``prefix + attr`` and the
    element-wise equality of those two columns is used as a mask on
    ``candset``.

    Returns:
        The rows of ``candset`` selected by the equality mask.
    """
    # Temporary prefixes for the columns holding the blocking values.
    left_tag = '__blk_a_'
    right_tag = '__blk_b_'

    enriched = add_attributes(candset, ltable, rtable, fk_ltable, fk_rtable,
                              l_key, r_key, [l_block_attr], [r_block_attr],
                              left_tag, right_tag)

    left_values = enriched[left_tag + l_block_attr]
    right_values = enriched[right_tag + r_block_attr]
    return candset[left_values == right_values]
Ejemplo n.º 7
0
def block_table_chunks(ldf, rdf, l_key, r_key, l_attr, r_attr, tokenizer,
                       threshold, stopwords, l_out, r_out, l_prefix, r_prefix):
    """Build a candidate set for one pair of table chunks by probing
    right-side tokens against an inverted index of the left side.

    Rows with a null blocking attribute are dropped from both chunks. The
    pair ids returned by ``probe`` become a two-column DataFrame with
    prefixed key names; ``add_attributes`` is applied only when that frame
    has at least one row.

    Returns:
        The pair DataFrame, passed through ``add_attributes`` when non-empty.
    """
    left_rows = ldf[~ldf[l_attr].isnull()]
    right_rows = rdf[~rdf[r_attr].isnull()]

    # Index side.
    left_tokens = tokenize_strings(
        preprocess_table(left_rows, l_attr, l_key, stopwords), tokenizer)
    index = build_inv_index([left_tokens])

    # Probe side.
    right_tokens = tokenize_strings(
        preprocess_table(right_rows, r_attr, r_key, stopwords), tokenizer)
    probed = probe(right_tokens, index, threshold)

    fk_left = l_prefix + l_key
    fk_right = r_prefix + r_key
    pairs = pd.DataFrame(probed.get_pairids(), columns=[fk_left, fk_right])

    # An empty pair frame is returned as-is, without the attribute step.
    if not len(pairs):
        return pairs
    return add_attributes(pairs, left_rows, right_rows, fk_left, fk_right,
                          l_key, r_key, l_out, r_out, l_prefix, r_prefix)
Ejemplo n.º 8
0
    def _block_candset_part(self, candset, ltable, rtable, fk_ltable,
                            fk_rtable, l_key, r_key, l_block_attr,
                            r_block_attr, rem_stop_words, tokenizer,
                            threshold):
        if isinstance(candset, pd.DataFrame) and len(candset):
            l_proj_attrs = (get_lattrs_to_project)(l_key, l_block_attr)
            r_proj_attrs = (get_rattrs_to_project)(r_key, r_block_attr)

            # ltbl = (lproj_df)(ltable, l_proj_attrs)
            # rtbl = (rproj_df)(rtable, r_proj_attrs)

            ltable = (lproj_df)(ltable, l_proj_attrs)
            rtable = (rproj_df)(rtable, r_proj_attrs)

            ltbl = ltable[
                ~ltable[l_block_attr].isnull()]  # this might be redundant
            rtbl = rtable[
                ~rtable[r_block_attr].isnull()]  # this might be redundant
            l_prefix, r_prefix = '__blk_a_', '__blk_b_'

            temp_candset = add_attributes(candset, ltbl, rtbl, fk_ltable,
                                          fk_rtable, l_key, r_key,
                                          [l_block_attr], [r_block_attr],
                                          l_prefix, r_prefix)
            l_chk, r_chk = l_prefix + l_block_attr, r_prefix + r_block_attr

            x = self._process_column(temp_candset[l_chk], rem_stop_words)
            x = x.map(str2bytes).map(tokenizer.tokenize)

            y = self._process_column(temp_candset[r_chk], rem_stop_words)
            y = y.map(str2bytes).map(tokenizer.tokenize)

            overlap_fn = partial(self._compute_overlap, threshold=threshold)
            tmp = pd.DataFrame()
            tmp['x'] = x
            tmp['y'] = y
            # print(tmp)
            valid = tmp.apply(overlap_fn, raw=True, axis=1)
            # print(len(valid))
            valid_candset = candset[valid.values]
            return valid_candset
        else:
            candset
Ejemplo n.º 9
0
def block_table_part(ltable, rtable, l_key, r_key, l_block_attr, r_block_attr,
                     l_out_attrs, r_out_attrs, l_prefix, r_prefix):
    """Build a candidate set by equi-joining the tables on the blocking
    attributes.

    Each table is projected to its key and blocking attribute, the two are
    merged on ``l_block_attr`` == ``r_block_attr``, and only the key columns
    of the join are kept. Those are renamed to ``l_prefix + l_key`` /
    ``r_prefix + r_key`` before ``add_attributes`` is called with the
    unprojected tables.

    Returns:
        Whatever ``add_attributes`` returns for the joined key pairs.
    """
    left_proj = proj_df(ltable, [l_key, l_block_attr])
    right_proj = proj_df(rtable, [r_key, r_block_attr])
    joined = left_proj.merge(right_proj, left_on=l_block_attr,
                             right_on=r_block_attr)

    # NOTE(review): the '_x' / '_y' names assume merge suffixed the key
    # columns, which pandas only does for column names present in both
    # frames - confirm that l_key and r_key always share a name here.
    key_pairs = proj_df(joined, [l_key + '_x', r_key + '_y'])

    # Give the foreign-key columns their prefixed names.
    fk_left = l_prefix + l_key
    fk_right = r_prefix + r_key
    key_pairs = rename_cols(key_pairs, [fk_left, fk_right])

    return add_attributes(key_pairs, ltable, rtable, fk_left, fk_right,
                          l_key, r_key, l_out_attrs, r_out_attrs, l_prefix,
                          r_prefix)
Ejemplo n.º 10
0
def block_candset_chunks(candset, ldf, rdf, fk_ltable, fk_rtable, l_key, r_key,
                         l_attr, r_attr, tokenizer, threshold, stopwords):
    """Keep the candidate pairs whose blocking attributes pass the overlap
    test.

    Rows with a null blocking attribute are dropped from both tables, and
    ``add_attributes`` is called with temporary prefixes for the two
    blocking attributes. The resulting columns are processed, converted with
    ``str2bytes`` and tokenized; ``compute_overlap`` is then applied row-wise
    and its result is used as a positional mask on ``candset``.

    Returns:
        The rows of ``candset`` selected by the overlap mask.
    """
    left_rows = ldf[~ldf[l_attr].isnull()]
    right_rows = rdf[~rdf[r_attr].isnull()]

    # Temporary prefixes for the columns holding the blocking values.
    left_tag = '__blk_a_'
    right_tag = '__blk_b_'
    enriched = add_attributes(candset, left_rows, right_rows, fk_ltable,
                              fk_rtable, l_key, r_key, [l_attr], [r_attr],
                              left_tag, right_tag)

    left_tokens = process_col(enriched[left_tag + l_attr], stopwords)
    left_tokens = left_tokens.map(str2bytes).map(tokenizer.tokenize)

    right_tokens = process_col(enriched[right_tag + r_attr], stopwords)
    right_tokens = right_tokens.map(str2bytes).map(tokenizer.tokenize)

    token_pairs = pd.DataFrame()
    token_pairs['x'] = left_tokens
    token_pairs['y'] = right_tokens

    keep = token_pairs.apply(partial(compute_overlap, threshold=threshold),
                             raw=True, axis=1)
    # Index with the raw array so the mask is applied by position.
    return candset[keep.values]