def _block_candset_part(self, candset, ltable, rtable, fk_ltable, fk_rtable,
                        l_key, r_key, l_block_attr, r_block_attr):
    """Keep the candidate pairs whose two blocking attributes are equal.

    Args:
        candset: Candidate pairs; expected to be a pandas DataFrame holding
            the foreign-key columns ``fk_ltable`` and ``fk_rtable``.
        ltable, rtable: The left and right tables the pairs refer to.
        fk_ltable, fk_rtable: Names of the foreign-key columns in ``candset``.
        l_key, r_key: Names of the key columns of ``ltable`` / ``rtable``.
        l_block_attr, r_block_attr: Names of the attributes to compare.

    Returns:
        ``candset`` itself when it is not a DataFrame or has no rows;
        otherwise the rows of ``candset`` for which the left and right
        blocking attribute values compare equal.
    """
    # Nothing to filter: hand back exactly what was passed in.
    if not isinstance(candset, pd.DataFrame) or not len(candset):
        return candset

    # Project each table down before pulling values into the candidate set.
    l_proj_attrs = get_lattrs_to_project(l_key, l_block_attr)
    r_proj_attrs = get_rattrs_to_project(r_key, r_block_attr)
    ltable = lproj_df(ltable, l_proj_attrs)
    rtable = rproj_df(rtable, r_proj_attrs)

    # Dummy prefixes for the temporary columns that hold the values pulled
    # from ltable and rtable through the foreign keys.
    l_prefix, r_prefix = '__blk_a_', '__blk_b_'
    cdf = add_attributes(candset, ltable, rtable, fk_ltable, fk_rtable,
                         l_key, r_key, [l_block_attr], [r_block_attr],
                         l_prefix, r_prefix)
    l_chk, r_chk = l_prefix + l_block_attr, r_prefix + r_block_attr

    # Use a positional mask (as the overlap-based candset filter does) so the
    # selection does not depend on cdf carrying the same index labels as
    # candset.  Assumes add_attributes keeps the row order -- TODO confirm.
    keep = (cdf[l_chk] == cdf[r_chk]).values
    return candset[keep]
def _block_tables_part(self, ltable, rtable, l_key, r_key, l_output_attrs,
                       r_output_attrs, l_output_prefix, r_output_prefix):
    """Block two tables by calling ``self.black_box_function`` on every pair.

    Every left row is paired with every right row; a pair is kept when the
    black-box function returns a falsy value for it.

    Args:
        ltable, rtable: The left and right tables.
        l_key, r_key: Names of the key columns of ``ltable`` / ``rtable``.
        l_output_attrs, r_output_attrs: Attributes to attach to the result.
        l_output_prefix, r_output_prefix: Prefixes for the output columns
            (also used to build the foreign-key column names).

    Returns:
        Whatever ``add_attributes`` returns for the surviving pairs.
    """
    left_cols = get_lattrs_to_project(l_key, self.ltable_attrs, l_output_attrs)
    right_cols = get_rattrs_to_project(r_key, self.rtable_attrs, r_output_attrs)
    left_proj = lproj_df(ltable, left_cols)
    right_proj = rproj_df(rtable, right_cols)

    # Index both projections by their key (keeping the key column) so that
    # the dictionaries below are keyed by row id.
    left_proj.set_index(l_key, inplace=True, drop=False)
    right_proj.set_index(r_key, inplace=True, drop=False)
    left_rows = left_proj.T.to_dict()
    right_rows = right_proj.T.to_dict()

    # Cross product of the two tables; a falsy answer means "keep the pair".
    surviving = [
        [left_id, right_id]
        for left_id, left_row in left_rows.items()
        for right_id, right_row in right_rows.items()
        if not self.black_box_function(left_row, right_row)
    ]

    fk_ltable = l_output_prefix + l_key
    fk_rtable = r_output_prefix + r_key
    pairs = pd.DataFrame(surviving, columns=[fk_ltable, fk_rtable])
    return add_attributes(pairs, ltable, rtable, fk_ltable, fk_rtable,
                          l_key, r_key, l_output_attrs, r_output_attrs,
                          l_output_prefix, r_output_prefix)
def _block_table_part(self, ltable, rtable, l_key, r_key, l_block_attr,
                      r_block_attr, l_out_attrs, r_out_attrs, l_prefix,
                      r_prefix):
    """Block two tables with an equi-join on their blocking attributes.

    Args:
        ltable, rtable: The left and right tables.
        l_key, r_key: Names of the key columns of ``ltable`` / ``rtable``.
        l_block_attr, r_block_attr: Names of the attributes to join on.
        l_out_attrs, r_out_attrs: Attributes to attach to the result.
        l_prefix, r_prefix: Prefixes for the output columns (also used to
            build the foreign-key column names).

    Returns:
        Whatever ``add_attributes`` returns for the joined key pairs.
    """
    l_proj_attrs = get_lattrs_to_project(l_key, l_block_attr, l_out_attrs)
    r_proj_attrs = get_rattrs_to_project(r_key, r_block_attr, r_out_attrs)
    ltbl = lproj_df(ltable, l_proj_attrs)
    rtbl = rproj_df(rtable, r_proj_attrs)

    # Join the tables on the blocking attributes.
    res = ltbl.merge(rtbl, left_on=l_block_attr, right_on=r_block_attr)

    # merge() appends '_x' / '_y' only to column names that occur in both
    # inputs.  When the key columns are named differently they come through
    # unsuffixed, so fall back to the bare name instead of raising KeyError.
    lcol = l_key + '_x' if l_key + '_x' in res.columns else l_key
    rcol = r_key + '_y' if r_key + '_y' in res.columns else r_key
    res = candproj_df(res, [lcol, rcol])

    # Rename the fk columns to conform with the given prefixes.
    lcol, rcol = l_prefix + l_key, r_prefix + r_key
    res = rename_cols(res, [lcol, rcol])

    # Add the required output attributes.
    res = add_attributes(res, ltable, rtable, lcol, rcol, l_key, r_key,
                         l_out_attrs, r_out_attrs, l_prefix, r_prefix)

    # Diagnostic kept from the original: report a non-DataFrame result.
    if not isinstance(res, pd.DataFrame):
        print('Returning {0}'.format(res))
    return res
def _block_table_part(self, inv_index, ltable, rtable, l_key, r_key,
                      l_block_attr, r_block_attr, tokenizer, threshold,
                      rem_stop_words, l_output_attrs, r_output_attrs,
                      l_output_prefix, r_output_prefix):
    """Block two tables by probing a prebuilt inverted index.

    The index over the left table is supplied by the caller as ``inv_index``;
    only the right table is preprocessed and tokenized here.

    Args:
        inv_index: Inverted index handed to ``self._probe``.
        ltable, rtable: The left and right tables.
        l_key, r_key: Names of the key columns of ``ltable`` / ``rtable``.
        l_block_attr, r_block_attr: Names of the blocking attributes.
        tokenizer: Tokenizer handed to ``tokenize_strings``.
        threshold: Threshold handed to ``self._probe``.
        rem_stop_words: Flag handed to ``self._preprocess_table``.
        l_output_attrs, r_output_attrs: Attributes to attach to the result.
        l_output_prefix, r_output_prefix: Prefixes for the output columns.

    Returns:
        Whatever ``add_attributes`` returns for the probed pairs.
    """
    left_cols = get_lattrs_to_project(l_key, l_block_attr, l_output_attrs)
    right_cols = get_rattrs_to_project(r_key, r_block_attr, r_output_attrs)
    ltable = lproj_df(ltable, left_cols)
    rtable = rproj_df(rtable, right_cols)

    # Rows with a missing blocking attribute take no part in blocking.
    ltbl = ltable[ltable[l_block_attr].notnull()]
    rtbl = rtable[rtable[r_block_attr].notnull()]

    # Probe the caller-supplied index with the right table's tokens.
    right_strings = self._preprocess_table(rtbl, r_key, r_block_attr,
                                           rem_stop_words)
    right_tokens = tokenize_strings(right_strings, tokenizer)
    probed = self._probe(right_tokens, inv_index, threshold)

    fk_ltable = l_output_prefix + l_key
    fk_rtable = r_output_prefix + r_key
    pairs = pd.DataFrame(probed.get_pairids(), columns=[fk_ltable, fk_rtable])
    pairs = add_attributes(pairs, ltbl, rtbl, fk_ltable, fk_rtable,
                           l_key, r_key, l_output_attrs, r_output_attrs,
                           l_output_prefix, r_output_prefix)

    # Diagnostic: report a non-DataFrame result.
    if not isinstance(pairs, pd.DataFrame):
        print('Returning {0}'.format(pairs))
    return pairs
def _block_tables_part(self, ltable, rtable, l_key, r_key, l_output_attrs,
                       r_output_attrs, l_output_prefix, r_output_prefix):
    """Block two tables using the rules held in ``self.rules``.

    A filter-based pass is tried first.  If it yields no candidate set
    (``None``), the tables are blocked without filters; otherwise, when more
    than one rule is configured, the candidate set is refined with the
    remaining rules.

    Args:
        ltable, rtable: The left and right tables.
        l_key, r_key: Names of the key columns of ``ltable`` / ``rtable``.
        l_output_attrs, r_output_attrs: Attributes to attach to the result.
        l_output_prefix, r_output_prefix: Prefixes for the output columns
            (also used to build the foreign-key column names).

    Returns:
        Whatever ``add_attributes`` returns for the final candidate set.
    """
    fk_ltable = l_output_prefix + l_key
    fk_rtable = r_output_prefix + r_key

    # The filter pass runs on the full (unprojected) tables.
    pairs, rule_applied = self._block_tables_with_filters(
        ltable, rtable, l_key, r_key)

    # NOTE: needs to be modified, as self.ltable_attrs can be None.
    left_cols = get_lattrs_to_project(l_key, self.ltable_attrs,
                                      l_output_attrs)
    right_cols = get_rattrs_to_project(r_key, self.rtable_attrs,
                                       r_output_attrs)
    ltable = lproj_df(ltable, left_cols)
    rtable = rproj_df(rtable, right_cols)

    if pairs is None:
        # The filter pass produced nothing usable: block without filters.
        pairs = self._block_tables_without_filters(
            ltable, rtable, l_key, r_key, fk_ltable, fk_rtable)
    elif len(self.rules) > 1:
        # Apply the rules other than the one the filter pass already used.
        pairs = self._block_candset_excluding_rule(
            pairs, ltable, rtable, fk_ltable, fk_rtable, l_key, r_key,
            rule_applied)

    return add_attributes(pairs, ltable, rtable, fk_ltable, fk_rtable,
                          l_key, r_key, l_output_attrs, r_output_attrs,
                          l_output_prefix, r_output_prefix)
def block_candset_part(candset, ltable, rtable, fk_ltable, fk_rtable, l_key,
                       r_key, l_block_attr, r_block_attr):
    """Keep the candidate pairs whose two blocking attributes are equal.

    Args:
        candset: Candidate pairs holding the foreign-key columns
            ``fk_ltable`` and ``fk_rtable``.
        ltable, rtable: The left and right tables the pairs refer to.
        fk_ltable, fk_rtable: Names of the foreign-key columns in ``candset``.
        l_key, r_key: Names of the key columns of ``ltable`` / ``rtable``.
        l_block_attr, r_block_attr: Names of the attributes to compare.

    Returns:
        ``candset`` itself when it has no rows; otherwise the rows of
        ``candset`` for which the left and right blocking attribute values
        compare equal.
    """
    # An empty candidate set has nothing to filter; skip the attribute lookup.
    if not len(candset):
        return candset

    # Dummy prefixes for the temporary columns that hold the values pulled
    # from ltable and rtable through the foreign keys.
    l_prefix, r_prefix = '__blk_a_', '__blk_b_'
    cdf = add_attributes(candset, ltable, rtable, fk_ltable, fk_rtable,
                         l_key, r_key, [l_block_attr], [r_block_attr],
                         l_prefix, r_prefix)
    l_chk, r_chk = l_prefix + l_block_attr, r_prefix + r_block_attr

    # Use a positional mask (as block_candset_chunks does) so the selection
    # does not depend on cdf carrying the same index labels as candset.
    # Assumes add_attributes keeps the row order -- TODO confirm.
    keep = (cdf[l_chk] == cdf[r_chk]).values
    return candset[keep]
def block_table_chunks(ldf, rdf, l_key, r_key, l_attr, r_attr, tokenizer,
                       threshold, stopwords, l_out, r_out, l_prefix, r_prefix):
    """Block two table chunks by token overlap on a pair of attributes.

    An inverted index is built from the left chunk's tokens and probed with
    the right chunk's tokens.

    Args:
        ldf, rdf: The left and right table chunks.
        l_key, r_key: Names of the key columns of ``ldf`` / ``rdf``.
        l_attr, r_attr: Names of the blocking attributes.
        tokenizer: Tokenizer handed to ``tokenize_strings``.
        threshold: Threshold handed to ``probe``.
        stopwords: Flag/value handed to ``preprocess_table``.
        l_out, r_out: Attributes to attach to the result.
        l_prefix, r_prefix: Prefixes for the output columns.

    Returns:
        A DataFrame of key pairs; when it has rows, the result of
        ``add_attributes`` on it instead.
    """
    # Rows with a missing blocking attribute take no part in blocking.
    left = ldf[~ldf[l_attr].isnull()]
    right = rdf[~rdf[r_attr].isnull()]

    # Index the left side.
    left_tokens = tokenize_strings(
        preprocess_table(left, l_attr, l_key, stopwords), tokenizer)
    index = build_inv_index([left_tokens])

    # Probe the index with the right side.
    right_tokens = tokenize_strings(
        preprocess_table(right, r_attr, r_key, stopwords), tokenizer)
    probed = probe(right_tokens, index, threshold)

    fk_left = l_prefix + l_key
    fk_right = r_prefix + r_key
    pairs = pd.DataFrame(probed.get_pairids(), columns=[fk_left, fk_right])

    # Output attributes are only attached when at least one pair survived.
    if not len(pairs):
        return pairs
    return add_attributes(pairs, left, right, fk_left, fk_right, l_key, r_key,
                          l_out, r_out, l_prefix, r_prefix)
def _block_candset_part(self, candset, ltable, rtable, fk_ltable, fk_rtable,
                        l_key, r_key, l_block_attr, r_block_attr,
                        rem_stop_words, tokenizer, threshold):
    """Keep the candidate pairs whose blocking attributes overlap enough.

    Args:
        candset: Candidate pairs; expected to be a pandas DataFrame holding
            the foreign-key columns ``fk_ltable`` and ``fk_rtable``.
        ltable, rtable: The left and right tables the pairs refer to.
        fk_ltable, fk_rtable: Names of the foreign-key columns in ``candset``.
        l_key, r_key: Names of the key columns of ``ltable`` / ``rtable``.
        l_block_attr, r_block_attr: Names of the attributes to compare.
        rem_stop_words: Flag handed to ``self._process_column``.
        tokenizer: Object whose ``tokenize`` method splits each value.
        threshold: Passed to ``self._compute_overlap`` as ``threshold``.

    Returns:
        ``candset`` itself when it is not a DataFrame or has no rows;
        otherwise the rows of ``candset`` that ``self._compute_overlap``
        marks as valid.
    """
    # Nothing to filter: hand the input back.  (The original fell through
    # here without a ``return`` and therefore yielded None.)
    if not isinstance(candset, pd.DataFrame) or not len(candset):
        return candset

    l_proj_attrs = get_lattrs_to_project(l_key, l_block_attr)
    r_proj_attrs = get_rattrs_to_project(r_key, r_block_attr)
    ltable = lproj_df(ltable, l_proj_attrs)
    rtable = rproj_df(rtable, r_proj_attrs)

    # Drop rows with a missing blocking attribute (this might be redundant).
    ltbl = ltable[~ltable[l_block_attr].isnull()]
    rtbl = rtable[~rtable[r_block_attr].isnull()]

    # Dummy prefixes for the temporary columns that hold the values pulled
    # from the tables through the foreign keys.
    l_prefix, r_prefix = '__blk_a_', '__blk_b_'
    temp_candset = add_attributes(candset, ltbl, rtbl, fk_ltable, fk_rtable,
                                  l_key, r_key, [l_block_attr],
                                  [r_block_attr], l_prefix, r_prefix)
    l_chk, r_chk = l_prefix + l_block_attr, r_prefix + r_block_attr

    # Normalize and tokenize both sides.
    x = self._process_column(temp_candset[l_chk], rem_stop_words)
    x = x.map(str2bytes).map(tokenizer.tokenize)
    y = self._process_column(temp_candset[r_chk], rem_stop_words)
    y = y.map(str2bytes).map(tokenizer.tokenize)

    overlap_fn = partial(self._compute_overlap, threshold=threshold)
    tmp = pd.DataFrame()
    tmp['x'] = x
    tmp['y'] = y
    # raw=True hands each row to the overlap function as a plain array.
    valid = tmp.apply(overlap_fn, raw=True, axis=1)

    # Positional mask over the original candidate set.
    return candset[valid.values]
def block_table_part(ltable, rtable, l_key, r_key, l_block_attr, r_block_attr,
                     l_out_attrs, r_out_attrs, l_prefix, r_prefix):
    """Block two tables with an equi-join on their blocking attributes.

    Args:
        ltable, rtable: The left and right tables.
        l_key, r_key: Names of the key columns of ``ltable`` / ``rtable``.
        l_block_attr, r_block_attr: Names of the attributes to join on.
        l_out_attrs, r_out_attrs: Attributes to attach to the result.
        l_prefix, r_prefix: Prefixes for the output columns (also used to
            build the foreign-key column names).

    Returns:
        Whatever ``add_attributes`` returns for the joined key pairs.
    """
    # Join the tables on the blocking attributes.
    ltbl = proj_df(ltable, [l_key, l_block_attr])
    rtbl = proj_df(rtable, [r_key, r_block_attr])
    res = ltbl.merge(rtbl, left_on=l_block_attr, right_on=r_block_attr)

    # merge() appends '_x' / '_y' only to column names that occur in both
    # inputs.  When the key columns are named differently they come through
    # unsuffixed, so fall back to the bare name instead of raising KeyError.
    lcol = l_key + '_x' if l_key + '_x' in res.columns else l_key
    rcol = r_key + '_y' if r_key + '_y' in res.columns else r_key
    res = proj_df(res, [lcol, rcol])

    # Rename the fk columns to conform with the given prefixes.
    lcol, rcol = l_prefix + l_key, r_prefix + r_key
    res = rename_cols(res, [lcol, rcol])

    # Add the required output attributes and return the result.
    res = add_attributes(res, ltable, rtable, lcol, rcol, l_key, r_key,
                         l_out_attrs, r_out_attrs, l_prefix, r_prefix)
    return res
def block_candset_chunks(candset, ldf, rdf, fk_ltable, fk_rtable, l_key,
                         r_key, l_attr, r_attr, tokenizer, threshold,
                         stopwords):
    """Keep the candidate pairs whose blocking attributes overlap enough.

    Args:
        candset: Candidate pairs holding the foreign-key columns
            ``fk_ltable`` and ``fk_rtable``.
        ldf, rdf: The left and right tables the pairs refer to.
        fk_ltable, fk_rtable: Names of the foreign-key columns in ``candset``.
        l_key, r_key: Names of the key columns of ``ldf`` / ``rdf``.
        l_attr, r_attr: Names of the attributes to compare.
        tokenizer: Object whose ``tokenize`` method splits each value.
        threshold: Passed to ``compute_overlap`` as ``threshold``.
        stopwords: Flag/value handed to ``process_col``.

    Returns:
        ``candset`` itself when it has no rows; otherwise the rows of
        ``candset`` that ``compute_overlap`` marks as valid.
    """
    # An empty candidate set has nothing to filter.  Returning early also
    # keeps an empty frame out of the row-wise apply() below, whose result
    # is used as a positional row mask.
    if not len(candset):
        return candset

    # Rows with a missing blocking attribute take no part in blocking.
    ldf = ldf[~ldf[l_attr].isnull()]
    rdf = rdf[~rdf[r_attr].isnull()]

    # Dummy prefixes for the temporary columns that hold the values pulled
    # from the tables through the foreign keys.
    l_prefix, r_prefix = '__blk_a_', '__blk_b_'
    cdf = add_attributes(candset, ldf, rdf, fk_ltable, fk_rtable, l_key,
                         r_key, [l_attr], [r_attr], l_prefix, r_prefix)
    l_chk, r_chk = l_prefix + l_attr, r_prefix + r_attr

    # Normalize and tokenize both sides.
    x = process_col(cdf[l_chk], stopwords)
    x = x.map(str2bytes).map(tokenizer.tokenize)
    y = process_col(cdf[r_chk], stopwords)
    y = y.map(str2bytes).map(tokenizer.tokenize)

    overlap_fn = partial(compute_overlap, threshold=threshold)
    tmp = pd.DataFrame()
    tmp['x'] = x
    tmp['y'] = y
    # raw=True hands each row to the overlap function as a plain array.
    valid = tmp.apply(overlap_fn, raw=True, axis=1)

    # Positional mask over the original candidate set.
    return candset[valid.values]