def block_tables(self, ltable, rtable, l_block_attr, r_block_attr,
                 l_output_attrs=None, r_output_attrs=None,
                 l_output_prefix='ltable_', r_output_prefix='rtable_',
                 verbose=True):
    """Build a candidate set by joining two tables on their blocking attributes.

    Rows of ``ltable`` and ``rtable`` are paired when the value of
    ``l_block_attr`` equals the value of ``r_block_attr`` (``pd.merge`` with
    its default join type).

    Args:
        ltable: Left input table.
        rtable: Right input table.
        l_block_attr: Blocking attribute in ``ltable``.
        r_block_attr: Blocking attribute in ``rtable``.
        l_output_attrs: Optional left attributes to carry into the output.
        r_output_attrs: Optional right attributes to carry into the output.
        l_output_prefix: Prefix combined with the left key when registering
            the candidate set; also forwarded to ``output_columns``.
        r_output_prefix: Prefix combined with the right key when registering
            the candidate set; also forwarded to ``output_columns``.
        verbose: Forwarded to the logging / metadata helpers.

    Returns:
        The candidate set, with a key column added and its properties
        registered through ``cg.set_candset_properties``.
    """
    # Argument checks come first so that bad input fails before any work.
    self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr)
    self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs)

    # Metadata: both input tables must have a key registered.
    helper.log_info(logger, 'Required metadata: ltable key, rtable key', verbose)
    l_key, r_key = cg.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose)
    for table, table_key, side in ((ltable, l_key, 'left'),
                                   (rtable, r_key, 'right')):
        cg.validate_metadata_for_table(table, table_key, side, logger, verbose)

    # Rows are passed through rem_nan before joining (intended to drop rows
    # with a missing blocking value; the policy for missing values is still
    # open according to the original note).
    left_rows = self.rem_nan(ltable, l_block_attr)
    right_rows = self.rem_nan(rtable, r_block_attr)

    # Blocking step: equality join on the blocking attributes.
    joined = pd.merge(
        left_rows,
        right_rows,
        left_on=l_block_attr,
        right_on=r_block_attr,
        suffixes=('_ltable', '_rtable'),
    )

    # Keep only the requested columns and give them their final names.
    retain_cols, final_cols = self.output_columns(
        l_key, r_key, list(joined.columns),
        l_output_attrs, r_output_attrs,
        l_output_prefix, r_output_prefix,
    )
    candset = joined[retain_cols]
    candset.columns = final_cols

    # Register the candidate set in the catalog.
    key = helper.get_name_for_key(candset.columns)
    candset = helper.add_key_column(candset, key)
    cg.set_candset_properties(
        candset, key,
        l_output_prefix + l_key, r_output_prefix + r_key,
        ltable, rtable,
    )

    return candset
def block_tables(self, ltable, rtable, l_overlap_attr, r_overlap_attr,
                 rem_stop_words=False, q_val=None, word_level=True,
                 overlap_size=1, l_output_attrs=None, r_output_attrs=None,
                 l_output_prefix='ltable_', r_output_prefix='rtable_',
                 verbose=True, show_progress=True):
    """Build a candidate set from rows whose overlap attributes share tokens.

    The left table's overlap attribute is tokenised (``process_table``) and
    turned into an inverted index. Each right row is tokenised the same way,
    and the left rows returned by ``get_potential_match_indices`` are paired
    with it.

    Args:
        ltable: Left input table.
        rtable: Right input table.
        l_overlap_attr: Overlap attribute in ``ltable``.
        r_overlap_attr: Overlap attribute in ``rtable``.
        rem_stop_words: Forwarded to ``process_table``.
        q_val: Forwarded to ``process_table``; must be left as ``None``
            unless ``word_level`` is switched off.
        word_level: Only checked against ``q_val`` for consistency here.
        overlap_size: Forwarded to ``get_potential_match_indices``
            (presumably the minimum number of shared tokens -- confirm).
        l_output_attrs: Optional left attributes to carry into the output.
        r_output_attrs: Optional right attributes to carry into the output.
        l_output_prefix: Prefix used for left-side output names.
        r_output_prefix: Prefix used for right-side output names.
        verbose: Forwarded to the logging / metadata helpers.
        show_progress: When true, a ``pyprind`` progress bar is advanced
            once per right-table row.

    Returns:
        The candidate set (empty, with the expected columns, when no pair
        qualifies), with a key column added and its properties registered
        through ``cg.set_candset_properties``.

    Raises:
        SyntaxError: If ``word_level`` is true and ``q_val`` is also given.
    """
    # validations
    self.validate_overlap_attrs(ltable, rtable, l_overlap_attr, r_overlap_attr)
    self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs)

    # required metadata; keys from ltable and rtable
    helper.log_info(logger, 'Required metadata: ltable key, rtable key', verbose)
    l_key, r_key = cg.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose)

    # `== True` is kept on purpose so the set of inputs that trigger the
    # error is unchanged; the exception type is kept for existing callers.
    if word_level == True and q_val is not None:  # noqa: E712
        raise SyntaxError('Parameters word_level and q_val cannot be set together; Note that word_level is '
                          'set to True by default, so explicity set word_level=false to use qgram with the '
                          'specified q_val')

    # Pass both tables through rem_nan, then renumber rows 0..n-1 so that
    # positions in the tokenised lists line up with index labels.
    l_df = self.rem_nan(ltable, l_overlap_attr)
    r_df = self.rem_nan(rtable, r_overlap_attr)
    l_df.reset_index(inplace=True, drop=True)
    r_df.reset_index(inplace=True, drop=True)

    # Constant column used as the join key when pairing rows below.
    # TODO: should be a name that cannot clash with an existing column.
    l_df['_dummy_'] = 1
    r_df['_dummy_'] = 1

    # Cast the overlap columns to string if required.
    if l_df.dtypes[l_overlap_attr] != object:
        logger.warning('Left overlap attribute is not of type string; coverting to string temporarily')
        l_df[l_overlap_attr] = l_df[l_overlap_attr].astype(str)
    if r_df.dtypes[r_overlap_attr] != object:
        logger.warning('Right overlap attribute is not of type string; coverting to string temporarily')
        r_df[r_overlap_attr] = r_df[r_overlap_attr].astype(str)

    # Row lookup for the right table, keyed by the (reset) index label.
    r_dict = dict(r_df.iterrows())

    # Inverted index over the left table's tokens.
    l_colvalues_chopped = self.process_table(l_df, l_overlap_attr, q_val, rem_stop_words)
    appended_l_colidx_values = [
        self.append_index_values(tokens, l_idx)
        for l_idx, tokens in enumerate(l_colvalues_chopped)
    ]
    inv_idx = {}
    for indexed_tokens in appended_l_colidx_values:
        for token in indexed_tokens:
            self.compute_inv_index(token, inv_idx)

    r_colvalues_chopped = self.process_table(r_df, r_overlap_attr, q_val, rem_stop_words)

    bar = pyprind.ProgBar(len(r_colvalues_chopped)) if show_progress else None

    df_list = []
    # Fix: the right-row position must advance with the loop. Previously it
    # stayed at 0, so every pair was built from the first right row.
    # Assumes process_table yields one entry per r_df row, in row order.
    for r_idx, col_values in enumerate(r_colvalues_chopped):
        if show_progress:
            bar.update()

        qualifying_ltable_indices = self.get_potential_match_indices(col_values, inv_idx, overlap_size)
        r_row_frame = r_dict[r_idx].to_frame().T
        l_rows = l_df.iloc[qualifying_ltable_indices]
        pairs = l_rows.merge(r_row_frame, on='_dummy_', suffixes=('_ltable', '_rtable'))
        if len(pairs) > 0:
            df_list.append(pairs)

    # Construct the output table
    l_output_attrs = self.process_output_attrs(ltable, l_key, l_output_attrs, 'left')
    r_output_attrs = self.process_output_attrs(rtable, r_key, r_output_attrs, 'right')
    retain_cols = self.get_attrs_to_retain(l_key, r_key, l_output_attrs, r_output_attrs,
                                           l_output_prefix, r_output_prefix)

    # NOTE(review): the merged frames carry '_ltable' / '_rtable' suffixes,
    # while retain_cols is built from the output prefixes -- confirm that
    # get_attrs_to_retain returns names that exist in the merged frame.
    if df_list:
        candset = pd.concat(df_list)[retain_cols]
    else:
        # Fix: pd.concat raises ValueError on an empty list, so the empty
        # result has to be built without calling it.
        candset = pd.DataFrame(columns=retain_cols)

    # Update metadata in the catalog
    key = helper.get_name_for_key(candset.columns)
    candset = helper.add_key_column(candset, key)
    cg.set_candset_properties(candset, key, l_output_prefix + l_key, r_output_prefix + r_key,
                              ltable, rtable)

    return candset
def block_tables(self, ltable, rtable, l_output_attrs=None, r_output_attrs=None,
                 l_output_prefix='ltable_', r_output_prefix='rtable_',
                 verbose=True, show_progress=True):
    """Build a candidate set by applying the black box function to every row pair.

    Every (left row, right row) combination is passed to
    ``self.black_box_function``. A pair is dropped only when the function
    returns exactly ``True``; any other result keeps the pair.

    Args:
        ltable: Left input table.
        rtable: Right input table.
        l_output_attrs: Optional left attributes to carry into the output.
        r_output_attrs: Optional right attributes to carry into the output.
        l_output_prefix: Prefix prepended to left-side output names.
        r_output_prefix: Prefix prepended to right-side output names.
        verbose: Forwarded to the logging / metadata helpers.
        show_progress: When true, a ``pyprind`` progress bar is advanced
            once per row pair.

    Returns:
        The candidate set (empty, with the expected columns, when no pair
        survives), with a key column added and its properties registered
        through ``cg.set_candset_properties``.

    Raises:
        AssertionError: If ``self.black_box_function`` is ``None``.
    """
    # validate the presence of black box function
    if self.black_box_function is None:
        raise AssertionError('Black box function is not set')

    # validate output attributes
    self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs)

    # required metadata: keys for the input tables
    helper.log_info(logger, 'Required metadata: ltable key, rtable key', verbose)
    l_key, r_key = cg.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose)
    cg.validate_metadata_for_table(ltable, l_key, 'left', logger, verbose)
    cg.validate_metadata_for_table(rtable, r_key, 'right', logger, verbose)

    bar = pyprind.ProgBar(len(ltable) * len(rtable)) if show_progress else None

    # Index both tables by key and cache each row so the inner loop does a
    # plain dict lookup.
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)
    l_dict = dict(l_df.iterrows())
    r_dict = dict(r_df.iterrows())

    # Position of the key attribute inside each itertuples() record.
    l_id_pos = list(ltable.columns).index(l_key)
    r_id_pos = list(rtable.columns).index(r_key)

    # Output names of the two id columns are loop-invariant.
    ltable_id = l_output_prefix + l_key
    rtable_id = r_output_prefix + r_key

    # Rows (as ordered dicts) for the pairs that survive blocking.
    block_list = []
    for l_t in ltable.itertuples(index=False):
        l_tuple = l_dict[l_t[l_id_pos]]
        for r_t in rtable.itertuples(index=False):
            if show_progress:
                bar.update()

            r_tuple = r_dict[r_t[r_id_pos]]
            res = self.black_box_function(l_tuple, r_tuple)
            # Only an explicit True blocks the pair; everything else survives.
            if res is True:
                continue

            d = OrderedDict()
            d[ltable_id] = l_tuple[l_key]
            d[rtable_id] = r_tuple[r_key]

            # add left table attributes
            if l_output_attrs:
                l_out = l_tuple[l_output_attrs]
                l_out.index = l_output_prefix + l_out.index
                d.update(l_out)

            # add right table attributes
            if r_output_attrs:
                # Fix: index the row (it was being called like a function,
                # which raised TypeError whenever r_output_attrs was given).
                r_out = r_tuple[r_output_attrs]
                r_out.index = r_output_prefix + r_out.index
                d.update(r_out)

            block_list.append(d)

    # Construct the output table
    candset = pd.DataFrame(block_list)
    l_output_attrs = self.process_output_attrs(ltable, l_key, l_output_attrs, 'left')
    r_output_attrs = self.process_output_attrs(rtable, r_key, r_output_attrs, 'right')
    retain_cols = self.get_attrs_to_retain(l_key, r_key, l_output_attrs, r_output_attrs,
                                           l_output_prefix, r_output_prefix)
    if len(candset) > 0:
        candset = candset[retain_cols]
    else:
        candset = pd.DataFrame(columns=retain_cols)

    # update metadata in the catalog.
    key = helper.get_name_for_key(candset.columns)
    candset = helper.add_key_column(candset, key)
    cg.set_candset_properties(candset, key, l_output_prefix + l_key, r_output_prefix + r_key,
                              ltable, rtable)

    return candset