def test_validate_metadata_for_table_valid_2(self): import logging logger = logging.getLogger(__name__) A = pd.read_csv(path_a) status = cm._validate_metadata_for_table(A, 'ID', 'table', logger, True) self.assertEqual(status, True)
def block_tables(self, ltable, rtable, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_', r_output_prefix='rtable_', verbose=False, show_progress=True, n_jobs=1): """ Blocks two tables based on the sequence of rules supplied by the user. Finds tuple pairs from left and right tables that survive the sequence of blocking rules. A tuple pair survives the sequence of blocking rules if none of the rules in the sequence returns True for that pair. If any of the rules returns True, then the pair is blocked. Args: ltable (DataFrame): The left input table. rtable (DataFrame): The right input table. l_output_attrs (list): A list of attribute names from the left table to be included in the output candidate set (defaults to None). r_output_attrs (list): A list of attribute names from the right table to be included in the output candidate set (defaults to None). l_output_prefix (string): The prefix to be used for the attribute names coming from the left table in the output candidate set (defaults to 'ltable\_'). r_output_prefix (string): The prefix to be used for the attribute names coming from the right table in the output candidate set (defaults to 'rtable\_'). verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_jobs (int): The number of parallel jobs to be used for computation (defaults to 1). If -1 all CPUs are used. If 0 or 1, no parallel computation is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used (where n_cpus is the total number of CPUs in the machine).Thus, for n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs) is less than 1, then no parallel computation is used (i.e., equivalent to the default). Returns: A candidate set of tuple pairs that survived the sequence of blocking rules (DataFrame). Raises: AssertionError: If `ltable` is not of type pandas DataFrame. AssertionError: If `rtable` is not of type pandas DataFrame. AssertionError: If `l_output_attrs` is not of type of list. AssertionError: If `r_output_attrs` is not of type of list. AssertionError: If the values in `l_output_attrs` is not of type string. AssertionError: If the values in `r_output_attrs` is not of type string. AssertionError: If the input `l_output_prefix` is not of type string. AssertionError: If the input `r_output_prefix` is not of type string. AssertionError: If `verbose` is not of type boolean. AssertionError: If `show_progress` is not of type boolean. AssertionError: If `n_jobs` is not of type int. AssertionError: If `l_out_attrs` are not in the ltable. AssertionError: If `r_out_attrs` are not in the rtable. AssertionError: If there are no rules to apply. Examples: >>> import py_entitymatching as em >>> rb = em.RuleBasedBlocker() >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='id') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='id') >>> block_f = em.get_features_for_blocking(A, B) >>> rule = ['name_name_lev(ltuple, rtuple) > 3'] >>> rb.add_rule(rule, feature_table=block_f) >>> C = rb.block_tables(A, B) """ # validate data types of input parameters self.validate_types_params_tables(ltable, rtable, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, verbose, n_jobs) # validate data type of show_progress self.validate_show_progress(show_progress) # validate input parameters self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs) # get and validate metadata log_info(logger, 'Required metadata: ltable key, rtable key', verbose) # # get metadata l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # validate rules assert len(self.rules.keys()) > 0, 'There are no rules to apply' # do blocking # # set index for convenience l_df = ltable.set_index(l_key, drop=False) r_df = rtable.set_index(r_key, drop=False) # # remove l_key from l_output_attrs and r_key from r_output_attrs l_output_attrs_1 = [] if l_output_attrs: l_output_attrs_1 = [x for x in l_output_attrs if x != l_key] r_output_attrs_1 = [] if r_output_attrs: r_output_attrs_1 = [x for x in r_output_attrs if x != r_key] # # get attributes to project l_proj_attrs, r_proj_attrs = self.get_attrs_to_project( l_key, r_key, l_output_attrs_1, r_output_attrs_1) l_df, r_df = l_df[l_proj_attrs], r_df[r_proj_attrs] candset, rule_applied = self.block_tables_with_filters( l_df, r_df, l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix, verbose, show_progress, n_jobs) if candset is None: # no filterable rule was applied candset = self.block_tables_without_filters( l_df, r_df, l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix, verbose, show_progress, n_jobs) elif len(self.rules) > 1: # one filterable rule was applied but other rules are left # block candset by applying other rules and excluding the applied rule candset = self.block_candset_excluding_rule( candset, l_df, r_df, l_key, r_key, l_output_prefix + l_key, r_output_prefix + r_key, rule_applied, show_progress, n_jobs) retain_cols = self.get_attrs_to_retain(l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix) if len(candset) > 0: candset = candset[retain_cols] else: candset = pd.DataFrame(columns=retain_cols) # update catalog key = get_name_for_key(candset.columns) candset = add_key_column(candset, key) cm.set_candset_properties(candset, key, l_output_prefix + l_key, r_output_prefix + r_key, ltable, rtable) # return candidate set return candset
def block_tables(self, ltable, rtable, l_block_attr, r_block_attr, window_size=2, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_', r_output_prefix='rtable_', allow_missing=False, verbose=False, n_jobs=1): """ WARNING: THIS IS AN EXPERIMENTAL COMMAND. THIS COMMAND IS NOT TESTED. USE AT YOUR OWN RISK. Blocks two tables based on sorted neighborhood. Finds tuple pairs from left and right tables such that when each table is sorted based upon a blocking attribute, tuple pairs are within a distance w of each other. The blocking attribute is created prior to calling this function. Args: ltable (DataFrame): The left input table. rtable (DataFrame): The right input table. l_block_attr (string): The blocking attribute for left table. r_block_attr (string): The blocking attribute for right table. window_size (int): size of sliding window. Defaults to 2 l_output_attrs (list): A list of attribute names from the left table to be included in the output candidate set (defaults to None). r_output_attrs (list): A list of attribute names from the right table to be included in the output candidate set (defaults to None). l_output_prefix (string): The prefix to be used for the attribute names coming from the left table in the output candidate set (defaults to 'ltable\_'). r_output_prefix (string): The prefix to be used for the attribute names coming from the right table in the output candidate set (defaults to 'rtable\_'). allow_missing (boolean): A flag to indicate whether tuple pairs with missing value in at least one of the blocking attributes should be included in the output candidate set (defaults to False). If this flag is set to True, a tuple in ltable with missing value in the blocking attribute will be matched with every tuple in rtable and vice versa. verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). n_jobs (int): The number of parallel jobs to be used for computation (defaults to 1). If -1 all CPUs are used. If 0 or 1, no parallel computation is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used (where n_cpus is the total number of CPUs in the machine). Thus, for n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs) is less than 1, then no parallel computation is used (i.e., equivalent to the default). Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `ltable` is not of type pandas DataFrame. AssertionError: If `rtable` is not of type pandas DataFrame. AssertionError: If `l_block_attr` is not of type string. AssertionError: If `r_block_attr` is not of type string. AssertionError: If `window_size` is not of type of int or if window_size < 2. AssertionError: If the values in `l_output_attrs` is not of type string. AssertionError: If the values in `r_output_attrs` is not of type string. AssertionError: If `l_output_prefix` is not of type string. AssertionError: If `r_output_prefix` is not of type string. AssertionError: If `verbose` is not of type boolean. AssertionError: If `allow_missing` is not of type boolean. AssertionError: If `n_jobs` is not of type int. AssertionError: If `l_block_attr` is not in the ltable columns. AssertionError: If `r_block_attr` is not in the rtable columns. AssertionError: If `l_out_attrs` are not in the ltable. AssertionError: If `r_out_attrs` are not in the rtable. """ # Warning that this code is still in alpha stage # display warning message print( "WARNING: THIS IS AN EXPERIMENTAL COMMAND. THIS COMMAND IS NOT TESTED. USE AT YOUR OWN RISK." ) # validate data types of input parameters self.validate_types_params_tables(ltable, rtable, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, verbose, n_jobs) # validate data types of input blocking attributes self.validate_types_block_attrs(l_block_attr, r_block_attr) # validate data type of allow_missing self.validate_allow_missing(allow_missing) # validate input parameters self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr) self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs) # get and validate required metadata log_info(logger, 'Required metadata: ltable key, rtable key', verbose) # check if ltable or rtable are empty. if ltable.empty: raise AssertionError('Left table is empty') if rtable.empty: raise AssertionError('Right table is empty') # check if window_size < 2 if window_size < 2: raise AssertionError('window_size is < 2') # # get metadata l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # do blocking # # determine number of processes to launch parallely n_procs = self.get_num_procs(n_jobs, min(len(ltable), len(rtable))) # handle potential missing values c_missing = pd.DataFrame() if n_procs <= 1: # single process c_splits, c_missing = _sn_block_tables_split( ltable, rtable, l_key, r_key, l_block_attr, r_block_attr, l_output_attrs, r_output_attrs, allow_missing) else: # multiprocessing # Split l and r into n_procs chunks. # each core will get an l and an r, merge them, sort them. l_splits = pd.np.array_split(ltable, n_procs) r_splits = pd.np.array_split(rtable, n_procs) p_answer = Parallel(n_jobs=n_procs)( delayed(_sn_block_tables_split) (l_splits[i], r_splits[i], l_key, r_key, l_block_attr, r_block_attr, l_output_attrs, r_output_attrs, allow_missing) for i in range(n_procs)) c_splits, c_missing = zip(*p_answer) c_splits = list(c_splits) c_missing = pd.concat(c_missing) # make a deque for the sliding window sliding_window = deque() result = [] c_missing = c_missing.to_dict(orient='records') # Use generator function to merge sorted runs. # If single core, generator is trivial (see fn below) for row in _gen_iter_merge(c_splits): row = row._asdict() # if the sliding window is full, remove the largest. The new tuple will be # compared against the (window_size-1) previously seen tuples. # (if at the beginning just compare with whatever we have) if len(sliding_window) >= window_size: sliding_window.popleft() # Now, iterate over the sliding window (plus any tuples missing BKV's, # if that was called for): for window_element in chain(sliding_window, c_missing): ltable = window_element rtable = row # SN blocking is often implemented on a single table. # In this implementation, we are only considering tuples that have # one tuple from the left table and one tuple from the right table. # Thus, only keep candidates that span both tables. # However, the restriction is that matches need to be (left, right) so # if we end up with (right, left) flip it. if ltable["source"] != rtable["source"]: # Span both tables if ltable[ "source"] == 'r': # Left is right, so flip it to make it sane again ltable, rtable = rtable, ltable merged = OrderedDict() merged[l_output_prefix + "ID"] = ltable[l_key] merged[r_output_prefix + "ID"] = rtable[r_key] merged[l_output_prefix + l_key] = ltable[l_key] merged[r_output_prefix + r_key] = rtable[r_key] # # add l/r output attributes to the ordered dictionary if l_output_attrs is not None: for attr in l_output_attrs: merged[l_output_prefix + attr] = ltable[attr] if r_output_attrs is not None: for attr in r_output_attrs: merged[r_output_prefix + attr] = rtable[attr] # # add the ordered dict to the list result.append(merged) sliding_window.append(row) candset = pd.DataFrame(result, columns=result[0].keys()) # update catalog key = get_name_for_key(candset.columns) candset = add_key_column(candset, key) cm.set_candset_properties(candset, key, l_output_prefix + l_key, r_output_prefix + r_key, ltable, rtable) return candset
def block_tables(self, ltable, rtable, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_', r_output_prefix='rtable_', verbose=False, show_progress=True, n_ltable_chunks=1, n_rtable_chunks=1): """ WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK. Blocks two tables based on a black box blocking function specified by the user. Finds tuple pairs from left and right tables that survive the black box function. A tuple pair survives the black box blocking function if the function returns False for that pair, otherwise the tuple pair is dropped. Args: ltable (DataFrame): The left input table. rtable (DataFrame): The right input table. l_output_attrs (list): A list of attribute names from the left table to be included in the output candidate set (defaults to None). r_output_attrs (list): A list of attribute names from the right table to be included in the output candidate set (defaults to None). l_output_prefix (string): The prefix to be used for the attribute names coming from the left table in the output candidate set (defaults to 'ltable\_'). r_output_prefix (string): The prefix to be used for the attribute names coming from the right table in the output candidate set (defaults to 'rtable\_'). verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_ltable_chunks (int): The number of partitions to split the left table ( defaults to 1). If it is set to -1, then the number of partitions is set to the number of cores in the machine. n_rtable_chunks (int): The number of partitions to split the right table ( defaults to 1). If it is set to -1, then the number of partitions is set to the number of cores in the machine. Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `ltable` is not of type pandas DataFrame. AssertionError: If `rtable` is not of type pandas DataFrame. AssertionError: If `l_output_attrs` is not of type of list. AssertionError: If `r_output_attrs` is not of type of list. AssertionError: If values in `l_output_attrs` is not of type string. AssertionError: If values in `r_output_attrs` is not of type string. AssertionError: If `l_output_prefix` is not of type string. AssertionError: If `r_output_prefix` is not of type string. AssertionError: If `verbose` is not of type boolean. AssertionError: If `show_progress` is not of type boolean. AssertionError: If `n_ltable_chunks` is not of type int. AssertionError: If `n_rtable_chunks` is not of type int. AssertionError: If `l_out_attrs` are not in the ltable. AssertionError: If `r_out_attrs` are not in the rtable. Examples: >>> def match_last_name(ltuple, rtuple): # assume that there is a 'name' attribute in the input tables # and each value in it has two words l_last_name = ltuple['name'].split()[1] r_last_name = rtuple['name'].split()[1] if l_last_name != r_last_name: return True else: return False >>> import py_entitymatching as em >>> from py_entitymatching.dask.dask_black_box_blocker DaskBlackBoxBlocker >>> bb = DaskBlackBoxBlocker() >>> bb.set_black_box_function(match_last_name) >>> C = bb.block_tables(A, B, l_output_attrs=['name'], r_output_attrs=['name'] ) """ logger.warning( "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.") # validate data types of standard input parameters self.validate_types_params_tables(ltable, rtable, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, verbose, 1) # validate data type of show_progress self.validate_show_progress(show_progress) # validate black box function assert self.black_box_function != None, 'Black box function is not set' # validate output attributes self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs) # get and validate metadata log_info(logger, 'Required metadata: ltable key, rtable key', verbose) # # get metadata l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # validate number of ltable and rtable chunks validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks') validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks') validate_chunks(n_ltable_chunks) validate_chunks(n_rtable_chunks) # # determine the number of chunks n_ltable_chunks = get_num_partitions(n_ltable_chunks, len(ltable)) n_rtable_chunks = get_num_partitions(n_rtable_chunks, len(rtable)) # do blocking # # set index for convenience l_df = ltable.set_index(l_key, drop=False) r_df = rtable.set_index(r_key, drop=False) # # remove l_key from l_output_attrs and r_key from r_output_attrs l_output_attrs_1 = [] if l_output_attrs: l_output_attrs_1 = [x for x in l_output_attrs if x != l_key] r_output_attrs_1 = [] if r_output_attrs: r_output_attrs_1 = [x for x in r_output_attrs if x != r_key] # # pickle the black-box function before passing it as an arg to # # _block_tables_split to be executed by each child process black_box_function_pkl = cp.dumps(self.black_box_function) if n_ltable_chunks == 1 and n_rtable_chunks == 1: # single process candset = _block_tables_split(l_df, r_df, l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix, black_box_function_pkl, show_progress) else: # multiprocessing l_splits = pd.np.array_split(l_df, n_ltable_chunks) r_splits = pd.np.array_split(r_df, n_rtable_chunks) c_splits = [] for i in range(len(l_splits)): for j in range(len(r_splits)): partial_result = delayed(_block_tables_split)(l_splits[i], r_splits[j], l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix, black_box_function_pkl, False) c_splits.append(partial_result) c_splits = delayed(wrap)(c_splits) if show_progress: with ProgressBar(): c_splits = c_splits.compute(scheduler="processes", num_workers=get_num_cores()) else: c_splits = c_splits.compute(scheduler="processes", num_workers=get_num_cores()) candset = pd.concat(c_splits, ignore_index=True) # # determine the attributes to retain in the output candidate set retain_cols = self.get_attrs_to_retain(l_key, r_key, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix) if len(candset) > 0: candset = candset[retain_cols] else: candset = pd.DataFrame(columns=retain_cols) # update catalog key = get_name_for_key(candset.columns) candset = add_key_column(candset, key) cm.set_candset_properties(candset, key, l_output_prefix + l_key, r_output_prefix + r_key, ltable, rtable) # return candidate set return candset
def block_tables(self, ltable, rtable, l_block_attr, r_block_attr, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_', r_output_prefix='rtable_', allow_missing=False, verbose=False, n_jobs=1): """Blocks two tables based on attribute equivalence. Finds tuple pairs from left and right tables such that the value of attribute l_block_attr of a tuple from the left table exactly matches the value of attribute r_block_attr of a tuple from the right table. This is similar to equi-join of two tables. Args: ltable (DataFrame): The left input table. rtable (DataFrame): The right input table. l_block_attr (string): The blocking attribute in left table. r_block_attr (string): The blocking attribute in right table. l_output_attrs (list): A list of attribute names from the left table to be included in the output candidate set (defaults to None). r_output_attrs (list): A list of attribute names from the right table to be included in the output candidate set (defaults to None). l_output_prefix (string): The prefix to be used for the attribute names coming from the left table in the output candidate set (defaults to 'ltable\_'). r_output_prefix (string): The prefix to be used for the attribute names coming from the right table in the output candidate set (defaults to 'rtable\_'). allow_missing (boolean): A flag to indicate whether tuple pairs with missing value in at least one of the blocking attributes should be included in the output candidate set (defaults to False). If this flag is set to True, a tuple in ltable with missing value in the blocking attribute will be matched with every tuple in rtable and vice versa. verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). n_jobs (int): The number of parallel jobs to be used for computation (defaults to 1). If -1 all CPUs are used. If 0 or 1, no parallel computation is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used (where n_cpus is the total number of CPUs in the machine). Thus, for n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs) is less than 1, then no parallel computation is used (i.e., equivalent to the default). Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `ltable` is not of type pandas DataFrame. AssertionError: If `rtable` is not of type pandas DataFrame. AssertionError: If `l_block_attr` is not of type string. AssertionError: If `r_block_attr` is not of type string. AssertionError: If `l_output_attrs` is not of type of list. AssertionError: If `r_output_attrs` is not of type of list. AssertionError: If the values in `l_output_attrs` is not of type string. AssertionError: If the values in `r_output_attrs` is not of type string. AssertionError: If `l_output_prefix` is not of type string. AssertionError: If `r_output_prefix` is not of type string. AssertionError: If `verbose` is not of type boolean. AssertionError: If `allow_missing` is not of type boolean. AssertionError: If `n_jobs` is not of type int. AssertionError: If `l_block_attr` is not in the ltable columns. AssertionError: If `r_block_attr` is not in the rtable columns. AssertionError: If `l_out_attrs` are not in the ltable. AssertionError: If `r_out_attrs` are not in the rtable. """ # validate data types of input parameters self.validate_types_params_tables(ltable, rtable, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, verbose, n_jobs) # validate data types of input blocking attributes self.validate_types_block_attrs(l_block_attr, r_block_attr) # validate data type of allow_missing self.validate_allow_missing(allow_missing) # validate input parameters self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr) self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs) # get and validate required metadata log_info(logger, 'Required metadata: ltable key, rtable key', verbose) # # get metadata l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # do blocking # # do projection of required attributes from the tables l_proj_attrs = self.get_attrs_to_project(l_key, l_block_attr, l_output_attrs) ltable_proj = ltable[l_proj_attrs] r_proj_attrs = self.get_attrs_to_project(r_key, r_block_attr, r_output_attrs) rtable_proj = rtable[r_proj_attrs] # # remove records with nans in the blocking attribute l_df = rem_nan(ltable_proj, l_block_attr) r_df = rem_nan(rtable_proj, r_block_attr) # # determine number of processes to launch parallely n_procs = self.get_num_procs(n_jobs, len(l_df) * len(r_df)) if n_procs <= 1: # single process candset = _block_tables_split(l_df, r_df, l_key, r_key, l_block_attr, r_block_attr, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, allow_missing) else: # multiprocessing m, n = self.get_split_params(n_procs, len(l_df), len(r_df)) l_splits = pd.np.array_split(l_df, m) r_splits = pd.np.array_split(r_df, n) c_splits = Parallel(n_jobs=m * n)(delayed(_block_tables_split)( l, r, l_key, r_key, l_block_attr, r_block_attr, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, allow_missing) for l in l_splits for r in r_splits) candset = pd.concat(c_splits, ignore_index=True) # if allow_missing flag is True, then compute # all pairs with missing value in left table, and # all pairs with missing value in right table if allow_missing: missing_pairs = self.get_pairs_with_missing_value( ltable_proj, rtable_proj, l_key, r_key, l_block_attr, r_block_attr, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix) candset = pd.concat([candset, missing_pairs], ignore_index=True) # update catalog key = get_name_for_key(candset.columns) candset = add_key_column(candset, key) cm.set_candset_properties(candset, key, l_output_prefix + l_key, r_output_prefix + r_key, ltable, rtable) # return candidate set return candset
def block_tables(self, ltable, rtable, l_block_attr, r_block_attr, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_', r_output_prefix='rtable_', allow_missing=False, verbose=False, n_ltable_chunks=1, n_rtable_chunks=1): """ WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK Blocks two tables based on attribute equivalence. Conceptually, this will check `l_block_attr=r_block_attr` for each tuple pair from the Cartesian product of tables `ltable` and `rtable`. It outputs a Pandas dataframe object with tuple pairs that satisfy the equality condition. The dataframe will include attributes '_id', key attribute from ltable, key attributes from rtable, followed by lists `l_output_attrs` and `r_output_attrs` if they are specified. Each of these output and key attributes will be prefixed with given `l_output_prefix` and `r_output_prefix`. If `allow_missing` is set to `True` then all tuple pairs with missing value in at least one of the tuples will be included in the output dataframe. Further, this will update the following metadata in the catalog for the output table: (1) key, (2) ltable, (3) rtable, (4) fk_ltable, and (5) fk_rtable. Args: ltable (DataFrame): The left input table. rtable (DataFrame): The right input table. l_block_attr (string): The blocking attribute in left table. r_block_attr (string): The blocking attribute in right table. l_output_attrs (list): A list of attribute names from the left table to be included in the output candidate set (defaults to None). r_output_attrs (list): A list of attribute names from the right table to be included in the output candidate set (defaults to None). l_output_prefix (string): The prefix to be used for the attribute names coming from the left table in the output candidate set (defaults to 'ltable\_'). r_output_prefix (string): The prefix to be used for the attribute names coming from the right table in the output candidate set (defaults to 'rtable\_'). allow_missing (boolean): A flag to indicate whether tuple pairs with missing value in at least one of the blocking attributes should be included in the output candidate set (defaults to False). If this flag is set to True, a tuple in ltable with missing value in the blocking attribute will be matched with every tuple in rtable and vice versa. verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). n_ltable_chunks (int): The number of partitions to split the left table ( defaults to 1). If it is set to -1, then the number of partitions is set to the number of cores in the machine. n_rtable_chunks (int): The number of partitions to split the right table ( defaults to 1). If it is set to -1, then the number of partitions is set to the number of cores in the machine. Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `ltable` is not of type pandas DataFrame. AssertionError: If `rtable` is not of type pandas DataFrame. AssertionError: If `l_block_attr` is not of type string. AssertionError: If `r_block_attr` is not of type string. AssertionError: If `l_output_attrs` is not of type of list. AssertionError: If `r_output_attrs` is not of type of list. AssertionError: If the values in `l_output_attrs` is not of type string. AssertionError: If the values in `r_output_attrs` is not of type string. AssertionError: If `l_output_prefix` is not of type string. AssertionError: If `r_output_prefix` is not of type string. AssertionError: If `verbose` is not of type boolean. AssertionError: If `allow_missing` is not of type boolean. AssertionError: If `n_ltable_chunks` is not of type int. AssertionError: If `n_rtable_chunks` is not of type int. AssertionError: If `l_block_attr` is not in the ltable columns. AssertionError: If `r_block_attr` is not in the rtable columns. AssertionError: If `l_out_attrs` are not in the ltable. AssertionError: If `r_out_attrs` are not in the rtable. Examples: >>> import py_entitymatching as em >>> from py_entitymatching.dask.dask_attr_equiv_blocker import DaskAttrEquivalenceBlocker >>> ab = DaskAttrEquivalenceBlocker() >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID') >>> C1 = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name']) # Include all possible tuple pairs with missing values >>> C2 = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name'], allow_missing=True) """ logger.warning("WARNING THIS BLOCKER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR " "OWN RISK.") # validate data types of input parameters self.validate_types_params_tables(ltable, rtable, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, verbose, 1) # last arg is # set to 1 just to reuse the function from the # old blocker. # validate data types of input blocking attributes self.validate_types_block_attrs(l_block_attr, r_block_attr) # validate data type of allow_missing self.validate_allow_missing(allow_missing) # validate input parameters self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr) self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs) # validate number of ltable and rtable chunks validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks') validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks') validate_chunks(n_ltable_chunks) validate_chunks(n_rtable_chunks) # get and validate required metadata log_info(logger, 'Required metadata: ltable key, rtable key', verbose) # # get metadata l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # do blocking # # do projection of required attributes from the tables l_proj_attrs = self.get_attrs_to_project(l_key, l_block_attr, l_output_attrs) ltable_proj = ltable[l_proj_attrs] r_proj_attrs = self.get_attrs_to_project(r_key, r_block_attr, r_output_attrs) rtable_proj = rtable[r_proj_attrs] # # remove records with nans in the blocking attribute l_df = rem_nan(ltable_proj, l_block_attr) r_df = rem_nan(rtable_proj, r_block_attr) # # determine the number of chunks n_ltable_chunks = get_num_partitions(n_ltable_chunks, len(ltable)) n_rtable_chunks = get_num_partitions(n_rtable_chunks, len(rtable)) if n_ltable_chunks == 1 and n_rtable_chunks == 1: # single process candset = _block_tables_split(l_df, r_df, l_key, r_key, l_block_attr, r_block_attr, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, allow_missing) else: l_splits = np.array_split(l_df, n_ltable_chunks) r_splits = np.array_split(r_df, n_rtable_chunks) c_splits = [] for l in l_splits: for r in r_splits: partial_result = delayed(_block_tables_split)(l, r, l_key, r_key, l_block_attr, r_block_attr, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, allow_missing) c_splits.append(partial_result) c_splits = delayed(wrap)(c_splits) c_splits = c_splits.compute(scheduler="processes", n_jobs=get_num_cores()) candset = pd.concat(c_splits, ignore_index=True) # if allow_missing flag is True, then compute # all pairs with missing value in left table, and # all pairs with missing value in right table if allow_missing: missing_pairs = self.get_pairs_with_missing_value(ltable_proj, rtable_proj, l_key, r_key, l_block_attr, r_block_attr, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix) candset = pd.concat([candset, missing_pairs], ignore_index=True) # update catalog key = get_name_for_key(candset.columns) candset = add_key_column(candset, key) cm.set_candset_properties(candset, key, l_output_prefix + l_key, r_output_prefix + r_key, ltable, rtable) # return candidate set return candset
def test_validate_metadata_for_table_key_notin_catalog(self): A = pd.read_csv(path_a) status = cm._validate_metadata_for_table(A, 'ID1', 'table', None, False)
def test_validate_metadata_for_table_valid_1(self): A = pd.read_csv(path_a) status = cm._validate_metadata_for_table(A, 'ID', 'table', None, False) self.assertEqual(status, True)
def block_tables(self, ltable, rtable, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_', r_output_prefix='rtable_', verbose=False, show_progress=True, n_jobs=1): """ Blocks two tables based on a black box blocking function specified by the user. Finds tuple pairs from left and right tables that survive the black box function. A tuple pair survives the black box blocking function if the function returns False for that pair, otherwise the tuple pair is dropped. Args: ltable (DataFrame): The left input table. rtable (DataFrame): The right input table. l_output_attrs (list): A list of attribute names from the left table to be included in the output candidate set (defaults to None). r_output_attrs (list): A list of attribute names from the right table to be included in the output candidate set (defaults to None). l_output_prefix (string): The prefix to be used for the attribute names coming from the left table in the output candidate set (defaults to 'ltable\_'). r_output_prefix (string): The prefix to be used for the attribute names coming from the right table in the output candidate set (defaults to 'rtable\_'). verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_jobs (int): The number of parallel jobs to be used for computation (defaults to 1). If -1 all CPUs are used. If 0 or 1, no parallel computation is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used (where n_cpus are the total number of CPUs in the machine).Thus, for n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs) is less than 1, then no parallel computation is used (i.e., equivalent to the default). Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `ltable` is not of type pandas DataFrame. AssertionError: If `rtable` is not of type pandas DataFrame. AssertionError: If `l_output_attrs` is not of type of list. AssertionError: If `r_output_attrs` is not of type of list. AssertionError: If values in `l_output_attrs` is not of type string. AssertionError: If values in `r_output_attrs` is not of type string. AssertionError: If `l_output_prefix` is not of type string. AssertionError: If `r_output_prefix` is not of type string. AssertionError: If `verbose` is not of type boolean. AssertionError: If `show_progress` is not of type boolean. AssertionError: If `n_jobs` is not of type int. AssertionError: If `l_out_attrs` are not in the ltable. AssertionError: If `r_out_attrs` are not in the rtable. """ # validate data types of standard input parameters self.validate_types_params_tables(ltable, rtable, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, verbose, n_jobs) # validate data type of show_progress self.validate_show_progress(show_progress) # validate black box function assert self.black_box_function != None, 'Black box function is not set' # validate output attributes self.validate_output_attrs(ltable, rtable, l_output_attrs,r_output_attrs) # get and validate metadata log_info(logger, 'Required metadata: ltable key, rtable key', verbose) # # get metadata l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # do blocking # # set index for convenience l_df = ltable.set_index(l_key, drop=False) r_df = rtable.set_index(r_key, drop=False) # # remove l_key from l_output_attrs and r_key from r_output_attrs l_output_attrs_1 = [] if l_output_attrs: l_output_attrs_1 = [x for x in l_output_attrs if x != l_key] r_output_attrs_1 = [] if r_output_attrs: r_output_attrs_1 = [x for x in r_output_attrs if x != r_key] # # determine the number of processes to launch parallely n_procs = self.get_num_procs(n_jobs, len(l_df) * len(r_df)) # # pickle the black-box function before passing it as an arg to # # _block_tables_split to be executed by each child process black_box_function_pkl = cp.dumps(self.black_box_function) if n_procs <= 1: # single process candset = _block_tables_split(l_df, r_df, l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix, black_box_function_pkl, show_progress) else: # multiprocessing m, n = self.get_split_params(n_procs, len(l_df), len(r_df)) l_splits = pd.np.array_split(l_df, m) r_splits = pd.np.array_split(r_df, n) c_splits = Parallel(n_jobs=m*n)(delayed(_block_tables_split)(l_splits[i], r_splits[j], l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix, black_box_function_pkl, show_progress and i == len(l_splits) - 1 and j == len(r_splits) - 1) for i in range(len(l_splits)) for j in range(len(r_splits))) candset = pd.concat(c_splits, ignore_index=True) # # determine the attributes to retain in the output candidate set retain_cols = self.get_attrs_to_retain(l_key, r_key, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix) if len(candset) > 0: candset = candset[retain_cols] else: candset =pd.DataFrame(columns=retain_cols) # update catalog key = get_name_for_key(candset.columns) candset = add_key_column(candset, key) cm.set_candset_properties(candset, key, l_output_prefix+l_key, r_output_prefix+r_key, ltable, rtable) # return candidate set return candset
def block_tables(self, ltable, rtable, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_', r_output_prefix='rtable_', verbose=False, show_progress=True, n_jobs=1): """ Blocks two tables based on a black box blocking function specified by the user. Finds tuple pairs from left and right tables that survive the black box function. A tuple pair survives the black box blocking function if the function returns False for that pair, otherwise the tuple pair is dropped. Args: ltable (DataFrame): The left input table. rtable (DataFrame): The right input table. l_output_attrs (list): A list of attribute names from the left table to be included in the output candidate set (defaults to None). r_output_attrs (list): A list of attribute names from the right table to be included in the output candidate set (defaults to None). l_output_prefix (string): The prefix to be used for the attribute names coming from the left table in the output candidate set (defaults to 'ltable\_'). r_output_prefix (string): The prefix to be used for the attribute names coming from the right table in the output candidate set (defaults to 'rtable\_'). verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_jobs (int): The number of parallel jobs to be used for computation (defaults to 1). If -1 all CPUs are used. If 0 or 1, no parallel computation is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used (where n_cpus are the total number of CPUs in the machine).Thus, for n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs) is less than 1, then no parallel computation is used (i.e., equivalent to the default). Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `ltable` is not of type pandas DataFrame. AssertionError: If `rtable` is not of type pandas DataFrame. AssertionError: If `l_output_attrs` is not of type of list. AssertionError: If `r_output_attrs` is not of type of list. AssertionError: If values in `l_output_attrs` is not of type string. AssertionError: If values in `r_output_attrs` is not of type string. AssertionError: If `l_output_prefix` is not of type string. AssertionError: If `r_output_prefix` is not of type string. AssertionError: If `verbose` is not of type boolean. AssertionError: If `show_progress` is not of type boolean. AssertionError: If `n_jobs` is not of type int. AssertionError: If `l_out_attrs` are not in the ltable. AssertionError: If `r_out_attrs` are not in the rtable. Examples: >>> def match_last_name(ltuple, rtuple): # assume that there is a 'name' attribute in the input tables # and each value in it has two words l_last_name = ltuple['name'].split()[1] r_last_name = rtuple['name'].split()[1] if l_last_name != r_last_name: return True else: return False >>> import py_entitymatching as em >>> bb = em.BlackBoxBlocker() >>> bb.set_black_box_function(match_last_name) >>> C = bb.block_tables(A, B, l_output_attrs=['name'], r_output_attrs=['name'] ) """ # validate data types of standard input parameters self.validate_types_params_tables(ltable, rtable, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, verbose, n_jobs) # validate data type of show_progress self.validate_show_progress(show_progress) # validate black box function assert self.black_box_function != None, 'Black box function is not set' # validate output attributes self.validate_output_attrs(ltable, rtable, l_output_attrs,r_output_attrs) # get and validate metadata log_info(logger, 'Required metadata: ltable key, rtable key', verbose) # # get metadata l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # do blocking # # set index for convenience l_df = ltable.set_index(l_key, drop=False) r_df = rtable.set_index(r_key, drop=False) # # remove l_key from l_output_attrs and r_key from r_output_attrs l_output_attrs_1 = [] if l_output_attrs: l_output_attrs_1 = [x for x in l_output_attrs if x != l_key] r_output_attrs_1 = [] if r_output_attrs: r_output_attrs_1 = [x for x in r_output_attrs if x != r_key] # # determine the number of processes to launch parallely n_procs = self.get_num_procs(n_jobs, len(l_df) * len(r_df)) # # pickle the black-box function before passing it as an arg to # # _block_tables_split to be executed by each child process black_box_function_pkl = cp.dumps(self.black_box_function) if n_procs <= 1: # single process candset = _block_tables_split(l_df, r_df, l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix, black_box_function_pkl, show_progress) else: # multiprocessing m, n = self.get_split_params(n_procs, len(l_df), len(r_df)) l_splits = pd.np.array_split(l_df, m) r_splits = pd.np.array_split(r_df, n) c_splits = Parallel(n_jobs=m*n)(delayed(_block_tables_split)(l_splits[i], r_splits[j], l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix, black_box_function_pkl, show_progress and i == len(l_splits) - 1 and j == len(r_splits) - 1) for i in range(len(l_splits)) for j in range(len(r_splits))) candset = pd.concat(c_splits, ignore_index=True) # # determine the attributes to retain in the output candidate set retain_cols = self.get_attrs_to_retain(l_key, r_key, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix) if len(candset) > 0: candset = candset[retain_cols] else: candset =pd.DataFrame(columns=retain_cols) # update catalog key = get_name_for_key(candset.columns) candset = add_key_column(candset, key) cm.set_candset_properties(candset, key, l_output_prefix+l_key, r_output_prefix+r_key, ltable, rtable) # return candidate set return candset
def block_tables(self, ltable, rtable, l_overlap_attr, r_overlap_attr, rem_stop_words=False, q_val=None, word_level=True, overlap_size=1, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_', r_output_prefix='rtable_', allow_missing=False, verbose=False, show_progress=True, n_jobs=1): """ Blocks two tables based on the overlap of token sets of attribute values. Finds tuple pairs from left and right tables such that the overlap between (a) the set of tokens obtained by tokenizing the value of attribute l_overlap_attr of a tuple from the left table, and (b) the set of tokens obtained by tokenizing the value of attribute r_overlap_attr of a tuple from the right table, is above a certain threshold. Args: ltable (DataFrame): The left input table. rtable (DataFrame): The right input table. l_overlap_attr (string): The overlap attribute in left table. r_overlap_attr (string): The overlap attribute in right table. rem_stop_words (boolean): A flag to indicate whether stop words (e.g., a, an, the) should be removed from the token sets of the overlap attribute values (defaults to False). q_val (int): The value of q to use if the overlap attributes values are to be tokenized as qgrams (defaults to None). word_level (boolean): A flag to indicate whether the overlap attributes should be tokenized as words (i.e, using whitespace as delimiter) (defaults to True). overlap_size (int): The minimum number of tokens that must overlap (defaults to 1). l_output_attrs (list): A list of attribute names from the left table to be included in the output candidate set (defaults to None). r_output_attrs (list): A list of attribute names from the right table to be included in the output candidate set (defaults to None). l_output_prefix (string): The prefix to be used for the attribute names coming from the left table in the output candidate set (defaults to 'ltable\_'). r_output_prefix (string): The prefix to be used for the attribute names coming from the right table in the output candidate set (defaults to 'rtable\_'). allow_missing (boolean): A flag to indicate whether tuple pairs with missing value in at least one of the blocking attributes should be included in the output candidate set (defaults to False). If this flag is set to True, a tuple in ltable with missing value in the blocking attribute will be matched with every tuple in rtable and vice versa. verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_jobs (int): The number of parallel jobs to be used for computation (defaults to 1). If -1 all CPUs are used. If 0 or 1, no parallel computation is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used (where n_cpus is the total number of CPUs in the machine). Thus, for n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs) is less than 1, then no parallel computation is used (i.e., equivalent to the default). Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `ltable` is not of type pandas DataFrame. AssertionError: If `rtable` is not of type pandas DataFrame. AssertionError: If `l_overlap_attr` is not of type string. AssertionError: If `r_overlap_attr` is not of type string. AssertionError: If `l_output_attrs` is not of type of list. AssertionError: If `r_output_attrs` is not of type of list. AssertionError: If the values in `l_output_attrs` is not of type string. AssertionError: If the values in `r_output_attrs` is not of type string. AssertionError: If `l_output_prefix` is not of type string. AssertionError: If `r_output_prefix` is not of type string. AssertionError: If `q_val` is not of type int. AssertionError: If `word_level` is not of type boolean. AssertionError: If `overlap_size` is not of type int. AssertionError: If `verbose` is not of type boolean. AssertionError: If `allow_missing` is not of type boolean. AssertionError: If `show_progress` is not of type boolean. AssertionError: If `n_jobs` is not of type int. AssertionError: If `l_overlap_attr` is not in the ltable columns. AssertionError: If `r_block_attr` is not in the rtable columns. AssertionError: If `l_output_attrs` are not in the ltable. AssertionError: If `r_output_attrs` are not in the rtable. SyntaxError: If `q_val` is set to a valid value and `word_level` is set to True. SyntaxError: If `q_val` is set to None and `word_level` is set to False. Examples: >>> import py_entitymatching as em >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID') >>> ob = em.OverlapBlocker() # Use word-level tokenizer >>> C1 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], word_level=True, overlap_size=1) # Use q-gram tokenizer >>> C2 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], word_level=False, q_val=2) # Include all possible missing values >>> C3 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], allow_missing=True) # Use all the cores in the machine >>> C3 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], n_jobs=-1) """ # validate data types of standard input parameters self.validate_types_params_tables(ltable, rtable, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, verbose, n_jobs) # validate data types of input parameters specific to overlap blocker self.validate_types_other_params(l_overlap_attr, r_overlap_attr, rem_stop_words, q_val, word_level, overlap_size) # validate data type of allow_missing self.validate_allow_missing(allow_missing) # validate data type of show_progress self.validate_show_progress(show_progress) # validate overlap attributes self.validate_overlap_attrs(ltable, rtable, l_overlap_attr, r_overlap_attr) # validate output attributes self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs) # get and validate required metadata log_info(logger, 'Required metadata: ltable key, rtable key', verbose) # # get metadata l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # validate word_level and q_val self.validate_word_level_qval(word_level, q_val) # do blocking # # do projection before merge l_proj_attrs = self.get_attrs_to_project(l_key, l_overlap_attr, l_output_attrs) l_df = ltable[l_proj_attrs] r_proj_attrs = self.get_attrs_to_project(r_key, r_overlap_attr, r_output_attrs) r_df = rtable[r_proj_attrs] # # case the column to string if required. l_df.is_copy, r_df.is_copy = False, False # to avoid setwithcopy warning ssj.dataframe_column_to_str(l_df, l_overlap_attr, inplace=True) ssj.dataframe_column_to_str(r_df, r_overlap_attr, inplace=True) # # cleanup the tables from non-ascii characters, punctuations, and stop words l_dummy_overlap_attr = '@#__xx__overlap_ltable__#@' r_dummy_overlap_attr = '@#__xx__overlap_rtable__#@' l_df[l_dummy_overlap_attr] = l_df[l_overlap_attr] r_df[r_dummy_overlap_attr] = r_df[r_overlap_attr] if not l_df.empty: self.cleanup_table(l_df, l_dummy_overlap_attr, rem_stop_words) if not r_df.empty: self.cleanup_table(r_df, r_dummy_overlap_attr, rem_stop_words) # # determine which tokenizer to use if word_level == True: # # # create a whitespace tokenizer tokenizer = WhitespaceTokenizer(return_set=True) else: # # # create a qgram tokenizer tokenizer = QgramTokenizer(qval=q_val, return_set=True) # # perform overlap similarity join candset = overlap_join(l_df, r_df, l_key, r_key, l_dummy_overlap_attr, r_dummy_overlap_attr, tokenizer, overlap_size, '>=', allow_missing, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, False, n_jobs, show_progress) # # retain only the required attributes in the output candidate set retain_cols = self.get_attrs_to_retain(l_key, r_key, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix) candset = candset[retain_cols] # update metadata in the catalog key = get_name_for_key(candset.columns) candset = add_key_column(candset, key) cm.set_candset_properties(candset, key, l_output_prefix + l_key, r_output_prefix + r_key, ltable, rtable) # return the candidate set return candset
def block_tables(self, ltable, rtable, l_block_attr, r_block_attr, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_', r_output_prefix='rtable_', allow_missing=False, verbose=False, n_ltable_chunks=1, n_rtable_chunks=1): """ WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK Blocks two tables based on attribute equivalence. Conceptually, this will check `l_block_attr=r_block_attr` for each tuple pair from the Cartesian product of tables `ltable` and `rtable`. It outputs a Pandas dataframe object with tuple pairs that satisfy the equality condition. The dataframe will include attributes '_id', key attribute from ltable, key attributes from rtable, followed by lists `l_output_attrs` and `r_output_attrs` if they are specified. Each of these output and key attributes will be prefixed with given `l_output_prefix` and `r_output_prefix`. If `allow_missing` is set to `True` then all tuple pairs with missing value in at least one of the tuples will be included in the output dataframe. Further, this will update the following metadata in the catalog for the output table: (1) key, (2) ltable, (3) rtable, (4) fk_ltable, and (5) fk_rtable. Args: ltable (DataFrame): The left input table. rtable (DataFrame): The right input table. l_block_attr (string): The blocking attribute in left table. r_block_attr (string): The blocking attribute in right table. l_output_attrs (list): A list of attribute names from the left table to be included in the output candidate set (defaults to None). r_output_attrs (list): A list of attribute names from the right table to be included in the output candidate set (defaults to None). l_output_prefix (string): The prefix to be used for the attribute names coming from the left table in the output candidate set (defaults to 'ltable\_'). r_output_prefix (string): The prefix to be used for the attribute names coming from the right table in the output candidate set (defaults to 'rtable\_'). allow_missing (boolean): A flag to indicate whether tuple pairs with missing value in at least one of the blocking attributes should be included in the output candidate set (defaults to False). If this flag is set to True, a tuple in ltable with missing value in the blocking attribute will be matched with every tuple in rtable and vice versa. verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). n_ltable_chunks (int): The number of partitions to split the left table ( defaults to 1). If it is set to -1, then the number of partitions is set to the number of cores in the machine. n_rtable_chunks (int): The number of partitions to split the right table ( defaults to 1). If it is set to -1, then the number of partitions is set to the number of cores in the machine. Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `ltable` is not of type pandas DataFrame. AssertionError: If `rtable` is not of type pandas DataFrame. AssertionError: If `l_block_attr` is not of type string. AssertionError: If `r_block_attr` is not of type string. AssertionError: If `l_output_attrs` is not of type of list. AssertionError: If `r_output_attrs` is not of type of list. AssertionError: If the values in `l_output_attrs` is not of type string. AssertionError: If the values in `r_output_attrs` is not of type string. AssertionError: If `l_output_prefix` is not of type string. AssertionError: If `r_output_prefix` is not of type string. AssertionError: If `verbose` is not of type boolean. AssertionError: If `allow_missing` is not of type boolean. AssertionError: If `n_ltable_chunks` is not of type int. AssertionError: If `n_rtable_chunks` is not of type int. AssertionError: If `l_block_attr` is not in the ltable columns. AssertionError: If `r_block_attr` is not in the rtable columns. AssertionError: If `l_out_attrs` are not in the ltable. AssertionError: If `r_out_attrs` are not in the rtable. Examples: >>> import py_entitymatching as em >>> from py_entitymatching.dask.dask_attr_equiv_blocker import DaskAttrEquivalenceBlocker >>> ab = DaskAttrEquivalenceBlocker() >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID') >>> C1 = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name']) # Include all possible tuple pairs with missing values >>> C2 = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name'], allow_missing=True) """ logger.warning("WARNING THIS BLOCKER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR " "OWN RISK.") # validate data types of input parameters self.validate_types_params_tables(ltable, rtable, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, verbose, 1) # last arg is # set to 1 just to reuse the function from the # old blocker. # validate data types of input blocking attributes self.validate_types_block_attrs(l_block_attr, r_block_attr) # validate data type of allow_missing self.validate_allow_missing(allow_missing) # validate input parameters self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr) self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs) # validate number of ltable and rtable chunks validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks') validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks') validate_chunks(n_ltable_chunks) validate_chunks(n_rtable_chunks) # get and validate required metadata log_info(logger, 'Required metadata: ltable key, rtable key', verbose) # # get metadata l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # do blocking # # do projection of required attributes from the tables l_proj_attrs = self.get_attrs_to_project(l_key, l_block_attr, l_output_attrs) ltable_proj = ltable[l_proj_attrs] r_proj_attrs = self.get_attrs_to_project(r_key, r_block_attr, r_output_attrs) rtable_proj = rtable[r_proj_attrs] # # remove records with nans in the blocking attribute l_df = rem_nan(ltable_proj, l_block_attr) r_df = rem_nan(rtable_proj, r_block_attr) # # determine the number of chunks n_ltable_chunks = get_num_partitions(n_ltable_chunks, len(ltable)) n_rtable_chunks = get_num_partitions(n_rtable_chunks, len(rtable)) if n_ltable_chunks == 1 and n_rtable_chunks == 1: # single process candset = _block_tables_split(l_df, r_df, l_key, r_key, l_block_attr, r_block_attr, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, allow_missing) else: l_splits = pd.np.array_split(l_df, n_ltable_chunks) r_splits = pd.np.array_split(r_df, n_rtable_chunks) c_splits = [] for l in l_splits: for r in r_splits: partial_result = delayed(_block_tables_split)(l, r, l_key, r_key, l_block_attr, r_block_attr, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, allow_missing) c_splits.append(partial_result) c_splits = delayed(wrap)(c_splits) c_splits = c_splits.compute(scheduler="processes", n_jobs=get_num_cores()) candset = pd.concat(c_splits, ignore_index=True) # if allow_missing flag is True, then compute # all pairs with missing value in left table, and # all pairs with missing value in right table if allow_missing: missing_pairs = self.get_pairs_with_missing_value(ltable_proj, rtable_proj, l_key, r_key, l_block_attr, r_block_attr, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix) candset = pd.concat([candset, missing_pairs], ignore_index=True) # update catalog key = get_name_for_key(candset.columns) candset = add_key_column(candset, key) cm.set_candset_properties(candset, key, l_output_prefix + l_key, r_output_prefix + r_key, ltable, rtable) # return candidate set return candset
def block_tables(self, ltable, rtable, l_overlap_attr, r_overlap_attr, rem_stop_words=False, q_val=None, word_level=True, overlap_size=1, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_', r_output_prefix='rtable_', allow_missing=False, verbose=False, show_progress=True, n_ltable_chunks=1, n_rtable_chunks=1): """ WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK. Blocks two tables based on the overlap of token sets of attribute values. Finds tuple pairs from left and right tables such that the overlap between (a) the set of tokens obtained by tokenizing the value of attribute l_overlap_attr of a tuple from the left table, and (b) the set of tokens obtained by tokenizing the value of attribute r_overlap_attr of a tuple from the right table, is above a certain threshold. Args: ltable (DataFrame): The left input table. rtable (DataFrame): The right input table. l_overlap_attr (string): The overlap attribute in left table. r_overlap_attr (string): The overlap attribute in right table. rem_stop_words (boolean): A flag to indicate whether stop words (e.g., a, an, the) should be removed from the token sets of the overlap attribute values (defaults to False). q_val (int): The value of q to use if the overlap attributes values are to be tokenized as qgrams (defaults to None). word_level (boolean): A flag to indicate whether the overlap attributes should be tokenized as words (i.e, using whitespace as delimiter) (defaults to True). overlap_size (int): The minimum number of tokens that must overlap (defaults to 1). l_output_attrs (list): A list of attribute names from the left table to be included in the output candidate set (defaults to None). r_output_attrs (list): A list of attribute names from the right table to be included in the output candidate set (defaults to None). l_output_prefix (string): The prefix to be used for the attribute names coming from the left table in the output candidate set (defaults to 'ltable\_'). r_output_prefix (string): The prefix to be used for the attribute names coming from the right table in the output candidate set (defaults to 'rtable\_'). allow_missing (boolean): A flag to indicate whether tuple pairs with missing value in at least one of the blocking attributes should be included in the output candidate set (defaults to False). If this flag is set to True, a tuple in ltable with missing value in the blocking attribute will be matched with every tuple in rtable and vice versa. verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_ltable_chunks (int): The number of partitions to split the left table ( defaults to 1). If it is set to -1, then the number of partitions is set to the number of cores in the machine. n_rtable_chunks (int): The number of partitions to split the right table ( defaults to 1). If it is set to -1, then the number of partitions is set to the number of cores in the machine. Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `ltable` is not of type pandas DataFrame. AssertionError: If `rtable` is not of type pandas DataFrame. AssertionError: If `l_overlap_attr` is not of type string. AssertionError: If `r_overlap_attr` is not of type string. AssertionError: If `l_output_attrs` is not of type of list. AssertionError: If `r_output_attrs` is not of type of list. AssertionError: If the values in `l_output_attrs` is not of type string. AssertionError: If the values in `r_output_attrs` is not of type string. AssertionError: If `l_output_prefix` is not of type string. AssertionError: If `r_output_prefix` is not of type string. AssertionError: If `q_val` is not of type int. AssertionError: If `word_level` is not of type boolean. AssertionError: If `overlap_size` is not of type int. AssertionError: If `verbose` is not of type boolean. AssertionError: If `allow_missing` is not of type boolean. AssertionError: If `show_progress` is not of type boolean. AssertionError: If `n_ltable_chunks` is not of type int. AssertionError: If `n_rtable_chunks` is not of type int. AssertionError: If `l_overlap_attr` is not in the ltable columns. AssertionError: If `r_block_attr` is not in the rtable columns. AssertionError: If `l_output_attrs` are not in the ltable. AssertionError: If `r_output_attrs` are not in the rtable. SyntaxError: If `q_val` is set to a valid value and `word_level` is set to True. SyntaxError: If `q_val` is set to None and `word_level` is set to False. Examples: >>> from py_entitymatching.dask.dask_overlap_blocker import DaskOverlapBlocker >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID') >>> ob = DaskOverlapBlocker() # Use all cores # # Use word-level tokenizer >>> C1 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], word_level=True, overlap_size=1, n_ltable_chunks=-1, n_rtable_chunks=-1) # # Use q-gram tokenizer >>> C2 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], word_level=False, q_val=2, n_ltable_chunks=-1, n_rtable_chunks=-1) # # Include all possible missing values >>> C3 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], allow_missing=True, n_ltable_chunks=-1, n_rtable_chunks=-1) """ logger.warning( "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN " "RISK.") # Input validations self.validate_types_params_tables(ltable, rtable, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, verbose, n_ltable_chunks, n_rtable_chunks) self.validate_types_other_params(l_overlap_attr, r_overlap_attr, rem_stop_words, q_val, word_level, overlap_size) self.validate_allow_missing(allow_missing) self.validate_show_progress(show_progress) self.validate_overlap_attrs(ltable, rtable, l_overlap_attr, r_overlap_attr) self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs) self.validate_word_level_qval(word_level, q_val) log_info(logger, 'Required metadata: ltable key, rtable key', verbose) l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # validate input table chunks validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks') validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks') validate_chunks(n_ltable_chunks) validate_chunks(n_rtable_chunks) if n_ltable_chunks == -1: n_ltable_chunks = multiprocessing.cpu_count() ltable_chunks = pd.np.array_split(ltable, n_ltable_chunks) # preprocess/tokenize ltable if word_level == True: tokenizer = WhitespaceTokenizer(return_set=True) else: tokenizer = QgramTokenizer(qval=q_val, return_set=True) preprocessed_tokenized_ltbl = [] # Construct DAG for preprocessing/tokenizing ltable chunks start_row_id = 0 for i in range(len(ltable_chunks)): result = delayed(self.process_tokenize_block_attr)(ltable_chunks[i][ l_overlap_attr], start_row_id, rem_stop_words, tokenizer) preprocessed_tokenized_ltbl.append(result) start_row_id += len(ltable_chunks[i]) preprocessed_tokenized_ltbl = delayed(wrap)(preprocessed_tokenized_ltbl) # Execute the DAG if show_progress: with ProgressBar(): logger.info('Preprocessing/tokenizing ltable') preprocessed_tokenized_ltbl_vals = preprocessed_tokenized_ltbl.compute( scheduler="processes", num_workers=multiprocessing.cpu_count()) else: preprocessed_tokenized_ltbl_vals = preprocessed_tokenized_ltbl.compute( scheduler="processes", num_workers=multiprocessing.cpu_count()) ltable_processed_dict = {} for i in range(len(preprocessed_tokenized_ltbl_vals)): ltable_processed_dict.update(preprocessed_tokenized_ltbl_vals[i]) # build inverted index inverted_index = self.build_inverted_index(ltable_processed_dict) if n_rtable_chunks == -1: n_rtable_chunks = multiprocessing.cpu_count() rtable_chunks = pd.np.array_split(rtable, n_rtable_chunks) # Construct the DAG for probing probe_result = [] start_row_id = 0 for i in range(len(rtable_chunks)): result = delayed(self.probe)(rtable_chunks[i][r_overlap_attr], inverted_index, start_row_id, rem_stop_words, tokenizer, overlap_size) probe_result.append(result) start_row_id += len(rtable_chunks[i]) probe_result = delayed(wrap)(probe_result) # Execute the DAG for probing if show_progress: with ProgressBar(): logger.info('Probing using rtable') probe_result = probe_result.compute(scheduler="processes", num_workers=multiprocessing.cpu_count()) else: probe_result = probe_result.compute(scheduler="processes", num_workers=multiprocessing.cpu_count()) # construct a minimal dataframe that can be used to add more attributes flat_list = [item for sublist in probe_result for item in sublist] tmp = pd.DataFrame(flat_list, columns=['fk_ltable_rid', 'fk_rtable_rid']) fk_ltable = ltable.iloc[tmp.fk_ltable_rid][l_key].values fk_rtable = rtable.iloc[tmp.fk_rtable_rid][r_key].values id_vals = list(range(len(flat_list))) candset = pd.DataFrame.from_dict( {'_id': id_vals, l_output_prefix+l_key: fk_ltable, r_output_prefix+r_key: fk_rtable}) # set the properties for the candidate set cm.set_key(candset, '_id') cm.set_fk_ltable(candset, 'ltable_'+l_key) cm.set_fk_rtable(candset, 'rtable_'+r_key) cm.set_ltable(candset, ltable) cm.set_rtable(candset, rtable) ret_candset = gh.add_output_attributes(candset, l_output_attrs=l_output_attrs, r_output_attrs=r_output_attrs, l_output_prefix=l_output_prefix, r_output_prefix=r_output_prefix, validate=False) # handle missing values if allow_missing: missing_value_pairs = get_pairs_with_missing_value(ltable, rtable, l_key, r_key, l_overlap_attr, r_overlap_attr, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, False, False) missing_value_pairs.insert(0, '_id', range(len(ret_candset), len(ret_candset)+len(missing_value_pairs))) if len(missing_value_pairs) > 0: ret_candset = pd.concat([ret_candset, missing_value_pairs], ignore_index=True, sort=False) cm.set_key(ret_candset, '_id') cm.set_fk_ltable(ret_candset, 'ltable_' + l_key) cm.set_fk_rtable(ret_candset, 'rtable_' + r_key) cm.set_ltable(ret_candset, ltable) cm.set_rtable(ret_candset, rtable) # Return the final candidate set to user. return ret_candset
def backup_debug_blocker(candset, ltable, rtable, output_size=200, attr_corres=None, verbose=False): """ This is the old version of the blocker debugger. It is not reccomended to use this version unless the new blocker debugger is not working properly. This function debugs the blocker output and reports a list of potential matches that are discarded by a blocker (or a blocker sequence). Specifically, this function takes in the two input tables for matching and the candidate set returned by a blocker (or a blocker sequence), and produces a list of tuple pairs which are rejected by the blocker but with high potential of being true matches. Args: candset (DataFrame): The candidate set generated by applying the blocker on the ltable and rtable. ltable,rtable (DataFrame): The input DataFrames that are used to generate the blocker output. output_size (int): The number of tuple pairs that will be returned (defaults to 200). attr_corres (list): A list of attribute correspondence tuples. When ltable and rtable have different schemas, or the same schema but different words describing the attributes, the user needs to manually specify the attribute correspondence. Each element in this list should be a tuple of strings which are the corresponding attributes in ltable and rtable. The default value is None, and if the user doesn't specify this list, a built-in function for finding the attribute correspondence list will be called. But we highly recommend the users manually specify the attribute correspondences, unless the schemas of ltable and rtable are identical (defaults to None). verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). Returns: A pandas DataFrame with 'output_size' number of rows. Each row in the DataFrame is a tuple pair which has potential of being a true match, but is rejected by the blocker (meaning that the tuple pair is in the Cartesian product of ltable and rtable subtracted by the candidate set). The fields in the returned DataFrame are from ltable and rtable, which are useful for determining similar tuple pairs. Raises: AssertionError: If `ltable`, `rtable` or `candset` is not of type pandas DataFrame. AssertionError: If `ltable` or `rtable` is empty (size of 0). AssertionError: If the output `size` parameter is less than or equal to 0. AssertionError: If the attribute correspondence (`attr_corres`) list is not in the correct format (a list of tuples). AssertionError: If the attribute correspondence (`attr_corres`) cannot be built correctly. Examples: >>> import py_entitymatching as em >>> ob = em.OverlapBlocker() >>> C = ob.block_tables(A, B, l_overlap_attr='title', r_overlap_attr='title', overlap_size=3) >>> corres = [('ID','ssn'), ('name', 'ename'), ('address', 'location'),('zipcode', 'zipcode')] >>> D = em.backup_debug_blocker(C, A, B, attr_corres=corres) >>> import py_entitymatching as em >>> ob = em.OverlapBlocker() >>> C = ob.block_tables(A, B, l_overlap_attr='name', r_overlap_attr='name', overlap_size=3) >>> D = em.backup_debug_blocker(C, A, B, output_size=150) """ # Check input types. _validate_types(ltable, rtable, candset, output_size, attr_corres, verbose) # Check table size. if len(ltable) == 0: raise AssertionError('Error: ltable is empty!') if len(rtable) == 0: raise AssertionError('Error: rtable is empty!') # Check the value of output size. if output_size <= 0: raise AssertionError('The input parameter: \'output_size\'' ' is less than or equal to 0. Nothing needs' ' to be done!') # Get table metadata. l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # Validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # Check the user input field correst list (if exists) and get the raw # version of our internal correst list. _check_input_field_correspondence_list(ltable, rtable, attr_corres) corres_list = _get_field_correspondence_list(ltable, rtable, l_key, r_key, attr_corres) # Build the (col_name: col_index) dict to speed up locating a field in # the schema. ltable_col_dict = _build_col_name_index_dict(ltable) rtable_col_dict = _build_col_name_index_dict(rtable) # Filter correspondence list to remove numeric types. We only consider # string types for document concatenation. _filter_corres_list(ltable, rtable, l_key, r_key, ltable_col_dict, rtable_col_dict, corres_list) # Get field filtered new table. ltable_filtered, rtable_filtered = _get_filtered_table( ltable, rtable, l_key, r_key, corres_list) # Select a subset of fields with high scores. feature_list = _select_features(ltable_filtered, rtable_filtered, l_key) # Map the record key value to its index in the table. lrecord_id_to_index_map = _get_record_id_to_index_map(ltable_filtered, l_key) rrecord_id_to_index_map = _get_record_id_to_index_map(rtable_filtered, r_key) # Build the tokenized record list delimited by a white space on the # selected fields. lrecord_list = _get_tokenized_table(ltable_filtered, l_key, feature_list) rrecord_list = _get_tokenized_table(rtable_filtered, r_key, feature_list) # Reformat the candidate set from a dataframe to a list of record index # tuple pair. new_formatted_candidate_set = _index_candidate_set( candset, lrecord_id_to_index_map, rrecord_id_to_index_map, verbose) # Build the token order according to token's frequency. To run a # prefix filtering based similarity join algorithm, we first need # the global token order. order_dict = {} _build_global_token_order(lrecord_list, order_dict) _build_global_token_order(rrecord_list, order_dict) # Sort the tokens in each record by the global order. _sort_record_tokens_by_global_order(lrecord_list, order_dict) _sort_record_tokens_by_global_order(rrecord_list, order_dict) # Run the topk similarity join. topk_heap = _topk_sim_join( lrecord_list, rrecord_list, new_formatted_candidate_set, output_size) # Assemble the topk record list to a dataframe. ret_dataframe = _assemble_topk_table(topk_heap, ltable_filtered, rtable_filtered) return ret_dataframe
def block_tables(self, ltable, rtable, l_block_attr, r_block_attr, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_', r_output_prefix='rtable_', allow_missing=False, verbose=False, n_jobs=1): """Blocks two tables based on attribute equivalence. Conceptually, this will check `l_block_attr=r_block_attr` for each tuple pair from the Cartesian product of tables `ltable` and `rtable`. It outputs a Pandas dataframe object with tuple pairs that satisfy the equality condition. The dataframe will include attributes '_id', key attribute from ltable, key attributes from rtable, followed by lists `l_output_attrs` and `r_output_attrs` if they are specified. Each of these output and key attributes will be prefixed with given `l_output_prefix` and `r_output_prefix`. If `allow_missing` is set to `True` then all tuple pairs with missing value in at least one of the tuples will be included in the output dataframe. Further, this will update the following metadata in the catalog for the output table: (1) key, (2) ltable, (3) rtable, (4) fk_ltable, and (5) fk_rtable. Args: ltable (DataFrame): The left input table. rtable (DataFrame): The right input table. l_block_attr (string): The blocking attribute in left table. r_block_attr (string): The blocking attribute in right table. l_output_attrs (list): A list of attribute names from the left table to be included in the output candidate set (defaults to None). r_output_attrs (list): A list of attribute names from the right table to be included in the output candidate set (defaults to None). l_output_prefix (string): The prefix to be used for the attribute names coming from the left table in the output candidate set (defaults to 'ltable\_'). r_output_prefix (string): The prefix to be used for the attribute names coming from the right table in the output candidate set (defaults to 'rtable\_'). allow_missing (boolean): A flag to indicate whether tuple pairs with missing value in at least one of the blocking attributes should be included in the output candidate set (defaults to False). If this flag is set to True, a tuple in ltable with missing value in the blocking attribute will be matched with every tuple in rtable and vice versa. verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). n_jobs (int): The number of parallel jobs to be used for computation (defaults to 1). If -1 all CPUs are used. If 0 or 1, no parallel computation is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used (where n_cpus is the total number of CPUs in the machine). Thus, for n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs) is less than 1, then no parallel computation is used (i.e., equivalent to the default). Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `ltable` is not of type pandas DataFrame. AssertionError: If `rtable` is not of type pandas DataFrame. AssertionError: If `l_block_attr` is not of type string. AssertionError: If `r_block_attr` is not of type string. AssertionError: If `l_output_attrs` is not of type of list. AssertionError: If `r_output_attrs` is not of type of list. AssertionError: If the values in `l_output_attrs` is not of type string. AssertionError: If the values in `r_output_attrs` is not of type string. AssertionError: If `l_output_prefix` is not of type string. AssertionError: If `r_output_prefix` is not of type string. AssertionError: If `verbose` is not of type boolean. AssertionError: If `allow_missing` is not of type boolean. AssertionError: If `n_jobs` is not of type int. AssertionError: If `l_block_attr` is not in the ltable columns. AssertionError: If `r_block_attr` is not in the rtable columns. AssertionError: If `l_out_attrs` are not in the ltable. AssertionError: If `r_out_attrs` are not in the rtable. Examples: >>> import py_entitymatching as em >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID') >>> ab = em.AttrEquivalenceBlocker() >>> C1 = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name']) # Include all possible tuple pairs with missing values >>> C2 = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name'], allow_missing=True) """ # validate data types of input parameters self.validate_types_params_tables(ltable, rtable, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, verbose, n_jobs) # validate data types of input blocking attributes self.validate_types_block_attrs(l_block_attr, r_block_attr) # validate data type of allow_missing self.validate_allow_missing(allow_missing) # validate input parameters self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr) self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs) # get and validate required metadata log_info(logger, 'Required metadata: ltable key, rtable key', verbose) # # get metadata l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # do blocking # # do projection of required attributes from the tables l_proj_attrs = self.get_attrs_to_project(l_key, l_block_attr, l_output_attrs) ltable_proj = ltable[l_proj_attrs] r_proj_attrs = self.get_attrs_to_project(r_key, r_block_attr, r_output_attrs) rtable_proj = rtable[r_proj_attrs] # # remove records with nans in the blocking attribute l_df = rem_nan(ltable_proj, l_block_attr) r_df = rem_nan(rtable_proj, r_block_attr) # # determine number of processes to launch parallely n_procs = self.get_num_procs(n_jobs, len(l_df) * len(r_df)) if n_procs <= 1: # single process candset = _block_tables_split(l_df, r_df, l_key, r_key, l_block_attr, r_block_attr, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, allow_missing) else: # multiprocessing m, n = self.get_split_params(n_procs, len(l_df), len(r_df)) l_splits = np.array_split(l_df, m) r_splits = np.array_split(r_df, n) c_splits = Parallel(n_jobs=m * n)(delayed(_block_tables_split)( l, r, l_key, r_key, l_block_attr, r_block_attr, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, allow_missing) for l in l_splits for r in r_splits) candset = pd.concat(c_splits, ignore_index=True) # if allow_missing flag is True, then compute # all pairs with missing value in left table, and # all pairs with missing value in right table if allow_missing: missing_pairs = self.get_pairs_with_missing_value( ltable_proj, rtable_proj, l_key, r_key, l_block_attr, r_block_attr, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix) candset = pd.concat([candset, missing_pairs], ignore_index=True) # update catalog key = get_name_for_key(candset.columns) candset = add_key_column(candset, key) cm.set_candset_properties(candset, key, l_output_prefix + l_key, r_output_prefix + r_key, ltable, rtable) # return candidate set return candset
def block_tables(self, ltable, rtable, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_', r_output_prefix='rtable_', verbose=False, show_progress=True, n_jobs=1): """ Blocks two tables based on the sequence of rules supplied by the user. Finds tuple pairs from left and right tables that survive the sequence of blocking rules. A tuple pair survives the sequence of blocking rules if none of the rules in the sequence returns True for that pair. If any of the rules returns True, then the pair is blocked. Args: ltable (DataFrame): The left input table. rtable (DataFrame): The right input table. l_output_attrs (list): A list of attribute names from the left table to be included in the output candidate set (defaults to None). r_output_attrs (list): A list of attribute names from the right table to be included in the output candidate set (defaults to None). l_output_prefix (string): The prefix to be used for the attribute names coming from the left table in the output candidate set (defaults to 'ltable\_'). r_output_prefix (string): The prefix to be used for the attribute names coming from the right table in the output candidate set (defaults to 'rtable\_'). verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_jobs (int): The number of parallel jobs to be used for computation (defaults to 1). If -1 all CPUs are used. If 0 or 1, no parallel computation is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used (where n_cpus is the total number of CPUs in the machine).Thus, for n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs) is less than 1, then no parallel computation is used (i.e., equivalent to the default). Returns: A candidate set of tuple pairs that survived the sequence of blocking rules (DataFrame). Raises: AssertionError: If `ltable` is not of type pandas DataFrame. AssertionError: If `rtable` is not of type pandas DataFrame. AssertionError: If `l_output_attrs` is not of type of list. AssertionError: If `r_output_attrs` is not of type of list. AssertionError: If the values in `l_output_attrs` is not of type string. AssertionError: If the values in `r_output_attrs` is not of type string. AssertionError: If the input `l_output_prefix` is not of type string. AssertionError: If the input `r_output_prefix` is not of type string. AssertionError: If `verbose` is not of type boolean. AssertionError: If `show_progress` is not of type boolean. AssertionError: If `n_jobs` is not of type int. AssertionError: If `l_out_attrs` are not in the ltable. AssertionError: If `r_out_attrs` are not in the rtable. AssertionError: If there are no rules to apply. Examples: >>> import py_entitymatching as em >>> rb = em.RuleBasedBlocker() >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='id') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='id') >>> block_f = em.get_features_for_blocking(A, B) >>> rule = ['name_name_lev(ltuple, rtuple) > 3'] >>> rb.add_rule(rule, feature_table=block_f) >>> C = rb.block_tables(A, B) """ # validate data types of input parameters self.validate_types_params_tables(ltable, rtable, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, verbose, n_jobs) # validate data type of show_progress self.validate_show_progress(show_progress) # validate input parameters self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs) # get and validate metadata log_info(logger, 'Required metadata: ltable key, rtable key', verbose) # # get metadata l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # validate rules assert len(self.rules.keys()) > 0, 'There are no rules to apply' # do blocking # # set index for convenience l_df = ltable.set_index(l_key, drop=False) r_df = rtable.set_index(r_key, drop=False) # # remove l_key from l_output_attrs and r_key from r_output_attrs l_output_attrs_1 = [] if l_output_attrs: l_output_attrs_1 = [x for x in l_output_attrs if x != l_key] r_output_attrs_1 = [] if r_output_attrs: r_output_attrs_1 = [x for x in r_output_attrs if x != r_key] # # get attributes to project l_proj_attrs, r_proj_attrs = self.get_attrs_to_project(l_key, r_key, l_output_attrs_1, r_output_attrs_1) l_df, r_df = l_df[l_proj_attrs], r_df[r_proj_attrs] candset, rule_applied = self.block_tables_with_filters(l_df, r_df, l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix, verbose, show_progress, n_jobs) if candset is None: # no filterable rule was applied candset = self.block_tables_without_filters(l_df, r_df, l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix, verbose, show_progress, n_jobs) elif len(self.rules) > 1: # one filterable rule was applied but other rules are left # block candset by applying other rules and excluding the applied rule candset = self.block_candset_excluding_rule(candset, l_df, r_df, l_key, r_key, l_output_prefix + l_key, r_output_prefix + r_key, rule_applied, show_progress, n_jobs) retain_cols = self.get_attrs_to_retain(l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix) if len(candset) > 0: candset = candset[retain_cols] else: candset = pd.DataFrame(columns=retain_cols) # update catalog key = get_name_for_key(candset.columns) candset = add_key_column(candset, key) cm.set_candset_properties(candset, key, l_output_prefix + l_key, r_output_prefix + r_key, ltable, rtable) # return candidate set return candset
def test_validate_metadata_for_table_invalid_df(self): status = cm._validate_metadata_for_table(None, 'ID', 'table', None, False)
def block_tables(self, ltable, rtable, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_', r_output_prefix='rtable_', verbose=False, show_progress=True, n_ltable_chunks=1, n_rtable_chunks=1): """ WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK. Blocks two tables based on a black box blocking function specified by the user. Finds tuple pairs from left and right tables that survive the black box function. A tuple pair survives the black box blocking function if the function returns False for that pair, otherwise the tuple pair is dropped. Args: ltable (DataFrame): The left input table. rtable (DataFrame): The right input table. l_output_attrs (list): A list of attribute names from the left table to be included in the output candidate set (defaults to None). r_output_attrs (list): A list of attribute names from the right table to be included in the output candidate set (defaults to None). l_output_prefix (string): The prefix to be used for the attribute names coming from the left table in the output candidate set (defaults to 'ltable\_'). r_output_prefix (string): The prefix to be used for the attribute names coming from the right table in the output candidate set (defaults to 'rtable\_'). verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_ltable_chunks (int): The number of partitions to split the left table ( defaults to 1). If it is set to -1, then the number of partitions is set to the number of cores in the machine. n_rtable_chunks (int): The number of partitions to split the right table ( defaults to 1). If it is set to -1, then the number of partitions is set to the number of cores in the machine. Returns: A candidate set of tuple pairs that survived blocking (DataFrame). Raises: AssertionError: If `ltable` is not of type pandas DataFrame. AssertionError: If `rtable` is not of type pandas DataFrame. AssertionError: If `l_output_attrs` is not of type of list. AssertionError: If `r_output_attrs` is not of type of list. AssertionError: If values in `l_output_attrs` is not of type string. AssertionError: If values in `r_output_attrs` is not of type string. AssertionError: If `l_output_prefix` is not of type string. AssertionError: If `r_output_prefix` is not of type string. AssertionError: If `verbose` is not of type boolean. AssertionError: If `show_progress` is not of type boolean. AssertionError: If `n_ltable_chunks` is not of type int. AssertionError: If `n_rtable_chunks` is not of type int. AssertionError: If `l_out_attrs` are not in the ltable. AssertionError: If `r_out_attrs` are not in the rtable. Examples: >>> def match_last_name(ltuple, rtuple): # assume that there is a 'name' attribute in the input tables # and each value in it has two words l_last_name = ltuple['name'].split()[1] r_last_name = rtuple['name'].split()[1] if l_last_name != r_last_name: return True else: return False >>> import py_entitymatching as em >>> from py_entitymatching.dask.dask_black_box_blocker DaskBlackBoxBlocker >>> bb = DaskBlackBoxBlocker() >>> bb.set_black_box_function(match_last_name) >>> C = bb.block_tables(A, B, l_output_attrs=['name'], r_output_attrs=['name'] ) """ logger.warning( "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK." ) # validate data types of standard input parameters self.validate_types_params_tables(ltable, rtable, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, verbose, 1) # validate data type of show_progress self.validate_show_progress(show_progress) # validate black box function assert self.black_box_function != None, 'Black box function is not set' # validate output attributes self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs) # get and validate metadata log_info(logger, 'Required metadata: ltable key, rtable key', verbose) # # get metadata l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # validate number of ltable and rtable chunks validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks') validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks') validate_chunks(n_ltable_chunks) validate_chunks(n_rtable_chunks) # # determine the number of chunks n_ltable_chunks = get_num_partitions(n_ltable_chunks, len(ltable)) n_rtable_chunks = get_num_partitions(n_rtable_chunks, len(rtable)) # do blocking # # set index for convenience l_df = ltable.set_index(l_key, drop=False) r_df = rtable.set_index(r_key, drop=False) # # remove l_key from l_output_attrs and r_key from r_output_attrs l_output_attrs_1 = [] if l_output_attrs: l_output_attrs_1 = [x for x in l_output_attrs if x != l_key] r_output_attrs_1 = [] if r_output_attrs: r_output_attrs_1 = [x for x in r_output_attrs if x != r_key] # # pickle the black-box function before passing it as an arg to # # _block_tables_split to be executed by each child process black_box_function_pkl = cp.dumps(self.black_box_function) if n_ltable_chunks == 1 and n_rtable_chunks == 1: # single process candset = _block_tables_split(l_df, r_df, l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix, black_box_function_pkl, show_progress) else: # multiprocessing l_splits = pd.np.array_split(l_df, n_ltable_chunks) r_splits = pd.np.array_split(r_df, n_rtable_chunks) c_splits = [] for i in range(len(l_splits)): for j in range(len(r_splits)): partial_result = delayed(_block_tables_split)( l_splits[i], r_splits[j], l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix, black_box_function_pkl, False) c_splits.append(partial_result) c_splits = delayed(wrap)(c_splits) if show_progress: with ProgressBar(): c_splits = c_splits.compute(scheduler="processes", num_workers=get_num_cores()) else: c_splits = c_splits.compute(scheduler="processes", num_workers=get_num_cores()) candset = pd.concat(c_splits, ignore_index=True) # # determine the attributes to retain in the output candidate set retain_cols = self.get_attrs_to_retain(l_key, r_key, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix) if len(candset) > 0: candset = candset[retain_cols] else: candset = pd.DataFrame(columns=retain_cols) # update catalog key = get_name_for_key(candset.columns) candset = add_key_column(candset, key) cm.set_candset_properties(candset, key, l_output_prefix + l_key, r_output_prefix + r_key, ltable, rtable) # return candidate set return candset
def test_validate_metadata_for_table_key_notstring(self): A = pd.read_csv(path_a) status = cm._validate_metadata_for_table(A, 'zipcode', 'table', None, False)
def block_tables(self, ltable, rtable, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_', r_output_prefix='rtable_', verbose=False, show_progress=True, n_ltable_chunks=1, n_rtable_chunks=1): """ WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK Blocks two tables based on the sequence of rules supplied by the user. Finds tuple pairs from left and right tables that survive the sequence of blocking rules. A tuple pair survives the sequence of blocking rules if none of the rules in the sequence returns True for that pair. If any of the rules returns True, then the pair is blocked. Args: ltable (DataFrame): The left input table. rtable (DataFrame): The right input table. l_output_attrs (list): A list of attribute names from the left table to be included in the output candidate set (defaults to None). r_output_attrs (list): A list of attribute names from the right table to be included in the output candidate set (defaults to None). l_output_prefix (string): The prefix to be used for the attribute names coming from the left table in the output candidate set (defaults to 'ltable\_'). r_output_prefix (string): The prefix to be used for the attribute names coming from the right table in the output candidate set (defaults to 'rtable\_'). verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_ltable_chunks (int): The number of partitions to split the left table ( defaults to 1). If it is set to -1, then the number of partitions is set to the number of cores in the machine. n_rtable_chunks (int): The number of partitions to split the right table ( defaults to 1). If it is set to -1, then the number of partitions is set to the number of cores in the machine. Returns: A candidate set of tuple pairs that survived the sequence of blocking rules (DataFrame). Raises: AssertionError: If `ltable` is not of type pandas DataFrame. AssertionError: If `rtable` is not of type pandas DataFrame. AssertionError: If `l_output_attrs` is not of type of list. AssertionError: If `r_output_attrs` is not of type of list. AssertionError: If the values in `l_output_attrs` is not of type string. AssertionError: If the values in `r_output_attrs` is not of type string. AssertionError: If the input `l_output_prefix` is not of type string. AssertionError: If the input `r_output_prefix` is not of type string. AssertionError: If `verbose` is not of type boolean. AssertionError: If `show_progress` is not of type boolean. AssertionError: If `n_ltable_chunks` is not of type int. AssertionError: If `n_rtable_chunks` is not of type int. AssertionError: If `l_out_attrs` are not in the ltable. AssertionError: If `r_out_attrs` are not in the rtable. AssertionError: If there are no rules to apply. Examples: >>> import py_entitymatching as em >>> from py_entitymatching.dask.dask_rule_based_blocker import DaskRuleBasedBlocker >>> rb = DaskRuleBasedBlocker() >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='id') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='id') >>> block_f = em.get_features_for_blocking(A, B) >>> rule = ['name_name_lev(ltuple, rtuple) > 3'] >>> rb.add_rule(rule, feature_table=block_f) >>> C = rb.block_tables(A, B) """ logger.warning( "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK." ) # validate data types of input parameters self.validate_types_params_tables(ltable, rtable, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, verbose, 1) # validate data type of show_progress self.validate_show_progress(show_progress) # validate input parameters self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs) # get and validate metadata log_info(logger, 'Required metadata: ltable key, rtable key', verbose) # # get metadata l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # validate rules assert len(self.rules.keys()) > 0, 'There are no rules to apply' # validate number of ltable and rtable chunks validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks') validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks') validate_chunks(n_ltable_chunks) validate_chunks(n_rtable_chunks) # # determine the number of chunks n_ltable_chunks = get_num_partitions(n_ltable_chunks, len(ltable)) n_rtable_chunks = get_num_partitions(n_rtable_chunks, len(rtable)) # # set index for convenience l_df = ltable.set_index(l_key, drop=False) r_df = rtable.set_index(r_key, drop=False) # # remove l_key from l_output_attrs and r_key from r_output_attrs l_output_attrs_1 = [] if l_output_attrs: l_output_attrs_1 = [x for x in l_output_attrs if x != l_key] r_output_attrs_1 = [] if r_output_attrs: r_output_attrs_1 = [x for x in r_output_attrs if x != r_key] # # get attributes to project l_proj_attrs, r_proj_attrs = self.get_attrs_to_project( l_key, r_key, l_output_attrs_1, r_output_attrs_1) l_df, r_df = l_df[l_proj_attrs], r_df[r_proj_attrs] candset, rule_applied = self.block_tables_with_filters( l_df, r_df, l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix, verbose, show_progress, get_num_cores()) # pass number of splits as # the number of cores in the machine if candset is None: # no filterable rule was applied candset = self.block_tables_without_filters( l_df, r_df, l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix, verbose, show_progress, n_ltable_chunks, n_rtable_chunks) elif len(self.rules) > 1: # one filterable rule was applied but other rules are left # block candset by applying other rules and excluding the applied rule candset = self.block_candset_excluding_rule( candset, l_df, r_df, l_key, r_key, l_output_prefix + l_key, r_output_prefix + r_key, rule_applied, show_progress, get_num_cores()) retain_cols = self.get_attrs_to_retain(l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix) if len(candset) > 0: candset = candset[retain_cols] else: candset = pd.DataFrame(columns=retain_cols) # update catalog key = get_name_for_key(candset.columns) candset = add_key_column(candset, key) cm.set_candset_properties(candset, key, l_output_prefix + l_key, r_output_prefix + r_key, ltable, rtable) # return candidate set return candset
def debug_blocker(ltable, rtable, candidate_set, activate_reusing_module, use_new_topk, use_parallel, output_path, output_size=200, attr_corres=None, verbose=True): logging.info('\nstart blocking') total_start = time.time() print 'start time:', total_start # preprocessing_start = time.clock() # Basic checks. if len(ltable) == 0: raise StandardError('Error: ltable is empty!') if len(rtable) == 0: raise StandardError('Error: rtable is empty!') if output_size <= 0: raise StandardError('The input parameter: \'pred_list_size\'' ' is less than or equal to 0. Nothing needs' ' to be done!') print 'cand set size:', len(candidate_set) # get metadata l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # Check the user input field correst list (if exists) and get the raw # version of our internal correst list. check_input_field_correspondence_list(ltable, rtable, attr_corres) corres_list = get_field_correspondence_list(ltable, rtable, l_key, r_key, attr_corres) # Build the (col_name: col_index) dict to speed up locating a field in # the schema. ltable_col_dict = build_col_name_index_dict(ltable) rtable_col_dict = build_col_name_index_dict(rtable) # Filter correspondence list to remove numeric types. We only consider # string types for document concatenation. filter_corres_list(ltable, rtable, l_key, r_key, ltable_col_dict, rtable_col_dict, corres_list) print corres_list # Get field filtered new table. ltable_filtered, rtable_filtered = get_filtered_table( ltable, rtable, corres_list) print ltable_filtered.columns print rtable_filtered.columns feature_list = select_features(ltable_filtered, rtable_filtered, l_key, r_key) feature_index_list = [feature_list[i][0] for i in range(len(feature_list))] print feature_index_list if len(feature_list) == 0: raise StandardError('\nError: the selected field list is empty,' ' nothing could be done! Please check if all' ' table fields are numeric types.') print 'selected_fields:', ltable_filtered.columns[feature_index_list] lrecord_id_to_index_map = build_id_to_index_map(ltable_filtered, l_key) rrecord_id_to_index_map = build_id_to_index_map(rtable_filtered, r_key) print 'finish building id to index map' lrecord_index_to_id_map = build_index_to_id_map(ltable_filtered, l_key) rrecord_index_to_id_map = build_index_to_id_map(rtable_filtered, r_key) print 'finish building index to id map' lrecord_list = get_tokenized_table(ltable_filtered, l_key, feature_index_list) rrecord_list = get_tokenized_table(rtable_filtered, r_key, feature_index_list) print 'finish tokenizing tables' order_dict, token_index_dict = build_global_token_order( lrecord_list, rrecord_list) print 'finish building global order' replace_token_with_numeric_index(lrecord_list, order_dict) replace_token_with_numeric_index(rrecord_list, order_dict) print 'finish replacing tokens with numeric indices' sort_record_tokens_by_global_order(lrecord_list) sort_record_tokens_by_global_order(rrecord_list) print 'finish sorting record tokens' lrecord_token_list, lrecord_index_list, lrecord_field_list =\ split_record_token_and_index(lrecord_list, len(feature_list)) rrecord_token_list, rrecord_index_list, rrecord_field_list =\ split_record_token_and_index(rrecord_list, len(feature_list)) print 'finish splitting record token and index' del lrecord_list del rrecord_list new_formatted_candidate_set = index_candidate_set(candidate_set, lrecord_id_to_index_map, rrecord_id_to_index_map, verbose) print 'finish reformating cand set' ltable_field_length_list = calc_table_field_length(lrecord_index_list, len(feature_list)) rtable_field_length_list = calc_table_field_length(rrecord_index_list, len(feature_list)) ltable_field_token_sum = calc_table_field_token_sum( ltable_field_length_list, len(feature_list)) rtable_field_token_sum = calc_table_field_token_sum( rtable_field_length_list, len(feature_list)) print ltable_field_token_sum print rtable_field_token_sum ltoken_sum = 0 rtoken_sum = 0 for i in range(len(ltable_field_token_sum)): ltoken_sum += ltable_field_token_sum[i] for i in range(len(rtable_field_token_sum)): rtoken_sum += rtable_field_token_sum[i] ltoken_ave = ltoken_sum * 1.0 / len(lrecord_token_list) rtoken_ave = rtoken_sum * 1.0 / len(rrecord_token_list) print ltoken_ave, rtoken_ave ltoken_ratio = [] rtoken_ratio = [] for i in range(len(ltable_field_token_sum)): ltoken_ratio.append(ltable_field_token_sum[i] * 1.0 / ltoken_sum) for i in range(len(rtable_field_token_sum)): rtoken_ratio.append(rtable_field_token_sum[i] * 1.0 / rtoken_sum) print ltoken_ratio print rtoken_ratio # tsum = 0 # tlist = [] # for i in xrange(len(ltable_field_token_sum)): # value = (ltable_field_token_sum[i] + rtable_field_token_sum[i]) \ # * 1.0 / (len(lrecord_token_list) + len(rrecord_token_list)) # tsum += value # tlist.append(value) # for i in xrange(len(tlist)): # tlist[i] /= tsum # print tlist debugblocker_cython(lrecord_token_list, rrecord_token_list, lrecord_index_list, rrecord_index_list, lrecord_field_list, rrecord_field_list, ltable_field_token_sum, rtable_field_token_sum, new_formatted_candidate_set, len(feature_list), output_size, output_path, activate_reusing_module, use_new_topk, use_parallel) total_end = time.time() total_time = total_end - total_start print 'total time:', total_time return total_time
def debug_blocker(candidate_set, ltable, rtable, output_size=200, attr_corres=None, verbose=True, n_jobs=1, n_configs=1): """ This function debugs the blocker output and reports a list of potential matches that are discarded by a blocker (or a blocker sequence). Specifically, this function takes in the two input tables for matching and the candidate set returned by a blocker (or a blocker sequence), and produces a list of tuple pairs which are rejected by the blocker but with high potential of being true matches. Args: candidate_set (DataFrame): The candidate set generated by applying the blocker on the ltable and rtable. ltable,rtable (DataFrame): The input DataFrames that are used to generate the blocker output. output_size (int): The number of tuple pairs that will be returned (defaults to 200). attr_corres (list): A list of attribute correspondence tuples. When ltable and rtable have different schemas, or the same schema but different words describing the attributes, the user needs to manually specify the attribute correspondence. Each element in this list should be a tuple of strings which are the corresponding attributes in ltable and rtable. The default value is None, and if the user doesn't specify this list, a built-in function for finding the attribute correspondence list will be called. But we highly recommend the users manually specify the attribute correspondences, unless the schemas of ltable and rtable are identical (defaults to None). verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). n_jobs (int): The number of parallel jobs to be used for computation (defaults to 1). If -1 all CPUs are used. If 0 or 1, no parallel computation is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used (where n_cpus are the total number of CPUs in the machine).Thus, for n_jobs = -2, all CPUs but one are used. If (n_cpus + 1 + n_jobs) is less than 1, then no parallel computation is used (i.e., equivalent to the default). n_configs (int): The maximum number of configs to be used for calculating the topk list(defaults to 1). If -1, the config number is set as the number of cpu. If -2, all configs are used. if n_configs is less than the maximum number of generated configs, then n_configs will be used. Otherwise, all the generated configs will be used. Returns: A pandas DataFrame with 'output_size' number of rows. Each row in the DataFrame is a tuple pair which has potential of being a true match, but is rejected by the blocker (meaning that the tuple pair is in the Cartesian product of ltable and rtable subtracted by the candidate set). The fields in the returned DataFrame are from ltable and rtable, which are useful for determining similar tuple pairs. Raises: AssertionError: If `ltable`, `rtable` or `candset` is not of type pandas DataFrame. AssertionError: If `ltable` or `rtable` is empty (size of 0). AssertionError: If the output `size` parameter is less than or equal to 0. AssertionError: If the attribute correspondence (`attr_corres`) list is not in the correct format (a list of tuples). AssertionError: If the attribute correspondence (`attr_corres`) cannot be built correctly. Examples: >>> import py_entitymatching as em >>> ob = em.OverlapBlocker() >>> C = ob.block_tables(A, B, l_overlap_attr='title', r_overlap_attr='title', overlap_size=3) >>> corres = [('ID','ssn'), ('name', 'ename'), ('address', 'location'),('zipcode', 'zipcode')] >>> D = em.debug_blocker(C, A, B, attr_corres=corres) >>> import py_entitymatching as em >>> ob = em.OverlapBlocker() >>> C = ob.block_tables(A, B, l_overlap_attr='name', r_overlap_attr='name', overlap_size=3) >>> D = em.debug_blocker(C, A, B, output_size=150) """ # Check input types. _validate_types(ltable, rtable, candidate_set, output_size, attr_corres, verbose) # Basic checks. # Check table size. if len(ltable) == 0: raise AssertionError('Error: ltable is empty!') if len(rtable) == 0: raise AssertionError('Error: rtable is empty!') # Check the value of output size. if output_size <= 0: raise AssertionError('The input parameter: \'pred_list_size\'' ' is less than or equal to 0. Nothing needs' ' to be done!') # get metadata l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # Check the user input field correst list (if exists) and get the raw # version of our internal correst list. _check_input_field_correspondence_list(ltable, rtable, attr_corres) corres_list = _get_field_correspondence_list(ltable, rtable, l_key, r_key, attr_corres) # Build the (col_name: col_index) dict to speed up locating a field in # the schema. ltable_col_dict = _build_col_name_index_dict(ltable) rtable_col_dict = _build_col_name_index_dict(rtable) # Filter correspondence list to remove numeric types. We only consider # string types for document concatenation. _filter_corres_list(ltable, rtable, l_key, r_key, ltable_col_dict, rtable_col_dict, corres_list) # Get field filtered new table. ltable_filtered, rtable_filtered = _get_filtered_table( ltable, rtable, corres_list) # Select a subset of fields with high scores feature_list = _select_features(ltable_filtered, rtable_filtered, l_key, r_key) if len(feature_list) == 0: raise AssertionError('\nError: the selected field list is empty,' ' nothing could be done! Please check if all' ' table fields are numeric types.') # Map the record key value to its index in the table lrecord_id_to_index_map = _build_id_to_index_map(ltable_filtered, l_key) rrecord_id_to_index_map = _build_id_to_index_map(rtable_filtered, r_key) # Build the tokenized record list delimited by a white space on the # selected fields. lrecord_list = _get_tokenized_table(ltable_filtered, l_key, feature_list) rrecord_list = _get_tokenized_table(rtable_filtered, r_key, feature_list) # Build the token order according to token's frequency. To run a # prefix filtering based similarity join algorithm, we first need # the global token order. order_dict, token_index_dict = _build_global_token_order( lrecord_list, rrecord_list) # Sort the token in each record by the global order. _replace_token_with_numeric_index(lrecord_list, order_dict) _replace_token_with_numeric_index(rrecord_list, order_dict) _sort_record_tokens_by_global_order(lrecord_list) _sort_record_tokens_by_global_order(rrecord_list) lrecord_token_list, lrecord_index_list =\ _split_record_token_and_index(lrecord_list) rrecord_token_list, rrecord_index_list =\ _split_record_token_and_index(rrecord_list) del lrecord_list del rrecord_list # Reformat the candidate set from a dataframe to a list of record index # tuple pair. new_formatted_candidate_set = _index_candidate_set( candidate_set, lrecord_id_to_index_map, rrecord_id_to_index_map, verbose) ltable_field_length_list = _calc_table_field_length( lrecord_index_list, len(feature_list)) rtable_field_length_list = _calc_table_field_length( rrecord_index_list, len(feature_list)) ltable_field_token_sum = _calc_table_field_token_sum( ltable_field_length_list, len(feature_list)) rtable_field_token_sum = _calc_table_field_token_sum( rtable_field_length_list, len(feature_list)) rec_list = debugblocker_cython_parallel( lrecord_token_list, rrecord_token_list, lrecord_index_list, rrecord_index_list, ltable_field_token_sum, rtable_field_token_sum, new_formatted_candidate_set, len(feature_list), output_size, n_jobs, n_configs) ret_dataframe = _assemble_topk_table(rec_list[0:output_size], ltable_filtered, rtable_filtered, l_key, r_key) return ret_dataframe
def block_tables(self, ltable, rtable, l_output_attrs=None, r_output_attrs=None, l_output_prefix='ltable_', r_output_prefix='rtable_', verbose=False, show_progress=True, n_ltable_chunks=1, n_rtable_chunks=1): """ WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK Blocks two tables based on the sequence of rules supplied by the user. Finds tuple pairs from left and right tables that survive the sequence of blocking rules. A tuple pair survives the sequence of blocking rules if none of the rules in the sequence returns True for that pair. If any of the rules returns True, then the pair is blocked. Args: ltable (DataFrame): The left input table. rtable (DataFrame): The right input table. l_output_attrs (list): A list of attribute names from the left table to be included in the output candidate set (defaults to None). r_output_attrs (list): A list of attribute names from the right table to be included in the output candidate set (defaults to None). l_output_prefix (string): The prefix to be used for the attribute names coming from the left table in the output candidate set (defaults to 'ltable\_'). r_output_prefix (string): The prefix to be used for the attribute names coming from the right table in the output candidate set (defaults to 'rtable\_'). verbose (boolean): A flag to indicate whether the debug information should be logged (defaults to False). show_progress (boolean): A flag to indicate whether progress should be displayed to the user (defaults to True). n_ltable_chunks (int): The number of partitions to split the left table ( defaults to 1). If it is set to -1, then the number of partitions is set to the number of cores in the machine. n_rtable_chunks (int): The number of partitions to split the right table ( defaults to 1). If it is set to -1, then the number of partitions is set to the number of cores in the machine. Returns: A candidate set of tuple pairs that survived the sequence of blocking rules (DataFrame). Raises: AssertionError: If `ltable` is not of type pandas DataFrame. AssertionError: If `rtable` is not of type pandas DataFrame. AssertionError: If `l_output_attrs` is not of type of list. AssertionError: If `r_output_attrs` is not of type of list. AssertionError: If the values in `l_output_attrs` is not of type string. AssertionError: If the values in `r_output_attrs` is not of type string. AssertionError: If the input `l_output_prefix` is not of type string. AssertionError: If the input `r_output_prefix` is not of type string. AssertionError: If `verbose` is not of type boolean. AssertionError: If `show_progress` is not of type boolean. AssertionError: If `n_ltable_chunks` is not of type int. AssertionError: If `n_rtable_chunks` is not of type int. AssertionError: If `l_out_attrs` are not in the ltable. AssertionError: If `r_out_attrs` are not in the rtable. AssertionError: If there are no rules to apply. Examples: >>> import py_entitymatching as em >>> from py_entitymatching.dask.dask_rule_based_blocker import DaskRuleBasedBlocker >>> rb = DaskRuleBasedBlocker() >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='id') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='id') >>> block_f = em.get_features_for_blocking(A, B) >>> rule = ['name_name_lev(ltuple, rtuple) > 3'] >>> rb.add_rule(rule, feature_table=block_f) >>> C = rb.block_tables(A, B) """ logger.warning( "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.") # validate data types of input parameters self.validate_types_params_tables(ltable, rtable, l_output_attrs, r_output_attrs, l_output_prefix, r_output_prefix, verbose, 1) # validate data type of show_progress self.validate_show_progress(show_progress) # validate input parameters self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs) # get and validate metadata log_info(logger, 'Required metadata: ltable key, rtable key', verbose) # # get metadata l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger, verbose) # # validate metadata cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose) cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose) # validate rules assert len(self.rules.keys()) > 0, 'There are no rules to apply' # validate number of ltable and rtable chunks validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks') validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks') validate_chunks(n_ltable_chunks) validate_chunks(n_rtable_chunks) # # determine the number of chunks n_ltable_chunks = get_num_partitions(n_ltable_chunks, len(ltable)) n_rtable_chunks = get_num_partitions(n_rtable_chunks, len(rtable)) # # set index for convenience l_df = ltable.set_index(l_key, drop=False) r_df = rtable.set_index(r_key, drop=False) # # remove l_key from l_output_attrs and r_key from r_output_attrs l_output_attrs_1 = [] if l_output_attrs: l_output_attrs_1 = [x for x in l_output_attrs if x != l_key] r_output_attrs_1 = [] if r_output_attrs: r_output_attrs_1 = [x for x in r_output_attrs if x != r_key] # # get attributes to project l_proj_attrs, r_proj_attrs = self.get_attrs_to_project(l_key, r_key, l_output_attrs_1, r_output_attrs_1) l_df, r_df = l_df[l_proj_attrs], r_df[r_proj_attrs] candset, rule_applied = self.block_tables_with_filters(l_df, r_df, l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix, verbose, show_progress, get_num_cores()) # pass number of splits as # the number of cores in the machine if candset is None: # no filterable rule was applied candset = self.block_tables_without_filters(l_df, r_df, l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix, verbose, show_progress, n_ltable_chunks, n_rtable_chunks) elif len(self.rules) > 1: # one filterable rule was applied but other rules are left # block candset by applying other rules and excluding the applied rule candset = self.block_candset_excluding_rule(candset, l_df, r_df, l_key, r_key, l_output_prefix + l_key, r_output_prefix + r_key, rule_applied, show_progress, get_num_cores()) retain_cols = self.get_attrs_to_retain(l_key, r_key, l_output_attrs_1, r_output_attrs_1, l_output_prefix, r_output_prefix) if len(candset) > 0: candset = candset[retain_cols] else: candset = pd.DataFrame(columns=retain_cols) # update catalog key = get_name_for_key(candset.columns) candset = add_key_column(candset, key) cm.set_candset_properties(candset, key, l_output_prefix + l_key, r_output_prefix + r_key, ltable, rtable) # return candidate set return candset