def _index_candidate_set(candidate_set, lrecord_id_to_index_map, rrecord_id_to_index_map, verbose): if len(candidate_set) == 0: return {} new_formatted_candidate_set = {} # Get metadata key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \ cm.get_metadata_for_candset(candidate_set, logger, verbose) # Validate metadata cm._validate_metadata_for_candset(candidate_set, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, logger, verbose) ltable_key_data = list(candidate_set[fk_ltable]) rtable_key_data = list(candidate_set[fk_rtable]) for i in range(len(ltable_key_data)): if ltable_key_data[i] in lrecord_id_to_index_map and \ rtable_key_data[i] in rrecord_id_to_index_map: l_key_data = lrecord_id_to_index_map[ltable_key_data[i]] r_key_data = rrecord_id_to_index_map[rtable_key_data[i]] if l_key_data in new_formatted_candidate_set: new_formatted_candidate_set[l_key_data].add(r_key_data) else: new_formatted_candidate_set[l_key_data] = {r_key_data} return new_formatted_candidate_set
def rename_col(df, old_col_name, new_col_name):
    """
    Rename one column and keep the catalog metadata pointing at it.

    Args:
        df (DataFrame): The input DataFrame.
        old_col_name (string): The current column name.
        new_col_name (string): The name the column should get.

    Returns:
        A new DataFrame with the column renamed. If `df` has catalog
        information, its properties are copied to the new DataFrame and the
        key (or, for a candidate set, the key / fk_ltable / fk_rtable)
        property is updated when it referred to the renamed column.
    """
    renamed = df.rename(columns={old_col_name: new_col_name})

    # Frames unknown to the catalog have no metadata to carry over.
    if not cm.is_dfinfo_present(df):
        return renamed

    cm.init_properties(renamed)
    cm.copy_properties(df, renamed)

    if not _is_table_or_candset(df):
        return renamed

    if _is_table(df):
        # Plain table: only the key can refer to the renamed column.
        if cm.get_key(df) == old_col_name:
            cm.set_key(renamed, new_col_name)
        return renamed

    # Candidate set: at most one of key / fk_ltable / fk_rtable is updated.
    key, fk_ltable, fk_rtable, _ltable, _rtable, _l_key, _r_key = \
        cm.get_metadata_for_candset(df, logger, False)
    if key == old_col_name:
        cm.set_key(renamed, new_col_name)
    elif fk_ltable == old_col_name:
        cm.set_fk_ltable(renamed, new_col_name)
    elif fk_rtable == old_col_name:
        cm.set_fk_rtable(renamed, new_col_name)

    return renamed
def _predict_candset(self, candset, verbose=False):
    """
    Apply the matcher's rules to every tuple pair of a candidate set.

    Args:
        candset (DataFrame): The candidate set; its metadata (key, foreign
            keys, ltable, rtable and their keys) is read from the catalog.
        verbose (boolean): Forwarded to the catalog helpers (defaults to
            False).

    Returns:
        A list with one entry per row of `candset`, in row order: 1 when
        `self.apply_rules` returned exactly True for the pair, 0 otherwise.
    """
    # # get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # # validate metadata
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # # index both base tables by their key so rows can be fetched by id
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # positions of the foreign-key columns inside each candset row tuple
    col_names = list(candset.columns)
    lid_idx = col_names.index(fk_ltable)
    rid_idx = col_names.index(fk_rtable)

    predictions = []
    for row in candset.itertuples(index=False):
        # `.loc` is the label-based lookup; `DataFrame.ix` no longer exists
        # in pandas >= 1.0.
        l_row = l_df.loc[row[lid_idx]]
        r_row = r_df.loc[row[rid_idx]]
        res = self.apply_rules(l_row, r_row)
        # Only an exact True counts as a match (unchanged contract).
        predictions.append(1 if res is True else 0)

    return predictions
def add_output_attributes(candset, l_output_attrs=None, r_output_attrs=None,
                          l_output_prefix='ltable_',
                          r_output_prefix='rtable_',
                          validate=True, copy_props=True,
                          delete_from_catalog=True, verbose=False):
    """
    Add attributes from the base tables to a candidate set.

    Args:
        candset (DataFrame): The candidate set to extend.
        l_output_attrs (list): Left-table attributes to add (defaults to
            None).
        r_output_attrs (list): Right-table attributes to add (defaults to
            None).
        l_output_prefix (string): Prefix passed on for the left attributes
            (defaults to 'ltable_').
        r_output_prefix (string): Prefix passed on for the right attributes
            (defaults to 'rtable_').
        validate (boolean): Whether the candset metadata is validated
            (defaults to True).
        copy_props (boolean): Whether the catalog properties of `candset`
            are copied to the result (defaults to True).
        delete_from_catalog (boolean): Whether the properties of `candset`
            are removed from the catalog afterwards (defaults to True).
        verbose (boolean): Forwarded to the catalog helpers (defaults to
            False).

    Returns:
        The DataFrame built by `_add_output_attributes`, re-indexed with the
        index of the input candidate set.

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
    """
    if not isinstance(candset, pd.DataFrame):
        logger.error('Input object is not of type pandas data frame')
        raise AssertionError('Input object is not of type pandas data frame')

    # # get metadata
    metadata = cm.get_metadata_for_candset(candset, logger, verbose)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = metadata

    if validate:
        cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                          ltable, rtable, l_key, r_key,
                                          logger, verbose)

    # Remember the caller's index so it can be restored on the result.
    original_index = candset.index
    df = _add_output_attributes(candset, fk_ltable, fk_rtable,
                                ltable, rtable, l_key, r_key,
                                l_output_attrs, r_output_attrs,
                                l_output_prefix, r_output_prefix,
                                validate=False)
    df.set_index(original_index, inplace=True)

    if copy_props:
        cm.init_properties(df)
        cm.copy_properties(candset, df)

    if delete_from_catalog:
        cm.del_all_properties(candset)

    return df
def add_output_attributes(candset, l_output_attrs=None, r_output_attrs=None,
                          l_output_prefix='ltable_',
                          r_output_prefix='rtable_',
                          validate=True, copy_props=True,
                          delete_from_catalog=True, verbose=False):
    """
    Add attributes from the base tables to a candidate set.

    Args:
        candset (DataFrame): The candidate set to extend.
        l_output_attrs (list): Left-table attributes to add (defaults to
            None).
        r_output_attrs (list): Right-table attributes to add (defaults to
            None).
        l_output_prefix (string): Prefix passed on for the left attributes
            (defaults to 'ltable_').
        r_output_prefix (string): Prefix passed on for the right attributes
            (defaults to 'rtable_').
        validate (boolean): Whether the candset metadata is validated
            (defaults to True).
        copy_props (boolean): Whether the catalog properties of `candset`
            are copied to the result (defaults to True).
        delete_from_catalog (boolean): Whether the properties of `candset`
            are removed from the catalog afterwards (defaults to True).
        verbose (boolean): Forwarded to the catalog helpers (defaults to
            False).

    Returns:
        The DataFrame built by `_add_output_attributes`, re-indexed with the
        index of the input candidate set.

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
    """
    if not isinstance(candset, pd.DataFrame):
        logger.error('Input object is not of type pandas data frame')
        raise AssertionError('Input object is not of type pandas data frame')

    # # get metadata
    metadata = cm.get_metadata_for_candset(candset, logger, verbose)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = metadata

    if validate:
        cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                          ltable, rtable, l_key, r_key,
                                          logger, verbose)

    # Remember the caller's index so it can be restored on the result.
    original_index = candset.index
    df = _add_output_attributes(candset, fk_ltable, fk_rtable,
                                ltable, rtable, l_key, r_key,
                                l_output_attrs, r_output_attrs,
                                l_output_prefix, r_output_prefix,
                                validate=False)
    df.set_index(original_index, inplace=True)

    if copy_props:
        cm.init_properties(df)
        cm.copy_properties(candset, df)

    if delete_from_catalog:
        cm.del_all_properties(candset)

    return df
def rename_col(df, old_col_name, new_col_name):
    """
    Rename one column and keep the catalog metadata pointing at it.

    Args:
        df (DataFrame): The input DataFrame.
        old_col_name (string): The current column name.
        new_col_name (string): The name the column should get.

    Returns:
        A new DataFrame with the column renamed. If `df` has catalog
        information, its properties are copied to the new DataFrame and the
        key (or, for a candidate set, the key / fk_ltable / fk_rtable)
        property is updated when it referred to the renamed column.
    """
    renamed = df.rename(columns={old_col_name: new_col_name})

    # Frames unknown to the catalog have no metadata to carry over.
    if not cm.is_dfinfo_present(df):
        return renamed

    cm.init_properties(renamed)
    cm.copy_properties(df, renamed)

    if not _is_table_or_candset(df):
        return renamed

    if _is_table(df):
        # Plain table: only the key can refer to the renamed column.
        if cm.get_key(df) == old_col_name:
            cm.set_key(renamed, new_col_name)
        return renamed

    # Candidate set: at most one of key / fk_ltable / fk_rtable is updated.
    key, fk_ltable, fk_rtable, _ltable, _rtable, _l_key, _r_key = \
        cm.get_metadata_for_candset(df, logger, False)
    if key == old_col_name:
        cm.set_key(renamed, new_col_name)
    elif fk_ltable == old_col_name:
        cm.set_fk_ltable(renamed, new_col_name)
    elif fk_rtable == old_col_name:
        cm.set_fk_rtable(renamed, new_col_name)

    return renamed
def _predict_candset(self, candset, verbose=False):
    """
    Apply the matcher's rules to every tuple pair of a candidate set.

    Args:
        candset (DataFrame): The candidate set; its metadata (key, foreign
            keys, ltable, rtable and their keys) is read from the catalog.
        verbose (boolean): Forwarded to the catalog helpers (defaults to
            False).

    Returns:
        A list with one entry per row of `candset`, in row order: 1 when
        `self.apply_rules` returned exactly True for the pair, 0 otherwise.
    """
    # # get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # # validate metadata
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # # index both base tables by their key so rows can be fetched by id
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # positions of the foreign-key columns inside each candset row tuple
    col_names = list(candset.columns)
    lid_idx = col_names.index(fk_ltable)
    rid_idx = col_names.index(fk_rtable)

    predictions = []
    for row in candset.itertuples(index=False):
        # `.loc` is the label-based lookup; `DataFrame.ix` no longer exists
        # in pandas >= 1.0.
        l_row = l_df.loc[row[lid_idx]]
        r_row = r_df.loc[row[rid_idx]]
        res = self.apply_rules(l_row, r_row)
        # Only an exact True counts as a match (unchanged contract).
        predictions.append(1 if res is True else 0)

    return predictions
def get_false_negatives_as_df(table, eval_summary, verbose=False):
    """
    Return the false negatives recorded in an evaluation summary as a
    DataFrame.

    Args:
        table (DataFrame): The input table (pandas DataFrame) that was used
            for evaluation.
        eval_summary (dictionary): A Python dictionary containing evaluation
            results, typically from 'eval_matches' command. Its
            'false_neg_ls' entry is used.
        verbose (boolean): A flag to indicate whether progress messages
            should be logged (defaults to False).

    Returns:
        A pandas DataFrame containing only the false negatives from the
        input table. The output DataFrame's catalog properties are copied
        from the input DataFrame.

    Raises:
        AssertionError: If `table` is not of type pandas DataFrame.
    """
    # The input must be a pandas DataFrame.
    if not isinstance(table, pd.DataFrame):
        logger.error('Input cand.set is not of type dataframe')
        raise AssertionError('Input cand.set is not of type dataframe')

    # Tell the user which metadata is needed, then fetch and check it.
    ch.log_info(logger,
                'Required metadata: cand.set key, fk ltable, fk rtable, '
                'ltable, rtable, ltable key, rtable key',
                verbose)

    ch.log_info(logger, 'Getting metadata from the catalog', verbose)
    metadata = cm.get_metadata_for_candset(table, logger, verbose)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = metadata

    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Pick out the rows listed as false negatives.
    false_negatives = _get_dataframe(table, eval_summary['false_neg_ls'])

    # Register the result in the catalog with the input's properties.
    ch.log_info(logger, 'Updating catalog', verbose)
    cm.init_properties(false_negatives)
    cm.copy_properties(table, false_negatives)

    ch.log_info(logger, 'Returning the dataframe', verbose)
    return false_negatives
def test_get_metadata_for_candset_valid(self):
    # A candidate set read together with its base tables must expose the
    # expected key, foreign keys, table keys and the very same base tables.
    table_a = read_csv_metadata(path_a)
    table_b = read_csv_metadata(path_b, key='ID')
    candset = read_csv_metadata(path_c, ltable=table_a, rtable=table_b)

    metadata = cm.get_metadata_for_candset(candset, None, False)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = metadata

    # candidate-set level properties
    self.assertEqual(key, '_id')
    self.assertEqual(fk_ltable, 'ltable_ID')
    self.assertEqual(fk_rtable, 'rtable_ID')

    # base-table keys
    self.assertEqual(l_key, 'ID')
    self.assertEqual(r_key, 'ID')

    # base tables themselves
    self.assertEqual(ltable.equals(table_a), True)
    self.assertEqual(rtable.equals(table_b), True)
def execute(self, input_table, label_column, inplace=True, verbose=False):
    """
    Apply the rules to a labeled table and overwrite the labels they hit.

    Every row whose label differs from `self.value_to_set` is evaluated with
    `self.apply_rules`; when the result equals `self.cond_status`, the
    row's label is replaced by `self.value_to_set`.

    Args:
        input_table (DataFrame): A candidate set (with catalog metadata)
            that contains `label_column`.
        label_column (string): Name of the column holding the labels.
        inplace (boolean): When False a copy of `input_table` is modified
            and returned; otherwise `input_table` itself is modified
            (defaults to True).
        verbose (boolean): Forwarded to the catalog helpers (defaults to
            False).

    Returns:
        The DataFrame with updated labels (`input_table` itself unless
        `inplace` is False).

    Raises:
        AssertionError: If the left or right table is not set in the
            metadata, or `label_column` is not a column of `input_table`.
    """
    # # get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(input_table, logger, verbose)

    # # validate metadata
    cm._validate_metadata_for_candset(input_table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Raised explicitly (not via `assert`) so the checks survive `python -O`.
    if ltable is None:
        raise AssertionError('Left table is not set')
    if rtable is None:
        raise AssertionError('Right table is not set')
    if label_column not in input_table.columns:
        raise AssertionError('Label column not in the input table')

    # `== False` is kept so that non-bool values are treated as before.
    if inplace == False:  # noqa: E712
        table = input_table.copy()
    else:
        table = input_table

    # Index the base tables by key so tuples can be fetched by id.
    l_tbl = ltable.set_index(l_key, drop=False)
    r_tbl = rtable.set_index(r_key, drop=False)

    # The ids of a pair live in the candidate set's foreign-key columns,
    # not in columns named after the base tables' keys.
    column_names = list(input_table.columns)
    lid_idx = column_names.index(fk_ltable)
    rid_idx = column_names.index(fk_rtable)
    label_idx = column_names.index(label_column)

    for idx, row in enumerate(input_table.itertuples(index=False)):
        # Rows that already carry the target label need no evaluation.
        if row[label_idx] == self.value_to_set:
            continue
        # `.loc` is the label-based lookup; `.ix` was removed in pandas 1.0.
        l_row = l_tbl.loc[row[lid_idx]]
        r_row = r_tbl.loc[row[rid_idx]]
        if self.apply_rules(l_row, r_row) == self.cond_status:
            table.iat[idx, label_idx] = self.value_to_set

    return table
def _validate_inputs(table, label_column_name, verbose):
    """
    This function validates the inputs for the label_table function.

    Args:
        table (DataFrame): The table that is going to be labeled.
        label_column_name (string): Name of the label column to be added; it
            must not already be present in `table`.
        verbose (boolean): A flag to indicate whether progress messages
            should be logged.

    Returns:
        True if every check passed.

    Raises:
        AssertionError: If `table` is not of type pandas DataFrame.
        AssertionError: If `label_column_name` is not of type string.
        AssertionError: If `label_column_name` is already present in
            `table`.
    """
    # Validate the input parameters

    # # The input table is expected to be of type pandas DataFrame
    if not isinstance(table, pd.DataFrame):
        logger.error('Input object is not of type data frame')
        raise AssertionError('Input object is not of type data frame')

    # # The label column name is expected to be of type string
    if not isinstance(label_column_name, six.string_types):
        logger.error('Input attr. is not of type string')
        raise AssertionError('Input attr. is not of type string')

    # # Check if the label column name is already present in the input table
    if ch.check_attrs_present(table, label_column_name):
        logger.error('The label column name (%s) is already present in the '
                     'input table', label_column_name)
        # Exceptions do not apply %-formatting to their arguments, so the
        # message has to be interpolated explicitly.
        raise AssertionError('The label column name (%s) is already present '
                             'in the input table' % label_column_name)

    # Now, validate the metadata for the input DataFrame as we have to copy
    # these properties to the output DataFrame

    # # First, display what metadata is required for this function
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, ltable, rtable, ltable key, rtable key',
                verbose)

    # # Second, get the metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # # Third, validate the metadata
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Return True if everything was successful
    return True
def preserve_metadata(df, new_df):
    """
    Copy the catalog properties of `df` to `new_df` when that is safe.

    Args:
        df (DataFrame): The DataFrame whose metadata should be preserved.
        new_df (DataFrame): The DataFrame that should receive the metadata.

    Returns:
        `new_df`. Its properties are initialised and copied from `df` only
        if `df` has catalog information and, when `df` is a table or a
        candidate set, `new_df` still contains the key attribute(s) (key
        for a table; key, fk_ltable and fk_rtable for a candidate set).
        Otherwise a warning is logged and `new_df` is returned untouched.
    """
    # No catalog entry for the source: nothing to preserve.
    if not cm.is_dfinfo_present(df):
        return new_df

    if _is_table_or_candset(df):
        if _is_table(df):
            required_attrs = [cm.get_key(df)]
        else:
            key, fk_ltable, fk_rtable, _ltable, _rtable, _l_key, _r_key = \
                cm.get_metadata_for_candset(df, logger, False)
            required_attrs = [key, fk_ltable, fk_rtable]

        # The metadata would dangle if any of these columns were gone.
        if not ch.check_attrs_present(new_df, required_attrs):
            logger.warning('Not setting the metadata as some attrs '
                           'are not present')
            return new_df

    cm.init_properties(new_df)
    cm.copy_properties(df, new_df)
    return new_df
def drop_cols(df, col_list):
    """
    Drop columns from a DataFrame while protecting catalog key attributes.

    Args:
        df (DataFrame): The input DataFrame.
        col_list (list or single column name): The column(s) to drop. A
            non-list value is treated as a single column name.

    Returns:
        A new DataFrame without the requested columns. If `df` has catalog
        information, its key (for a table) or key, fk_ltable and fk_rtable
        (for a candidate set) are never dropped, and the catalog properties
        of `df` are copied to the result.
    """
    if not isinstance(col_list, list):
        col_list = [col_list]

    if not cm.is_dfinfo_present(df):
        # No metadata to protect: simply drop the columns. (This branch
        # used to return `df[col_list]`, i.e. it kept exactly the columns
        # that were supposed to be removed.)
        return df.drop(col_list, axis=1)

    if _is_table_or_candset(df):
        if _is_table(df):
            protected = [cm.get_key(df)]
        else:
            key, fk_ltable, fk_rtable, _ltable, _rtable, _l_key, _r_key = \
                cm.get_metadata_for_candset(df, logger, False)
            protected = [key, fk_ltable, fk_rtable]
        # Never drop the attributes the metadata refers to.
        col_list = gh.list_diff(col_list, protected)
        col_list = gh.list_drop_duplicates(col_list)

    new_df = df.drop(col_list, axis=1)
    cm.init_properties(new_df)
    cm.copy_properties(df, new_df)
    return new_df
def drop_cols(df, col_list):
    """
    Drop columns from a DataFrame while protecting catalog key attributes.

    Args:
        df (DataFrame): The input DataFrame.
        col_list (list or single column name): The column(s) to drop. A
            non-list value is treated as a single column name.

    Returns:
        A new DataFrame without the requested columns. If `df` has catalog
        information, its key (for a table) or key, fk_ltable and fk_rtable
        (for a candidate set) are never dropped, and the catalog properties
        of `df` are copied to the result.
    """
    if not isinstance(col_list, list):
        col_list = [col_list]

    if not cm.is_dfinfo_present(df):
        # No metadata to protect: simply drop the columns. (This branch
        # used to return `df[col_list]`, i.e. it kept exactly the columns
        # that were supposed to be removed.)
        return df.drop(col_list, axis=1)

    if _is_table_or_candset(df):
        if _is_table(df):
            protected = [cm.get_key(df)]
        else:
            key, fk_ltable, fk_rtable, _ltable, _rtable, _l_key, _r_key = \
                cm.get_metadata_for_candset(df, logger, False)
            protected = [key, fk_ltable, fk_rtable]
        # Never drop the attributes the metadata refers to.
        col_list = gh.list_diff(col_list, protected)
        col_list = gh.list_drop_duplicates(col_list)

    new_df = df.drop(col_list, axis=1)
    cm.init_properties(new_df)
    cm.copy_properties(df, new_df)
    return new_df
def preserve_metadata(df, new_df):
    """
    Copy the catalog properties of `df` to `new_df` when that is safe.

    Args:
        df (DataFrame): The DataFrame whose metadata should be preserved.
        new_df (DataFrame): The DataFrame that should receive the metadata.

    Returns:
        `new_df`. Its properties are initialised and copied from `df` only
        if `df` has catalog information and, when `df` is a table or a
        candidate set, `new_df` still contains the key attribute(s) (key
        for a table; key, fk_ltable and fk_rtable for a candidate set).
        Otherwise a warning is logged and `new_df` is returned untouched.
    """
    # No catalog entry for the source: nothing to preserve.
    if not cm.is_dfinfo_present(df):
        return new_df

    if _is_table_or_candset(df):
        if _is_table(df):
            required_attrs = [cm.get_key(df)]
        else:
            key, fk_ltable, fk_rtable, _ltable, _rtable, _l_key, _r_key = \
                cm.get_metadata_for_candset(df, logger, False)
            required_attrs = [key, fk_ltable, fk_rtable]

        # The metadata would dangle if any of these columns were gone.
        if not ch.check_attrs_present(new_df, required_attrs):
            logger.warning('Not setting the metadata as some attrs '
                           'are not present')
            return new_df

    cm.init_properties(new_df)
    cm.copy_properties(df, new_df)
    return new_df
def _index_candidate_set(candidate_set, lrecord_id_to_index_map, rrecord_id_to_index_map, verbose): new_formatted_candidate_set = set() if len(candidate_set) == 0: return new_formatted_candidate_set # Get metadata key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key =\ cm.get_metadata_for_candset(candidate_set, logger, verbose) # validate metadata cm._validate_metadata_for_candset(candidate_set, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, logger, verbose) ltable_key_data = list(candidate_set[fk_ltable]) rtable_key_data = list(candidate_set[fk_rtable]) for i in range(len(ltable_key_data)): new_formatted_candidate_set.add((lrecord_id_to_index_map[ltable_key_data[i]], rrecord_id_to_index_map[rtable_key_data[i]])) return new_formatted_candidate_set
def _index_candidate_set(candidate_set, lrecord_id_to_index_map, rrecord_id_to_index_map, verbose): new_formatted_candidate_set = set() if len(candidate_set) == 0: return new_formatted_candidate_set # Get metadata key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key =\ cm.get_metadata_for_candset(candidate_set, logger, verbose) # validate metadata cm._validate_metadata_for_candset(candidate_set, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, logger, verbose) ltable_key_data = list(candidate_set[fk_ltable]) rtable_key_data = list(candidate_set[fk_rtable]) for i in range(len(ltable_key_data)): new_formatted_candidate_set.add((lrecord_id_to_index_map[ltable_key_data[i]], rrecord_id_to_index_map[rtable_key_data[i]])) return new_formatted_candidate_set
def _validate_inputs(table, label_column_name, verbose):
    """
    This function validates the inputs for the label_table function.

    Args:
        table (DataFrame): The table that is going to be labeled.
        label_column_name (string): Name of the label column to be added; it
            must not already be present in `table`.
        verbose (boolean): A flag to indicate whether progress messages
            should be logged.

    Returns:
        True if every check passed.

    Raises:
        AssertionError: If `label_column_name` is already present in
            `table`. The type checks are delegated to
            `validate_object_type` (presumably raising AssertionError as
            well -- confirm against that helper).
    """
    # Validate the input parameters

    # # The input table is expected to be of type pandas DataFrame
    validate_object_type(table, pd.DataFrame)

    # # The label column name is expected to be of type string
    validate_object_type(label_column_name, six.string_types,
                         error_prefix='Input attr.')

    # # Check if the label column name is already present in the input table
    if ch.check_attrs_present(table, label_column_name):
        logger.error('The label column name (%s) is already present in the '
                     'input table', label_column_name)
        # Exceptions do not apply %-formatting to their arguments, so the
        # message has to be interpolated explicitly.
        raise AssertionError('The label column name (%s) is already present '
                             'in the input table' % label_column_name)

    # Now, validate the metadata for the input DataFrame as we have to copy
    # these properties to the output DataFrame

    # # First, display what metadata is required for this function
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, ltable, rtable, ltable key, rtable key',
                verbose)

    # # Second, get the metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # # Third, validate the metadata
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Return True if everything was successful
    return True
def project_cols(df, col_list):
    """
    Project a DataFrame onto some columns, always keeping catalog keys.

    Args:
        df (DataFrame): The input DataFrame.
        col_list (list or single column name): The column(s) to keep. A
            non-list value is treated as a single column name.

    Returns:
        A new DataFrame with the requested columns. If `df` has catalog
        information, its key (for a table) or key, fk_ltable and fk_rtable
        (for a candidate set) are placed in front of the requested columns
        (duplicates removed), and the catalog properties of `df` are copied
        to the result.
    """
    if not isinstance(col_list, list):
        col_list = [col_list]

    # Without catalog information this is a plain projection.
    if not cm.is_dfinfo_present(df):
        return df[col_list]

    if _is_table_or_candset(df):
        if _is_table(df):
            leading_cols = [cm.get_key(df)]
        else:
            key, fk_ltable, fk_rtable, _ltable, _rtable, _l_key, _r_key = \
                cm.get_metadata_for_candset(df, logger, False)
            leading_cols = [key, fk_ltable, fk_rtable]
        # Key attributes first, then whatever the caller asked for.
        col_list = gh.list_drop_duplicates(leading_cols + col_list)

    projected = df[col_list]
    cm.init_properties(projected)
    cm.copy_properties(df, projected)
    return projected
def project_cols(df, col_list):
    """
    Project a DataFrame onto some columns, always keeping catalog keys.

    Args:
        df (DataFrame): The input DataFrame.
        col_list (list or single column name): The column(s) to keep. A
            non-list value is treated as a single column name.

    Returns:
        A new DataFrame with the requested columns. If `df` has catalog
        information, its key (for a table) or key, fk_ltable and fk_rtable
        (for a candidate set) are placed in front of the requested columns
        (duplicates removed), and the catalog properties of `df` are copied
        to the result.
    """
    if not isinstance(col_list, list):
        col_list = [col_list]

    # Without catalog information this is a plain projection.
    if not cm.is_dfinfo_present(df):
        return df[col_list]

    if _is_table_or_candset(df):
        if _is_table(df):
            leading_cols = [cm.get_key(df)]
        else:
            key, fk_ltable, fk_rtable, _ltable, _rtable, _l_key, _r_key = \
                cm.get_metadata_for_candset(df, logger, False)
            leading_cols = [key, fk_ltable, fk_rtable]
        # Key attributes first, then whatever the caller asked for.
        col_list = gh.list_drop_duplicates(leading_cols + col_list)

    projected = df[col_list]
    cm.init_properties(projected)
    cm.copy_properties(df, projected)
    return projected
def extract_from(self, candset):
    """
    Compute the feature values for every tuple pair of a candidate set.

    The candidate set is split into `n_procs` chunks, each chunk is handed
    to `get_feature_vals_by_cand_split` through `Parallel`/`delayed`, and
    the per-chunk results are concatenated in chunk order.

    Args:
        candset (DataFrame): The candidate set; its metadata is read from
            the catalog.

    Returns:
        A single list holding the concatenated per-chunk results.
    """
    # Get metadata for candidate set
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, self.verbose)

    # Index the base tables by key for convenient lookups
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # Apply feature functions
    ch.log_info(logger, 'Applying feature functions', self.verbose)

    columns = list(candset.columns)
    left_fk_pos = columns.index(fk_ltable)
    right_fk_pos = columns.index(fk_rtable)

    n_procs = get_num_procs(self.n_jobs, len(candset))
    chunks = np.array_split(candset, n_procs)

    # The feature table is serialised once and shipped to every worker.
    serialized_features = cloudpickle.dumps(self.feature_table)

    last_chunk = len(chunks) - 1
    run_chunk = delayed(get_feature_vals_by_cand_split)
    tasks = (
        run_chunk(serialized_features, left_fk_pos, right_fk_pos,
                  l_df, r_df, chunk,
                  # progress is only requested for the final chunk
                  self.show_progress and pos == last_chunk)
        for pos, chunk in enumerate(chunks)
    )
    per_chunk_vals = Parallel(n_jobs=n_procs)(tasks)

    return sum(per_chunk_vals, [])
def extract_feature_vecs(candset, attrs_before=None, feature_table=None,
                         attrs_after=None, verbose=False,
                         show_progress=True):
    """
    This function extracts feature vectors from a DataFrame (typically a
    labeled candidate set).

    Specifically, this function uses feature table, ltable and rtable (that
    is present in the `candset`'s metadata) to extract feature vectors.

    Args:
        candset (DataFrame): The input candidate set for which the features
            vectors should be extracted.
        attrs_before (list): The list of attributes from the input candset,
            that should be added before the feature vectors (defaults to
            None).
        feature_table (DataFrame): A DataFrame containing a list of
            features that should be used to compute the feature vectors (
            defaults to None).
        attrs_after (list): The list of attributes from the input candset
            that should be added after the feature vectors (defaults to
            None).
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
        show_progress (boolean): A flag to indicate whether the progress of
            extracting feature vectors must be displayed (defaults to True).

    Returns:
        A pandas DataFrame containing feature vectors.

        The DataFrame will have metadata ltable and rtable, pointing
        to the same ltable and rtable as the input candset.

        Also, the output DataFrame will have three columns: key, foreign key
        ltable, foreign key rtable copied from input candset to the output
        DataFrame. These three columns precede the columns mentioned in
        `attrs_before`.

    Raises:
        AssertionError: If `candset` is not of type pandas
            DataFrame.
        AssertionError: If `attrs_before` has attributes that
            are not present in the input candset.
        AssertionError: If `attrs_after` has attributes that
            are not present in the input candset.
        AssertionError: If `feature_table` is set to None.
    """
    # Validate input parameters

    # # We expect the input candset to be of type pandas DataFrame.
    if not isinstance(candset, pd.DataFrame):
        logger.error('Input cand.set is not of type dataframe')
        raise AssertionError('Input cand.set is not of type dataframe')

    # # If the attrs_before is given, check if the attrs_before are present
    # in the input candset. (`is not None` rather than `!= None`, which is
    # evaluated element-wise for array-like arguments.)
    if attrs_before is not None:
        if not ch.check_attrs_present(candset, attrs_before):
            logger.error(
                'The attributes mentioned in attrs_before is not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_before is not present '
                'in the input table')

    # # If the attrs_after is given, check if the attrs_after are present
    # in the input candset
    if attrs_after is not None:
        if not ch.check_attrs_present(candset, attrs_after):
            logger.error(
                'The attributes mentioned in attrs_after is not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_after is not present '
                'in the input table')

    # We expect the feature table to be a valid object
    if feature_table is None:
        logger.error('Feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    # Do metadata checking
    # # Mention what metadata is required to the user
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, '
                        'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    ch.log_info(logger, 'Getting metadata from catalog', verbose)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # # Validate metadata
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Extract features

    # # Set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(candset))

    # # Apply feature functions
    feat_vals = []
    ch.log_info(logger, 'Applying feature functions', verbose)
    col_names = list(candset.columns)
    fk_ltable_idx = col_names.index(fk_ltable)
    fk_rtable_idx = col_names.index(fk_rtable)

    # Cache the base-table rows already fetched: the same tuple usually
    # takes part in many pairs.
    l_dict = {}
    r_dict = {}

    for row in candset.itertuples(index=False):
        if show_progress:
            prog_bar.update()

        fk_ltable_val = row[fk_ltable_idx]
        fk_rtable_val = row[fk_rtable_idx]

        # `.loc` is the label-based lookup; `DataFrame.ix` was removed in
        # pandas 1.0.
        if fk_ltable_val not in l_dict:
            l_dict[fk_ltable_val] = l_df.loc[fk_ltable_val]
        l_tuple = l_dict[fk_ltable_val]

        if fk_rtable_val not in r_dict:
            r_dict[fk_rtable_val] = r_df.loc[fk_rtable_val]
        r_tuple = r_dict[fk_rtable_val]

        feat_vals.append(apply_feat_fns(l_tuple, r_tuple, feature_table))

    # Construct output table
    feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values)

    # # Rearrange the feature names in the input feature table order
    feature_names = list(feature_table['feature_name'])
    feature_vectors = feature_vectors[feature_names]

    ch.log_info(logger, 'Constructing output table', verbose)

    # # Insert attrs_before
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        # Each attribute is inserted at position 0, so walking the list in
        # reverse leaves the attributes in the order they were given.
        attrs_before.reverse()
        for a in attrs_before:
            feature_vectors.insert(0, a, candset[a])

    # # Insert keys
    feature_vectors.insert(0, fk_rtable, candset[fk_rtable])
    feature_vectors.insert(0, fk_ltable, candset[fk_ltable])
    feature_vectors.insert(0, key, candset[key])

    # # Insert attrs_after
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        # NOTE(review): the list is reversed and then appended left to
        # right, so several attrs_after end up in reverse of the given
        # order. Kept as is to preserve the existing column layout.
        attrs_after.reverse()
        col_pos = len(feature_vectors.columns)
        for a in attrs_after:
            feature_vectors.insert(col_pos, a, candset[a])
            col_pos += 1

    # # Update the catalog
    cm.init_properties(feature_vectors)
    cm.copy_properties(candset, feature_vectors)

    # Finally, return the feature vectors
    return feature_vectors
def block_candset(self, candset, l_overlap_attr, r_overlap_attr,
                  rem_stop_words=False, q_val=None, word_level=True,
                  overlap_size=1, allow_missing=False,
                  verbose=False, show_progress=True, n_jobs=1):
    """Blocks an input candidate set of tuple pairs based on the overlap
       of token sets of attribute values.

    Finds tuple pairs from an input candidate set of tuple pairs such that
    the overlap between (a) the set of tokens obtained by tokenizing the
    value of attribute l_overlap_attr of the left tuple in a tuple pair,
    and (b) the set of tokens obtained by tokenizing the value of
    attribute r_overlap_attr of the right tuple in the tuple pair,
    is above a certain threshold.

    Args:
        candset (DataFrame): The input candidate set of tuple pairs.

        l_overlap_attr (string): The overlap attribute in left table.

        r_overlap_attr (string): The overlap attribute in right table.

        rem_stop_words (boolean): A flag to indicate whether stop words
                                  (e.g., a, an, the) should be removed
                                  from the token sets of the overlap
                                  attribute values (defaults to False).

        q_val (int): The value of q to use if the overlap attributes
                     values are to be tokenized as qgrams (defaults to
                     None).

        word_level (boolean): A flag to indicate whether the overlap
                              attributes should be tokenized as words
                              (i.e, using whitespace as delimiter)
                              (defaults to True).

        overlap_size (int): The minimum number of tokens that must
                            overlap (defaults to 1).

        allow_missing (boolean): A flag to indicate whether tuple pairs
                                 with missing value in at least one of the
                                 blocking attributes should be included in
                                 the output candidate set (defaults to
                                 False). If this flag is set to True, a
                                 tuple pair with missing value in either
                                 blocking attribute will be retained in the
                                 output candidate set.

        verbose (boolean): A flag to indicate whether the debug
                           information should be logged (defaults to
                           False).

        show_progress (boolean): A flag to indicate whether progress should
                                 be displayed to the user (defaults to
                                 True).

        n_jobs (int): The number of parallel jobs to be used for
                      computation (defaults to 1). If -1 all CPUs are
                      used. If 0 or 1, no parallel computation is used at
                      all, which is useful for debugging. For n_jobs below
                      -1, (n_cpus + 1 + n_jobs) are used (where n_cpus are
                      the total number of CPUs in the machine). Thus, for
                      n_jobs = -2, all CPUs but one are used. If
                      (n_cpus + 1 + n_jobs) is less than 1, then no
                      parallel computation is used (i.e., equivalent to
                      the default).

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `l_overlap_attr` is not of type string.
        AssertionError: If `r_overlap_attr` is not of type string.
        AssertionError: If `q_val` is not of type int.
        AssertionError: If `word_level` is not of type boolean.
        AssertionError: If `overlap_size` is not of type int.
        AssertionError: If `verbose` is not of type boolean.
        AssertionError: If `allow_missing` is not of type boolean.
        AssertionError: If `show_progress` is not of type boolean.
        AssertionError: If `n_jobs` is not of type int.
        AssertionError: If `l_overlap_attr` is not in the ltable columns.
        AssertionError: If `r_overlap_attr` is not in the rtable columns.
        SyntaxError: If `q_val` is set to a valid value and
            `word_level` is set to True.
        SyntaxError: If `q_val` is set to None and
            `word_level` is set to False.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> ob = em.OverlapBlocker()
        >>> C = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'])

        >>> D1 = ob.block_candset(C, 'name', 'name', allow_missing=True)
        # Include all possible tuple pairs with missing values
        >>> D2 = ob.block_candset(C, 'name', 'name', allow_missing=True)
        # Execute blocking using multiple cores
        >>> D3 = ob.block_candset(C, 'name', 'name', n_jobs=-1)
        # Use q-gram tokenizer
        >>> D4 = ob.block_candset(C, 'name', 'name', word_level=False, q_val=2)
    """
    # validate data types of standard input parameters
    self.validate_types_params_candset(candset, verbose, show_progress,
                                       n_jobs)

    # validate data types of input parameters specific to overlap blocker
    self.validate_types_other_params(l_overlap_attr, r_overlap_attr,
                                     rem_stop_words, q_val,
                                     word_level, overlap_size)

    # get and validate metadata
    log_info(logger,
             'Required metadata: cand.set key, fk ltable, fk rtable, '
             'ltable, rtable, ltable key, rtable key', verbose)

    # # get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # # validate metadata
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # validate overlap attrs
    self.validate_overlap_attrs(ltable, rtable, l_overlap_attr,
                                r_overlap_attr)

    # validate word_level and q_val
    self.validate_word_level_qval(word_level, q_val)

    # do blocking

    # # do projection before merge. The projections are explicit copies so
    # # that the in-place conversions below neither trigger a
    # # SettingWithCopyWarning nor depend on the `is_copy` attribute, which
    # # pandas removed in 1.0.
    l_df = ltable[[l_key, l_overlap_attr]].copy()
    r_df = rtable[[r_key, r_overlap_attr]].copy()

    # # cast the overlap attribute to string if required.
    ssj.dataframe_column_to_str(l_df, l_overlap_attr, inplace=True)
    ssj.dataframe_column_to_str(r_df, r_overlap_attr, inplace=True)

    # # cleanup the tables from non-ascii characters, punctuations, and
    # # stop words
    self.cleanup_table(l_df, l_overlap_attr, rem_stop_words)
    self.cleanup_table(r_df, r_overlap_attr, rem_stop_words)

    # # determine which tokenizer to use
    if word_level:
        # # # create a whitespace tokenizer
        tokenizer = WhitespaceTokenizer(return_set=True)
    else:
        # # # create a qgram tokenizer
        tokenizer = QgramTokenizer(qval=q_val, return_set=True)

    # # create a filter for overlap similarity join
    overlap_filter = OverlapFilter(tokenizer, overlap_size,
                                   allow_missing=allow_missing)

    # # perform overlap similarity filtering of the candset
    out_table = overlap_filter.filter_candset(candset, fk_ltable, fk_rtable,
                                              l_df, r_df, l_key, r_key,
                                              l_overlap_attr, r_overlap_attr,
                                              n_jobs,
                                              show_progress=show_progress)

    # update catalog
    cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable,
                              ltable, rtable)

    # return candidate set
    return out_table
def split_train_test(labeled_data, train_proportion=0.5,
                     random_state=None, verbose=True):
    """
    Split the input data into train and test sets.

    This is a thin wrapper around scikit-learn's ``train_test_split`` that
    also copies the catalog metadata of the input table to both splits.

    Args:
        labeled_data (DataFrame): The input pandas DataFrame that needs to be
            split into train and test.
        train_proportion (float): A number between 0 and 1, indicating the
            proportion of tuples that should be included in the train split
            (defaults to 0.5).
        random_state (object): A number or random number object (as in
            scikit-learn), forwarded unchanged to ``train_test_split``.
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed.

    Returns:
        An ordered dictionary with two keys, 'train' and 'test' (in that
        order). The value for 'train' is a pandas DataFrame holding the
        tuples allocated to the train split based on `train_proportion`; the
        value for 'test' holds the remaining tuples. Both DataFrames get the
        same catalog properties as the input DataFrame.

    Raises:
        AssertionError: If `labeled_data` is not of type pandas DataFrame.
        AssertionError: If `train_proportion` is not between 0 and 1.
        AssertionError: If `labeled_data` is empty.

    Examples:
        >>> import py_entitymatching as em
        >>> # G is the labeled data or the feature vectors that should be split
        >>> train_test = em.split_train_test(G, train_proportion=0.5)
        >>> train, test = train_test['train'], train_test['test']
    """
    # Validate input parameters: the labeled data must be a pandas DataFrame.
    if not isinstance(labeled_data, pd.DataFrame):
        logger.error('Input table is not of type DataFrame')
        raise AssertionError('Input table is not of type DataFrame')

    ch.log_info(logger,
                'Required metadata: cand.set key, fk ltable, '
                'fk rtable, '
                'ltable, rtable, ltable key, rtable key', verbose)

    # Get and validate the candidate-set metadata from the catalog.
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(labeled_data, logger, verbose)
    cm._validate_metadata_for_candset(labeled_data, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    num_rows = len(labeled_data)

    # Raise explicitly instead of using `assert`, so the checks are not
    # stripped when Python runs with -O. The exception type is unchanged.
    if not 0 <= train_proportion <= 1:
        raise AssertionError(
            " Train proportion is expected to be between 0 and 1")
    if not num_rows > 0:
        raise AssertionError('The input table is empty')

    # Explicitly compute the train and test sizes in number of tuples.
    train_size = int(math.floor(num_rows * train_proportion))
    test_size = int(num_rows - train_size)

    # Split the index labels (not the rows) with scikit-learn. The copy keeps
    # the table's own index buffer out of scikit-learn's hands.
    idx_values = labeled_data.index.values.copy()
    idx_train, idx_test = ms.train_test_split(idx_values,
                                              test_size=test_size,
                                              train_size=train_size,
                                              random_state=random_state)

    # The split values are index labels, so select by label with `.loc`
    # (the old `.ix` indexer no longer exists in current pandas).
    label_train = labeled_data.loc[idx_train]
    label_test = labeled_data.loc[idx_test]

    # Update the catalog so both splits carry the input table's properties.
    cm.init_properties(label_train)
    cm.copy_properties(labeled_data, label_train)

    cm.init_properties(label_test)
    cm.copy_properties(labeled_data, label_test)

    # Return the splits keyed by name, train first.
    result = OrderedDict()
    result['train'] = label_train
    result['test'] = label_test
    return result
def dask_extract_feature_vecs(candset, attrs_before=None, feature_table=None,
                              attrs_after=None, verbose=False,
                              show_progress=True, n_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK

    Extract feature vectors from a DataFrame (typically a labeled candidate
    set).

    The function uses the feature table together with the ltable and rtable
    recorded in the `candset`'s metadata. The candidate set is split into
    chunks and the feature values for each chunk are computed as dask delayed
    tasks on the "processes" scheduler.

    Args:
        candset (DataFrame): The input candidate set for which the feature
            vectors should be extracted.
        attrs_before (list): The list of attributes from the input candset
            that should be added before the feature vectors (defaults to
            None).
        feature_table (DataFrame): A DataFrame containing a list of features
            that should be used to compute the feature vectors (defaults to
            None, which is rejected).
        attrs_after (list): The list of attributes from the input candset
            that should be added after the feature vectors (defaults to
            None).
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
        show_progress (boolean): A flag to indicate whether the progress of
            extracting feature vectors must be displayed (defaults to True).
        n_chunks (int): The number of partitions to split the candidate set
            (defaults to 1). If it is set to -1, the number of partitions
            will be set to the number of cores in the machine.

    Returns:
        A pandas DataFrame containing feature vectors, indexed like the input
        candset and carrying a copy of its catalog properties. Its first
        three columns are the key, foreign key ltable and foreign key rtable
        copied from the input candset, followed by the `attrs_before`
        columns, the feature columns (in feature-table order) and the
        `attrs_after` columns.

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `attrs_before` has attributes that are not present
            in the input candset.
        AssertionError: If `attrs_after` has attributes that are not present
            in the input candset.
        AssertionError: If `feature_table` is set to None.
        AssertionError: If `n_chunks` is not of type int.

    Examples:
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_extract_features import dask_extract_feature_vecs
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_f = em.get_features_for_matching(A, B)
        >>> # G is the labeled dataframe which should be converted into feature vectors
        >>> H = dask_extract_feature_vecs(G, feature_table=match_f, attrs_before=['title'], attrs_after=['gold_labels'])
    """
    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. "
        "USE AT YOUR OWN RISK.")

    # Validate input parameters.
    # The input candset must be a pandas DataFrame.
    validate_object_type(candset, pd.DataFrame, error_prefix='Input cand.set')

    # If given, attrs_before must be present in the input candset. Use an
    # identity check: `!= None` is evaluated element-wise for array-likes.
    if attrs_before is not None:
        if not ch.check_attrs_present(candset, attrs_before):
            logger.error(
                'The attributes mentioned in attrs_before is not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_before is not present '
                'in the input table')

    # If given, attrs_after must be present in the input candset.
    if attrs_after is not None:
        if not ch.check_attrs_present(candset, attrs_after):
            logger.error(
                'The attributes mentioned in attrs_after is not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_after is not present '
                'in the input table')

    # The feature table is mandatory even though its default is None.
    if feature_table is None:
        logger.error('Feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    # Tell the user which metadata is required, then fetch and validate it.
    ch.log_info(logger,
                'Required metadata: cand.set key, fk ltable, '
                'fk rtable, '
                'ltable, rtable, ltable key, rtable key', verbose)

    ch.log_info(logger, 'Getting metadata from catalog', verbose)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Index the base tables by their keys (keeping the key columns).
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # Apply the feature functions.
    ch.log_info(logger, 'Applying feature functions', verbose)
    col_names = list(candset.columns)
    fk_ltable_idx = col_names.index(fk_ltable)
    fk_rtable_idx = col_names.index(fk_rtable)

    validate_object_type(n_chunks, int, 'Parameter n_chunks')
    validate_chunks(n_chunks)

    n_chunks = get_num_partitions(n_chunks, len(candset))

    c_splits = np.array_split(candset, n_chunks)

    # Serialize the feature table once; every task receives the same bytes.
    pickled_obj = cloudpickle.dumps(feature_table)

    # One delayed task per chunk; per-task progress reporting is disabled
    # (last argument False).
    delayed_splits = [
        delayed(get_feature_vals_by_cand_split)(
            pickled_obj, fk_ltable_idx, fk_rtable_idx, l_df, r_df,
            c_split, False)
        for c_split in c_splits
    ]
    delayed_result = delayed(wrap)(delayed_splits)

    if show_progress:
        with ProgressBar():
            feat_vals_by_splits = delayed_result.compute(
                scheduler="processes", num_workers=get_num_cores())
    else:
        feat_vals_by_splits = delayed_result.compute(
            scheduler="processes", num_workers=get_num_cores())

    # Flatten the per-chunk results in chunk order (linear time, unlike
    # the repeated list concatenation of `sum(lists, [])`).
    feat_vals = [vals for split_vals in feat_vals_by_splits
                 for vals in split_vals]

    # Construct the output table, aligned on the candset's index.
    feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values)

    # Rearrange the feature columns into the input feature table order.
    feature_names = list(feature_table['feature_name'])
    feature_vectors = feature_vectors[feature_names]

    ch.log_info(logger, 'Constructing output table', verbose)

    # Insert attrs_before. Inserting the reversed list at position 0 leaves
    # the columns in the order the caller gave them.
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        attrs_before.reverse()
        for a in attrs_before:
            feature_vectors.insert(0, a, candset[a])

    # Insert the keys so the final order is key, fk_ltable, fk_rtable.
    feature_vectors.insert(0, fk_rtable, candset[fk_rtable])
    feature_vectors.insert(0, fk_ltable, candset[fk_ltable])
    feature_vectors.insert(0, key, candset[key])

    # Insert attrs_after at the end.
    # NOTE(review): the list is reversed and then appended left to right, so
    # the appended columns end up in reverse of the given order (presuming
    # gh.list_diff preserves order). Kept as is to avoid changing the output
    # column layout; confirm whether this is intended.
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        attrs_after.reverse()
        col_pos = len(feature_vectors.columns)
        for a in attrs_after:
            feature_vectors.insert(col_pos, a, candset[a])
            col_pos += 1

    # Update the catalog: the output inherits the candset's properties.
    cm.init_properties(feature_vectors)
    cm.copy_properties(candset, feature_vectors)

    # Finally, return the feature vectors.
    return feature_vectors
def block_candset(self, candset, verbose=True, show_progress=True,
                  n_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.

    Blocks an input candidate set of tuple pairs based on a black box
    blocking function specified by the user.

    Finds tuple pairs from an input candidate set of tuple pairs that
    survive the black box function. A tuple pair survives the black box
    blocking function if the function returns False for that pair,
    otherwise the tuple pair is dropped.

    Args:
        candset (DataFrame): The input candidate set of tuple pairs.
        verbose (boolean): A flag to indicate whether logging should be done
            (defaults to True).
        show_progress (boolean): A flag to indicate whether progress should
            be displayed to the user (defaults to True).
        n_chunks (int): The number of partitions to split the candidate set
            (defaults to 1). If it is set to -1, the number of partitions
            will be set to the number of cores in the machine.

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `verbose` is not of type boolean.
        AssertionError: If `n_chunks` is not of type int.
        AssertionError: If `show_progress` is not of type boolean.
        AssertionError: If the black box function has not been set.

    Examples:
        >>> def match_last_name(ltuple, rtuple):
        ...     # assume that there is a 'name' attribute in the input tables
        ...     # and each value in it has two words
        ...     l_last_name = ltuple['name'].split()[1]
        ...     r_last_name = rtuple['name'].split()[1]
        ...     if l_last_name != r_last_name:
        ...         return True
        ...     else:
        ...         return False
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_black_box_blocker import DaskBlackBoxBlocker
        >>> bb = DaskBlackBoxBlocker()
        >>> bb.set_black_box_function(match_last_name)
        >>> D = bb.block_candset(C) # C is an output from block_tables
    """
    # `pandas.np` was removed from pandas; use numpy directly. Imported
    # locally so this block does not depend on the module's import list.
    import numpy as np

    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. "
        "USE AT YOUR OWN RISK.")

    # Validate data types of standard input parameters.
    self.validate_types_params_candset(candset, verbose, show_progress,
                                       n_chunks)

    # Validate the black box function. Raised explicitly (rather than via
    # `assert`) so the check survives `python -O`.
    if self.black_box_function is None:
        raise AssertionError('Black box function is not set')

    # Get and validate metadata.
    log_info(logger,
             'Required metadata: cand.set key, fk ltable, fk rtable, '
             'ltable, rtable, ltable key, rtable key', verbose)

    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    validate_object_type(n_chunks, int, 'Parameter n_chunks')
    validate_chunks(n_chunks)

    # Index the base tables by their keys (keeping the key columns).
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # Project the candset down to the ID attributes only.
    c_df = candset[[key, fk_ltable, fk_rtable]]

    # Determine the number of partitions to process in parallel.
    n_chunks = get_num_partitions(n_chunks, len(candset))

    # Pickle the black-box function before handing it to
    # _block_candset_split, which may run in a child process.
    black_box_function_pkl = cp.dumps(self.black_box_function)

    if n_chunks == 1:
        # Single partition: run in-process.
        valid = _block_candset_split(c_df, l_df, r_df, l_key, r_key,
                                     fk_ltable, fk_rtable,
                                     black_box_function_pkl, show_progress)
    else:
        # Multiple partitions: one delayed task per chunk, with per-task
        # progress reporting disabled (last argument False).
        c_splits = np.array_split(c_df, n_chunks)
        delayed_splits = [
            delayed(_block_candset_split)(c_split, l_df, r_df, l_key, r_key,
                                          fk_ltable, fk_rtable,
                                          black_box_function_pkl, False)
            for c_split in c_splits
        ]
        delayed_result = delayed(wrap)(delayed_splits)
        if show_progress:
            with ProgressBar():
                valid_splits = delayed_result.compute(
                    scheduler="processes", num_workers=get_num_cores())
        else:
            valid_splits = delayed_result.compute(
                scheduler="processes", num_workers=get_num_cores())

        # Flatten the per-chunk flags in chunk order (linear time).
        valid = [flag for split_flags in valid_splits
                 for flag in split_flags]

    # Construct the output table: `valid` is used as a row mask on the
    # original candset, so all of its columns are kept.
    if len(c_df) > 0:
        c_df = candset[valid]
    else:
        c_df = pd.DataFrame(columns=candset.columns)

    # Update the catalog.
    cm.set_candset_properties(c_df, key, fk_ltable, fk_rtable, ltable, rtable)

    # Return the candidate set.
    return c_df
def predict(self, table=None, target_attr=None, append=False, inplace=True):
    """Predict interface for the matcher.

    Applies the matcher's rules to every tuple pair of the input candidate
    set (via `_predict_candset`) and either returns the predictions or
    stores them in a column of the table.

    Args:
        table (DataFrame): The input candidate set of type pandas DataFrame
            containing tuple pairs (defaults to None).
        target_attr (string): The attribute name where the predictions
            need to be stored in the input table (defaults to None).
        append (boolean): A flag to indicate whether the predictions need
            to be appended in the input DataFrame (defaults to False).
        inplace (boolean): A flag to indicate whether the append needs to be
            done inplace (defaults to True).

    Returns:
        The predictions as returned by `_predict_candset` when `target_attr`
        is None or `append` is False. Otherwise a DataFrame with the
        predictions stored under `target_attr`: the input table itself when
        `inplace` is True, a copy of it otherwise.

    Raises:
        AssertionError: If `target_attr` is neither None nor a string.
        AssertionError: If the matcher has no rules.
        SyntaxError: If `table` is None and the type validation did not
            already reject it.

    Examples:
        >>> import py_entitymatching as em
        >>> brm = em.BooleanRuleMatcher()
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='id')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='id')
        >>> match_f = em.get_features_for_matching(A, B)
        >>> rule = ['address_address_lev(ltuple, rtuple) > 6']
        >>> brm.add_rule(rule, match_f)
        >>> # The table S is a cand set generated by the blocking and then labeling phases
        >>> brm.predict(S, target_attr='pred_label', append=True)
    """
    # Validate input parameters.
    # The table is expected to be a pandas DataFrame.
    validate_object_type(table, pd.DataFrame, 'Input table')

    # target_attr must be a string when it is given.
    if target_attr is not None and not isinstance(target_attr, str):
        logger.error('Input target_attr must be a string.')
        raise AssertionError('Input target_attr must be a string.')

    # append and inplace are expected to be booleans.
    validate_object_type(append, bool, 'Input append')
    validate_object_type(inplace, bool, 'Input inplace')

    # Get and validate the candidate-set metadata (always non-verbose here).
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, False)

    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, False)

    # Validate that there are some rules.
    assert len(self.rules.keys()) > 0, 'There are no rules to apply'

    # Parse every conjunct against its rule's feature table so that a
    # reference to a missing feature surfaces before prediction starts.
    for rule in self.rule_conjunct_list:
        for conjunct in self.rule_conjunct_list[rule]:
            parse_conjunct(conjunct, self.rule_ft[rule])

    # NOTE(review): presumably unreachable, since validate_object_type above
    # is expected to reject None; kept as a defensive guard. Confirm.
    if table is None:
        raise SyntaxError(
            'The arguments supplied does not match the signatures '
            'supported !!!')

    y = self._predict_candset(table)

    # Without a target column, or when appending was not requested, hand
    # back the raw predictions.
    if target_attr is None or not append:
        return y

    # Store the predictions in the table itself or in a copy of it.
    result = table if inplace else table.copy()
    result[target_attr] = y
    return result
def sample_table(table, sample_size, replace=False, verbose=False):
    """
    Samples a candidate set of tuple pairs (for labeling purposes).

    This function samples a DataFrame, typically used for labeling
    purposes. It expects the input DataFrame to have the metadata of a
    candidate set (such as key, fk_ltable, fk_rtable, ltable, rtable) in the
    catalog. Rows are picked by uniform random sampling (numpy's
    ``random.choice`` over row positions) and returned in ascending
    row-position order. The properties of the input DataFrame are copied to
    the output DataFrame.

    Args:
        table (DataFrame): The input DataFrame to be sampled.
            Specifically, a DataFrame containing the metadata of a
            candidate set (such as key, fk_ltable, fk_rtable, ltable,
            rtable) in the catalog.
        sample_size (int): The number of samples to be picked from the input
            DataFrame.
        replace (boolean): A flag to indicate whether sampling should be
            done with replacement or not (defaults to False).
        verbose (boolean): A flag to indicate whether more detailed
            information about the execution steps should be printed out
            (defaults to False).

    Returns:
        A new DataFrame with 'sample_size' number of rows.

        Further, this function sets the output DataFrame's properties same
        as the input DataFrame.

    Raises:
        AssertionError: If `table` is not of type pandas DataFrame.
        AssertionError: If the size of `table` is 0.
        AssertionError: If the `sample_size` is greater than the input
            DataFrame size.

    Examples:
        >>> import py_entitymatching as em
        >>> S = em.sample_table(C, sample_size=450) # C is the candidate set to be sampled from.

    Note:
        When the replace flag is set to True, the output DataFrame can
        contain duplicate keys. In that case the key is set through
        ``cm.set_key`` (rather than copied blindly), which is expected to
        check its validity; if the key is not set, it is up to the user to
        fix it after the function returns.
    """
    # `pandas.np` was removed from pandas; use numpy directly. Imported
    # locally so this block does not depend on the module's import list.
    import numpy as np

    # Validate input parameters.
    # The input is expected to be a pandas DataFrame.
    validate_object_type(table, pd.DataFrame)

    # There must be at least one row to sample from.
    if len(table) == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')

    # The sample size must not exceed the number of rows in the input.
    if len(table) < sample_size:
        logger.error('Sample size is larger than the input table size')
        raise AssertionError('Sample size is larger than the input table size')

    # Validate the metadata of the input DataFrame, since its properties are
    # copied to the output DataFrame.
    # First, display what metadata is required for this function.
    ch.log_info(logger,
                'Required metadata: cand.set key, fk ltable, '
                'fk rtable, ltable, rtable, ltable key, rtable key', verbose)

    # Second, get the metadata.
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # Third, validate the metadata.
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Draw row positions, then sort them so the sample keeps the input's
    # relative row order.
    sample_indices = np.random.choice(len(table), sample_size,
                                      replace=replace)
    sampled_table = table.iloc[sorted(sample_indices)]

    # Copy the properties.
    cm.init_properties(sampled_table)

    if replace:
        # Sampling with replacement can duplicate key values, so route the
        # key through cm.set_key instead of copying it as-is.
        properties = cm.get_all_properties(table)
        for property_name, property_value in six.iteritems(properties):
            if property_name == 'key':
                cm.set_key(sampled_table, property_value)
            else:
                # Copy the other properties as is.
                cm.set_property(sampled_table, property_name, property_value)
    else:
        cm.copy_properties(table, sampled_table)

    # Return the sampled table.
    return sampled_table
def block_candset(self, candset, verbose=False, show_progress=True,
                  n_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK

    Blocks an input candidate set of tuple pairs based on a sequence of
    blocking rules supplied by the user.

    A tuple pair survives the sequence of blocking rules if none of the
    rules in the sequence returns True for that pair. If any of the rules
    returns True, then the pair is blocked (dropped). The actual filtering
    is delegated to `block_candset_excluding_rule` with no rule excluded.

    Args:
        candset (DataFrame): The input candidate set of tuple pairs.
        verbose (boolean): A flag to indicate whether the debug
            information should be logged (defaults to False).
        show_progress (boolean): A flag to indicate whether progress should
            be displayed to the user (defaults to True).
        n_chunks (int): The number of partitions to split the candidate set
            (defaults to 1). If it is set to -1, the number of partitions
            will be set to the number of cores in the machine.

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `verbose` is not of type boolean.
        AssertionError: If `n_chunks` is not of type int.
        AssertionError: If `show_progress` is not of type boolean.
        AssertionError: If there are no rules to apply.

    Examples:
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_rule_based_blocker import DaskRuleBasedBlocker
        >>> rb = DaskRuleBasedBlocker()
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='id')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='id')
        >>> block_f = em.get_features_for_blocking(A, B)
        >>> rule = ['name_name_lev(ltuple, rtuple) > 3']
        >>> rb.add_rule(rule, feature_table=block_f)
        >>> D = rb.block_candset(C) # C is the candidate set.
    """
    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. "
        "USE AT YOUR OWN RISK.")

    # Type-check the standard input parameters.
    self.validate_types_params_candset(candset, verbose, show_progress,
                                       n_chunks)

    # Announce, fetch and validate the candidate-set metadata.
    required_metadata_msg = ('Required metadata: cand.set key, fk ltable, '
                             'fk rtable, ltable, rtable, ltable key, '
                             'rtable key')
    log_info(logger, required_metadata_msg, verbose)

    metadata = cm.get_metadata_for_candset(candset, logger, verbose)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = metadata

    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # At least one blocking rule must have been registered.
    assert len(self.rules.keys()) > 0, 'There are no rules to apply'

    # Validate n_chunks, then resolve it to a concrete partition count.
    validate_object_type(n_chunks, int, 'Parameter n_chunks')
    validate_chunks(n_chunks)
    n_chunks = get_num_partitions(n_chunks, len(candset))

    # Key-indexed views of the base tables (key columns are kept).
    indexed_ltable = ltable.set_index(l_key, drop=False)
    indexed_rtable = rtable.set_index(r_key, drop=False)

    # Restrict both tables to the attributes selected for projection.
    l_proj_attrs, r_proj_attrs = self.get_attrs_to_project(l_key, r_key,
                                                           [], [])
    projected_ltable = indexed_ltable[l_proj_attrs]
    projected_rtable = indexed_rtable[r_proj_attrs]

    # Apply every rule (None means no rule is excluded).
    survivors = self.block_candset_excluding_rule(candset,
                                                  projected_ltable,
                                                  projected_rtable,
                                                  l_key, r_key,
                                                  fk_ltable, fk_rtable,
                                                  None, show_progress,
                                                  n_chunks)

    # Register the output candidate set in the catalog.
    cm.set_candset_properties(survivors, key, fk_ltable, fk_rtable,
                              ltable, rtable)

    return survivors
def block_candset(self, candset, verbose=True, show_progress=True, n_jobs=1):
    """
    Blocks an input candidate set of tuple pairs based on a black box
    blocking function specified by the user.

    Finds tuple pairs from an input candidate set of tuple pairs that
    survive the black box function. A tuple pair survives the black box
    blocking function if the function returns False for that pair,
    otherwise the tuple pair is dropped.

    Args:
        candset (DataFrame): The input candidate set of tuple pairs.
        verbose (boolean): A flag to indicate whether logging should be done
            (defaults to True).
        show_progress (boolean): A flag to indicate whether progress should
            be displayed to the user (defaults to True).
        n_jobs (int): The number of parallel jobs to be used for computation
            (defaults to 1). If -1 all CPUs are used. If 0 or 1,
            no parallel computation is used at all, which is useful for
            debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are
            used (where n_cpus is the total number of CPUs in the
            machine). Thus, for n_jobs = -2, all CPUs but one are used.
            If (n_cpus + 1 + n_jobs) is less than 1, then no parallel
            computation is used (i.e., equivalent to the default).

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `verbose` is not of type boolean.
        AssertionError: If `n_jobs` is not of type int.
        AssertionError: If `show_progress` is not of type boolean.
        AssertionError: If the black box function has not been set.
    """
    # `pandas.np` was removed from pandas; use numpy directly. Imported
    # locally so this block does not depend on the module's import list.
    import numpy as np

    # Validate data types of standard input parameters.
    self.validate_types_params_candset(candset, verbose, show_progress,
                                       n_jobs)

    # Validate the black box function. Raised explicitly (rather than via
    # `assert`) so the check survives `python -O`.
    if self.black_box_function is None:
        raise AssertionError('Black box function is not set')

    # Get and validate metadata.
    log_info(logger,
             'Required metadata: cand.set key, fk ltable, fk rtable, '
             'ltable, rtable, ltable key, rtable key', verbose)

    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Index the base tables by their keys (keeping the key columns).
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # Project the candset down to the ID attributes only.
    c_df = candset[[key, fk_ltable, fk_rtable]]

    # Determine the number of processes to launch in parallel.
    n_procs = self.get_num_procs(n_jobs, len(c_df))

    # Pickle the black-box function before handing it to
    # _block_candset_split, which may run in a child process.
    black_box_function_pkl = cp.dumps(self.black_box_function)

    if n_procs <= 1:
        # Single process.
        valid = _block_candset_split(c_df, l_df, r_df, l_key, r_key,
                                     fk_ltable, fk_rtable,
                                     black_box_function_pkl, show_progress)
    else:
        # Multiprocessing: one job per chunk. Only the job for the last
        # chunk is allowed to report progress.
        c_splits = np.array_split(c_df, n_procs)
        last_split = len(c_splits) - 1
        valid_splits = Parallel(n_jobs=n_procs)(
            delayed(_block_candset_split)(c_split, l_df, r_df, l_key, r_key,
                                          fk_ltable, fk_rtable,
                                          black_box_function_pkl,
                                          show_progress and i == last_split)
            for i, c_split in enumerate(c_splits))

        # Flatten the per-chunk flags in chunk order (linear time).
        valid = [flag for split_flags in valid_splits
                 for flag in split_flags]

    # Construct the output table: `valid` is used as a row mask on the
    # original candset, so all of its columns are kept.
    if len(c_df) > 0:
        c_df = candset[valid]
    else:
        c_df = pd.DataFrame(columns=candset.columns)

    # Update the catalog.
    cm.set_candset_properties(c_df, key, fk_ltable, fk_rtable, ltable, rtable)

    # Return the candidate set.
    return c_df
def block_candset(self, candset, l_overlap_attr, r_overlap_attr,
                  rem_stop_words=False, q_val=None, word_level=True,
                  overlap_size=1, allow_missing=False,
                  verbose=False, show_progress=True, n_jobs=1):
    """Blocks an input candidate set of tuple pairs based on the overlap
    of token sets of attribute values.

    Finds tuple pairs from an input candidate set of tuple pairs such that
    the overlap between (a) the set of tokens obtained by tokenizing the
    value of attribute l_overlap_attr of the left tuple in a tuple pair,
    and (b) the set of tokens obtained by tokenizing the value of
    attribute r_overlap_attr of the right tuple in the tuple pair,
    is above a certain threshold.

    Args:
        candset (DataFrame): The input candidate set of tuple pairs.
        l_overlap_attr (string): The overlap attribute in left table.
        r_overlap_attr (string): The overlap attribute in right table.
        rem_stop_words (boolean): A flag to indicate whether stop words
            (e.g., a, an, the) should be removed from the token sets of the
            overlap attribute values (defaults to False).
        q_val (int): The value of q to use if the overlap attributes
            values are to be tokenized as qgrams (defaults to None).
        word_level (boolean): A flag to indicate whether the overlap
            attributes should be tokenized as words (i.e, using whitespace
            as delimiter) (defaults to True).
        overlap_size (int): The minimum number of tokens that must
            overlap (defaults to 1).
        allow_missing (boolean): A flag to indicate whether tuple pairs
            with missing value in at least one of the blocking attributes
            should be included in the output candidate set (defaults to
            False). If this flag is set to True, a tuple pair with missing
            value in either blocking attribute will be retained in the
            output candidate set.
        verbose (boolean): A flag to indicate whether the debug information
            should be logged (defaults to False).
        show_progress (boolean): A flag to indicate whether progress should
            be displayed to the user (defaults to True).
        n_jobs (int): The number of parallel jobs to be used for computation
            (defaults to 1). If -1 all CPUs are used. If 0 or 1,
            no parallel computation is used at all, which is useful for
            debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are
            used (where n_cpus are the total number of CPUs in the
            machine). Thus, for n_jobs = -2, all CPUs but one are used.
            If (n_cpus + 1 + n_jobs) is less than 1, then no parallel
            computation is used (i.e., equivalent to the default).

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `l_overlap_attr` is not of type string.
        AssertionError: If `r_overlap_attr` is not of type string.
        AssertionError: If `q_val` is not of type int.
        AssertionError: If `word_level` is not of type boolean.
        AssertionError: If `overlap_size` is not of type int.
        AssertionError: If `verbose` is not of type boolean.
        AssertionError: If `allow_missing` is not of type boolean.
        AssertionError: If `show_progress` is not of type boolean.
        AssertionError: If `n_jobs` is not of type int.
        AssertionError: If `l_overlap_attr` is not in the ltable columns.
        AssertionError: If `r_overlap_attr` is not in the rtable columns.
        SyntaxError: If `q_val` is set to a valid value and
            `word_level` is set to True.
        SyntaxError: If `q_val` is set to None and
            `word_level` is set to False.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> ob = em.OverlapBlocker()
        >>> C = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'])
        >>> D1 = ob.block_candset(C, 'name', 'name')
        >>> # Include all possible tuple pairs with missing values
        >>> D2 = ob.block_candset(C, 'name', 'name', allow_missing=True)
        >>> # Execute blocking using multiple cores
        >>> D3 = ob.block_candset(C, 'name', 'name', n_jobs=-1)
        >>> # Use q-gram tokenizer
        >>> D4 = ob.block_candset(C, 'name', 'name', word_level=False, q_val=2)
    """
    # Validate data types of standard input parameters.
    self.validate_types_params_candset(candset, verbose, show_progress,
                                       n_jobs)

    # Validate data types of input parameters specific to overlap blocker.
    self.validate_types_other_params(l_overlap_attr, r_overlap_attr,
                                     rem_stop_words, q_val,
                                     word_level, overlap_size)

    # Get and validate metadata.
    log_info(logger,
             'Required metadata: cand.set key, fk ltable, fk rtable, '
             'ltable, rtable, ltable key, rtable key', verbose)

    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Validate overlap attrs.
    self.validate_overlap_attrs(ltable, rtable, l_overlap_attr,
                                r_overlap_attr)

    # Validate word_level and q_val.
    self.validate_word_level_qval(word_level, q_val)

    # Project the base tables down to key + overlap attribute. The
    # projections are modified in place below, so take explicit copies: this
    # keeps ltable/rtable untouched and avoids SettingWithCopyWarning without
    # relying on the `is_copy` attribute, which current pandas no longer
    # provides.
    l_df = ltable[[l_key, l_overlap_attr]].copy()
    r_df = rtable[[r_key, r_overlap_attr]].copy()

    # Cast the overlap attributes to string if required.
    ssj.dataframe_column_to_str(l_df, l_overlap_attr, inplace=True)
    ssj.dataframe_column_to_str(r_df, r_overlap_attr, inplace=True)

    # Clean the tables of non-ascii characters, punctuation and stop words.
    self.cleanup_table(l_df, l_overlap_attr, rem_stop_words)
    self.cleanup_table(r_df, r_overlap_attr, rem_stop_words)

    # Pick the tokenizer: whitespace-delimited words or q-grams.
    if word_level:
        tokenizer = WhitespaceTokenizer(return_set=True)
    else:
        tokenizer = QgramTokenizer(qval=q_val, return_set=True)

    # Create a filter for overlap similarity join.
    overlap_filter = OverlapFilter(tokenizer, overlap_size,
                                   allow_missing=allow_missing)

    # Perform overlap similarity filtering of the candset.
    out_table = overlap_filter.filter_candset(candset, fk_ltable, fk_rtable,
                                              l_df, r_df, l_key, r_key,
                                              l_overlap_attr, r_overlap_attr,
                                              n_jobs,
                                              show_progress=show_progress)

    # Update the catalog.
    cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable,
                              ltable, rtable)

    # Return the candidate set.
    return out_table
def block_candset(self, candset, l_block_attr, r_block_attr,
                  allow_missing=False, verbose=False, show_progress=True,
                  n_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.

    Blocks an input candidate set of tuple pairs based on attribute
    equivalence.

    Keeps only those tuple pairs of the input candidate set for which the
    value of `l_block_attr` in the left tuple exactly matches the value of
    `r_block_attr` in the right tuple.

    Args:
        candset (DataFrame): The input candidate set of tuple pairs.
        l_block_attr (string): The blocking attribute in left table.
        r_block_attr (string): The blocking attribute in right table.
        allow_missing (boolean): A flag to indicate whether tuple pairs
            with missing value in at least one of the blocking attributes
            should be included in the output candidate set (defaults to
            False). If this flag is set to True, a tuple pair with missing
            value in either blocking attribute will be retained in the
            output candidate set.
        verbose (boolean): A flag to indicate whether the debug information
            should be logged (defaults to False).
        show_progress (boolean): A flag to indicate whether progress should
            be displayed to the user (defaults to True).
        n_chunks (int): The number of partitions to split the candidate
            set. If it is set to -1, the number of partitions will be set
            to the number of cores in the machine.

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `l_block_attr` is not of type string.
        AssertionError: If `r_block_attr` is not of type string.
        AssertionError: If `verbose` is not of type boolean.
        AssertionError: If `n_chunks` is not of type int.
        AssertionError: If `l_block_attr` is not in the ltable columns.
        AssertionError: If `r_block_attr` is not in the rtable columns.

    Examples:
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_attr_equiv_blocker import DaskAttrEquivalenceBlocker
        >>> ab = DaskAttrEquivalenceBlocker()
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> C = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name'])
        >>> D1 = ab.block_candset(C, 'age', 'age', allow_missing=True)
        # Include all possible tuple pairs with missing values
        >>> D2 = ab.block_candset(C, 'age', 'age', allow_missing=True)
        # Execute blocking using multiple cores
        >>> D3 = ab.block_candset(C, 'age', 'age', n_chunks=-1)
    """
    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. "
        "USE AT YOUR OWN RISK.")

    # Type checks on the generic parameters and on the blocking attributes.
    self.validate_types_params_candset(candset, verbose, show_progress,
                                       n_chunks)
    self.validate_types_block_attrs(l_block_attr, r_block_attr)

    # Fetch the candidate-set metadata from the catalog and validate it.
    log_info(logger,
             'Required metadata: cand.set key, fk ltable, fk rtable, '
             'ltable, rtable, ltable key, rtable key',
             verbose)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # The blocking attributes must exist in the base tables.
    self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr)

    # Check the requested number of chunks.
    validate_object_type(n_chunks, int, 'Parameter n_chunks')
    validate_chunks(n_chunks)

    # Project each base table down to (key, blocking attribute) and index
    # it by key so rows can be looked up by the candidate set's foreign keys.
    l_df = ltable[[l_key, l_block_attr]].set_index(l_key, drop=False)
    r_df = rtable[[r_key, r_block_attr]].set_index(r_key, drop=False)

    # Resolve the actual number of partitions to use.
    num_partitions = get_num_partitions(n_chunks, len(candset))

    if num_partitions == 1:
        # Single partition: run directly in this process.
        valid = _block_candset_split(candset, l_df, r_df, l_key, r_key,
                                     l_block_attr, r_block_attr,
                                     fk_ltable, fk_rtable, allow_missing,
                                     show_progress)
    else:
        # One delayed task per partition. Per-task progress is switched off
        # because the Dask ProgressBar below reports overall progress.
        tasks = [
            delayed(_block_candset_split)(chunk, l_df, r_df, l_key, r_key,
                                          l_block_attr, r_block_attr,
                                          fk_ltable, fk_rtable,
                                          allow_missing, False)
            for chunk in pd.np.array_split(candset, num_partitions)
        ]
        graph = delayed(wrap)(tasks)
        if show_progress:
            with ProgressBar():
                per_chunk = graph.compute(scheduler="processes",
                                          num_workers=get_num_cores())
        else:
            per_chunk = graph.compute(scheduler="processes",
                                      num_workers=get_num_cores())
        # Flatten the per-partition results into one list.
        valid = sum(per_chunk, [])

    # Build the output table from the surviving rows.
    if len(candset) == 0:
        out_table = pd.DataFrame(columns=candset.columns)
    else:
        out_table = candset[valid]

    # Register the output candidate set in the catalog.
    cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable,
                              ltable, rtable)

    return out_table
def block_candset(self, candset, l_overlap_attr, r_overlap_attr,
                  rem_stop_words=False, q_val=None, word_level=True,
                  overlap_size=1, allow_missing=False, verbose=False,
                  show_progress=True, n_chunks=-1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.

    Blocks an input candidate set of tuple pairs based on the overlap
    of token sets of attribute values.

    Finds tuple pairs from an input candidate set of tuple pairs such that
    the overlap between (a) the set of tokens obtained by tokenizing the
    value of attribute l_overlap_attr of the left tuple in a tuple pair,
    and (b) the set of tokens obtained by tokenizing the value of
    attribute r_overlap_attr of the right tuple in the tuple pair,
    is above a certain threshold.

    Args:
        candset (DataFrame): The input candidate set of tuple pairs.
        l_overlap_attr (string): The overlap attribute in left table.
        r_overlap_attr (string): The overlap attribute in right table.
        rem_stop_words (boolean): A flag to indicate whether stop words
            (e.g., a, an, the) should be removed from the token sets of
            the overlap attribute values (defaults to False).
        q_val (int): The value of q to use if the overlap attributes
            values are to be tokenized as qgrams (defaults to None).
        word_level (boolean): A flag to indicate whether the overlap
            attributes should be tokenized as words (i.e, using whitespace
            as delimiter) (defaults to True).
        overlap_size (int): The minimum number of tokens that must
            overlap (defaults to 1).
        allow_missing (boolean): A flag to indicate whether tuple pairs
            with missing value in at least one of the blocking attributes
            should be included in the output candidate set (defaults to
            False). If this flag is set to True, a tuple pair with missing
            value in either blocking attribute will be retained in the
            output candidate set.
        verbose (boolean): A flag to indicate whether the debug
            information should be logged (defaults to False).
        show_progress (boolean): A flag to indicate whether progress
            should be displayed to the user (defaults to True).
        n_chunks (int): The number of partitions to split the candidate
            set. If it is set to -1, the number of partitions will be set
            to the number of cores in the machine.

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `l_overlap_attr` is not of type string.
        AssertionError: If `r_overlap_attr` is not of type string.
        AssertionError: If `q_val` is not of type int.
        AssertionError: If `word_level` is not of type boolean.
        AssertionError: If `overlap_size` is not of type int.
        AssertionError: If `verbose` is not of type boolean.
        AssertionError: If `allow_missing` is not of type boolean.
        AssertionError: If `show_progress` is not of type boolean.
        AssertionError: If `n_chunks` is not of type int.
        AssertionError: If `l_overlap_attr` is not in the ltable columns.
        AssertionError: If `r_overlap_attr` is not in the rtable columns.
        SyntaxError: If `q_val` is set to a valid value and
            `word_level` is set to True.
        SyntaxError: If `q_val` is set to None and
            `word_level` is set to False.

    Examples:
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_overlap_blocker import DaskOverlapBlocker
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> ob = DaskOverlapBlocker()
        >>> C = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'])
        >>> D1 = ob.block_candset(C, 'name', 'name', allow_missing=True)
        # Include all possible tuple pairs with missing values
        >>> D2 = ob.block_candset(C, 'name', 'name', allow_missing=True)
        # Execute blocking using multiple cores
        >>> D3 = ob.block_candset(C, 'name', 'name', n_chunks=-1)
        # Use q-gram tokenizer
        >>> D2 = ob.block_candset(C, 'name', 'name', word_level=False, q_val=2)
    """
    logger.warning(
        "WARNING THIS BLOCKER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN "
        "RISK.")

    # Validate input parameters
    self.validate_types_params_candset(candset, verbose, show_progress,
                                       n_chunks)
    self.validate_types_other_params(l_overlap_attr, r_overlap_attr,
                                     rem_stop_words, q_val, word_level,
                                     overlap_size)

    # get and validate metadata
    log_info(logger,
             'Required metadata: cand.set key, fk ltable, fk rtable, '
             'ltable, rtable, ltable key, rtable key', verbose)

    # # get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # # validate metadata
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # validate overlap attrs
    self.validate_overlap_attrs(ltable, rtable, l_overlap_attr,
                                r_overlap_attr)

    # validate word_level and q_val
    self.validate_word_level_qval(word_level, q_val)

    # validate number of chunks
    validate_object_type(n_chunks, int, 'Parameter n_chunks')
    validate_chunks(n_chunks)

    # # do projection before merge
    l_df = ltable[[l_key, l_overlap_attr]]
    r_df = rtable[[r_key, r_overlap_attr]]

    # # set index for convenience
    l_df = l_df.set_index(l_key, drop=False)
    r_df = r_df.set_index(r_key, drop=False)

    # # cast the overlap attribute to string if required.
    l_df.is_copy, r_df.is_copy = False, False  # to avoid setwithcopy warning
    ssj.dataframe_column_to_str(l_df, l_overlap_attr, inplace=True)
    ssj.dataframe_column_to_str(r_df, r_overlap_attr, inplace=True)

    # # determine which tokenizer to use
    if word_level:
        tokenizer = WhitespaceTokenizer(return_set=True)
    else:
        # Pass the caller's q_val through. Without it the tokenizer would
        # silently use its own default q and ignore the validated argument.
        tokenizer = QgramTokenizer(qval=q_val, return_set=True)

    n_chunks = get_num_partitions(n_chunks, len(candset))
    c_splits = pd.np.array_split(candset, n_chunks)

    # Create DAG: one delayed blocking task per candidate-set partition
    valid_splits = [
        delayed(self._block_candset_split)(c_split, l_df, r_df, l_key,
                                           r_key, l_overlap_attr,
                                           r_overlap_attr, fk_ltable,
                                           fk_rtable, allow_missing,
                                           rem_stop_words, tokenizer,
                                           overlap_size)
        for c_split in c_splits
    ]
    valid_splits = delayed(wrap)(valid_splits)

    # Execute the DAG
    if show_progress:
        with ProgressBar():
            valid_splits = valid_splits.compute(scheduler="processes",
                                                num_workers=get_num_cores())
    else:
        valid_splits = valid_splits.compute(scheduler="processes",
                                            num_workers=get_num_cores())

    # Flatten the per-partition results into one list
    valid = sum(valid_splits, [])

    # construct output table
    if len(candset) > 0:
        out_table = candset[valid]
    else:
        out_table = pd.DataFrame(columns=candset.columns)

    # update the catalog
    cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable,
                              ltable, rtable)

    # return the output table
    return out_table
def sample_table(table, sample_size, replace=False, verbose=False):
    """
    Samples a candidate set of tuple pairs (for labeling purposes).

    The input DataFrame is expected to carry candidate-set metadata (key,
    fk_ltable, fk_rtable, ltable, rtable) in the catalog. Rows are drawn
    by uniform random sampling (numpy's `random.choice`), returned in
    increasing positional order, and the input's catalog properties are
    transferred to the sampled DataFrame.

    Args:
        table (DataFrame): The input DataFrame to be sampled.
            Specifically, a DataFrame containing the metadata of a
            candidate set (such as key, fk_ltable, fk_rtable, ltable,
            rtable) in the catalog.
        sample_size (int): The number of samples to be picked from the
            input DataFrame.
        replace (boolean): A flag to indicate whether sampling should be
            done with replacement or not (defaults to False).
        verbose (boolean): A flag to indicate whether more detailed
            information about the execution steps should be printed out
            (defaults to False).

    Returns:
        A new DataFrame with 'sample_size' number of rows.

        Further, this function sets the output DataFrame's properties
        same as input DataFrame.

    Raises:
        AssertionError: If `table` is not of type pandas DataFrame.
        AssertionError: If the size of `table` is 0.
        AssertionError: If the `sample_size` is greater than the input
            DataFrame size.

    Examples:
        >>> import py_entitymatching as em
        >>> S = em.sample_table(C, sample_size=450) # C is the candidate set to be sampled from.

    Note:
        As mentioned in the above description, the output DataFrame is
        updated (in the catalog) with the properties from the input
        DataFrame. A subtle point to note here is, when the replace flag
        is set to True, then the output DataFrame can contain duplicate
        keys. In that case, this function will not set the key and it is
        up to the user to fix it after the function returns.
    """
    # The input must be a pandas DataFrame.
    validate_object_type(table, pd.DataFrame)

    num_rows = len(table)

    # There must be at least one row to sample from.
    if num_rows == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')

    # The sample cannot be larger than the table.
    if num_rows < sample_size:
        logger.error('Sample size is larger than the input table size')
        raise AssertionError('Sample size is larger than the input table size')

    # The metadata is needed because it is copied to the output below.
    ch.log_info(logger,
                'Required metadata: cand.set key, fk ltable, fk rtable, '
                'ltable, rtable, ltable key, rtable key',
                verbose)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Draw row positions at random and take them in increasing order.
    chosen_positions = sorted(
        np.random.choice(num_rows, sample_size, replace=replace))
    sampled_table = table.iloc[chosen_positions]

    # Register the sample in the catalog.
    cm.init_properties(sampled_table)

    if not replace:
        # Without replacement the properties are copied wholesale.
        cm.copy_properties(table, sampled_table)
        return sampled_table

    # With replacement the sample may contain duplicate keys, so the key is
    # set through set_key while every other property is copied as is.
    for prop_name, prop_value in six.iteritems(cm.get_all_properties(table)):
        if prop_name == 'key':
            cm.set_key(sampled_table, prop_value)
        else:
            cm.set_property(sampled_table, prop_name, prop_value)

    return sampled_table
def predict(self, table=None, target_attr=None, append=False, inplace=True):
    """Predict interface for the matcher.

    Applies the matcher's rules to every tuple pair of the input candidate
    set and returns the predictions, optionally storing them in a column
    of the table.

    Args:
        table (DataFrame): The input candidate set of type pandas
            DataFrame containing tuple pairs (defaults to None).
        target_attr (string): The attribute name where the predictions
            need to be stored in the input table (defaults to None).
        append (boolean): A flag to indicate whether the predictions need
            to be appended in the input DataFrame (defaults to False).
        inplace (boolean): A flag to indicate whether the append needs to
            be done inplace (defaults to True).

    Returns:
        An array of predictions or a DataFrame with predictions updated.
        A DataFrame is returned only when `target_attr` is given and
        `append` is True: the input table itself if `inplace` is True,
        otherwise a copy of it.

    Raises:
        AssertionError: If `target_attr` is neither None nor a string.
        AssertionError: If there are no rules to apply.
        SyntaxError: If `table` is None at the point of prediction.

    Examples:
        >>> import py_entitymatching as em
        >>> brm = em.BooleanRuleMatcher()
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='id')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='id')
        >>> match_f = em.get_features_for_matching(A, B)
        >>> rule = ['address_address_lev(ltuple, rtuple) > 6']
        >>> brm.add_rule(rule, match_f)
        >>> # The table S is a cand set generated by the blocking and then labeling phases
        >>> brm.predict(S, target_attr='pred_label', append=True)
    """
    # Validate input parameters
    # # We expect the table to be of type pandas DataFrame
    validate_object_type(table, pd.DataFrame, 'Input table')

    # # We expect the target_attr to be of type string if not None
    if target_attr is not None and not isinstance(target_attr, str):
        logger.error('Input target_attr must be a string.')
        raise AssertionError('Input target_attr must be a string.')

    # # We expect the append to be of type boolean
    validate_object_type(append, bool, 'Input append')

    # # We expect the inplace to be of type boolean
    validate_object_type(inplace, bool, 'Input inplace')

    # # get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, False)

    # # validate metadata
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, False)

    # Validate that there are some rules. Raised explicitly (not via
    # `assert`) so the check still runs when Python is started with -O.
    if len(self.rules.keys()) == 0:
        raise AssertionError('There are no rules to apply')

    # Parse conjuncts to validate that the features are in the feature table
    for rule in self.rule_conjunct_list:
        for conjunct in self.rule_conjunct_list[rule]:
            parse_conjunct(conjunct, self.rule_ft[rule])

    # NOTE(review): the type validation above presumably rejects a None
    # table already, which would make this guard unreachable - confirm
    # before removing it.
    if table is None:
        raise SyntaxError(
            'The arguments supplied does not match the signatures supported !!!'
        )

    y = self._predict_candset(table)

    # Without a target column (or with append off) only the raw
    # predictions are returned.
    if target_attr is None or not append:
        return y

    if inplace:
        table[target_attr] = y
        return table

    tbl = table.copy()
    tbl[target_attr] = y
    return tbl
def block_candset(self, candset, l_overlap_attr, r_overlap_attr,
                  rem_stop_words=False, q_val=None, word_level=True,
                  overlap_size=1, allow_missing=False, verbose=False,
                  show_progress=True, n_chunks=-1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.

    Blocks an input candidate set of tuple pairs based on the overlap
    of token sets of attribute values.

    Finds tuple pairs from an input candidate set of tuple pairs such that
    the overlap between (a) the set of tokens obtained by tokenizing the
    value of attribute l_overlap_attr of the left tuple in a tuple pair,
    and (b) the set of tokens obtained by tokenizing the value of
    attribute r_overlap_attr of the right tuple in the tuple pair,
    is above a certain threshold.

    Args:
        candset (DataFrame): The input candidate set of tuple pairs.
        l_overlap_attr (string): The overlap attribute in left table.
        r_overlap_attr (string): The overlap attribute in right table.
        rem_stop_words (boolean): A flag to indicate whether stop words
            (e.g., a, an, the) should be removed from the token sets of
            the overlap attribute values (defaults to False).
        q_val (int): The value of q to use if the overlap attributes
            values are to be tokenized as qgrams (defaults to None).
        word_level (boolean): A flag to indicate whether the overlap
            attributes should be tokenized as words (i.e, using whitespace
            as delimiter) (defaults to True).
        overlap_size (int): The minimum number of tokens that must
            overlap (defaults to 1).
        allow_missing (boolean): A flag to indicate whether tuple pairs
            with missing value in at least one of the blocking attributes
            should be included in the output candidate set (defaults to
            False). If this flag is set to True, a tuple pair with missing
            value in either blocking attribute will be retained in the
            output candidate set.
        verbose (boolean): A flag to indicate whether the debug
            information should be logged (defaults to False).
        show_progress (boolean): A flag to indicate whether progress
            should be displayed to the user (defaults to True).
        n_chunks (int): The number of partitions to split the candidate
            set. If it is set to -1, the number of partitions will be set
            to the number of cores in the machine.

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `l_overlap_attr` is not of type string.
        AssertionError: If `r_overlap_attr` is not of type string.
        AssertionError: If `q_val` is not of type int.
        AssertionError: If `word_level` is not of type boolean.
        AssertionError: If `overlap_size` is not of type int.
        AssertionError: If `verbose` is not of type boolean.
        AssertionError: If `allow_missing` is not of type boolean.
        AssertionError: If `show_progress` is not of type boolean.
        AssertionError: If `n_chunks` is not of type int.
        AssertionError: If `l_overlap_attr` is not in the ltable columns.
        AssertionError: If `r_overlap_attr` is not in the rtable columns.
        SyntaxError: If `q_val` is set to a valid value and
            `word_level` is set to True.
        SyntaxError: If `q_val` is set to None and
            `word_level` is set to False.

    Examples:
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_overlap_blocker import DaskOverlapBlocker
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> ob = DaskOverlapBlocker()
        >>> C = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'])
        >>> D1 = ob.block_candset(C, 'name', 'name', allow_missing=True)
        # Include all possible tuple pairs with missing values
        >>> D2 = ob.block_candset(C, 'name', 'name', allow_missing=True)
        # Execute blocking using multiple cores
        >>> D3 = ob.block_candset(C, 'name', 'name', n_chunks=-1)
        # Use q-gram tokenizer
        >>> D2 = ob.block_candset(C, 'name', 'name', word_level=False, q_val=2)
    """
    logger.warning(
        "WARNING THIS BLOCKER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN "
        "RISK.")

    # Validate input parameters
    self.validate_types_params_candset(candset, verbose, show_progress,
                                       n_chunks)
    self.validate_types_other_params(l_overlap_attr, r_overlap_attr,
                                     rem_stop_words, q_val, word_level,
                                     overlap_size)

    # get and validate metadata
    log_info(logger,
             'Required metadata: cand.set key, fk ltable, fk rtable, '
             'ltable, rtable, ltable key, rtable key', verbose)

    # # get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # # validate metadata
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # validate overlap attrs
    self.validate_overlap_attrs(ltable, rtable, l_overlap_attr,
                                r_overlap_attr)

    # validate word_level and q_val
    self.validate_word_level_qval(word_level, q_val)

    # validate number of chunks
    validate_object_type(n_chunks, int, 'Parameter n_chunks')
    validate_chunks(n_chunks)

    # # do projection before merge
    l_df = ltable[[l_key, l_overlap_attr]]
    r_df = rtable[[r_key, r_overlap_attr]]

    # # set index for convenience
    l_df = l_df.set_index(l_key, drop=False)
    r_df = r_df.set_index(r_key, drop=False)

    # # cast the overlap attribute to string if required.
    l_df.is_copy, r_df.is_copy = False, False  # to avoid setwithcopy warning
    ssj.dataframe_column_to_str(l_df, l_overlap_attr, inplace=True)
    ssj.dataframe_column_to_str(r_df, r_overlap_attr, inplace=True)

    # # determine which tokenizer to use
    if word_level:
        tokenizer = WhitespaceTokenizer(return_set=True)
    else:
        # Pass the caller's q_val through. Without it the tokenizer would
        # silently use its own default q and ignore the validated argument.
        tokenizer = QgramTokenizer(qval=q_val, return_set=True)

    n_chunks = get_num_partitions(n_chunks, len(candset))
    c_splits = pd.np.array_split(candset, n_chunks)

    # Create DAG: one delayed blocking task per candidate-set partition
    valid_splits = [
        delayed(self._block_candset_split)(c_split, l_df, r_df, l_key,
                                           r_key, l_overlap_attr,
                                           r_overlap_attr, fk_ltable,
                                           fk_rtable, allow_missing,
                                           rem_stop_words, tokenizer,
                                           overlap_size)
        for c_split in c_splits
    ]
    valid_splits = delayed(wrap)(valid_splits)

    # Execute the DAG
    if show_progress:
        with ProgressBar():
            valid_splits = valid_splits.compute(scheduler="processes",
                                                num_workers=get_num_cores())
    else:
        valid_splits = valid_splits.compute(scheduler="processes",
                                            num_workers=get_num_cores())

    # Flatten the per-partition results into one list
    valid = sum(valid_splits, [])

    # construct output table
    if len(candset) > 0:
        out_table = candset[valid]
    else:
        out_table = pd.DataFrame(columns=candset.columns)

    # update the catalog
    cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable,
                              ltable, rtable)

    # return the output table
    return out_table
def test_get_metadata_for_candset_invalid_df(self):
    """Call get_metadata_for_candset with None in place of the candidate set.

    NOTE(review): presumably an exception is expected here and asserted by
    a decorator or test harness that is not visible in this block - confirm.
    """
    invalid_candset = None
    no_logger = None
    cm.get_metadata_for_candset(invalid_candset, no_logger, False)
def block_candset(self, candset, verbose=False, show_progress=True,
                  n_jobs=1):
    """
    Blocks an input candidate set of tuple pairs based on a sequence of
    blocking rules supplied by the user.

    Finds tuple pairs from an input candidate set of tuple pairs that
    survive the sequence of blocking rules. A tuple pair survives the
    sequence of blocking rules if none of the rules in the sequence returns
    True for that pair. If any of the rules returns True, then the pair is
    blocked (dropped).

    Args:
        candset (DataFrame): The input candidate set of tuple pairs.
        verbose (boolean): A flag to indicate whether the debug
            information should be logged (defaults to False).
        show_progress (boolean): A flag to indicate whether progress
            should be displayed to the user (defaults to True).
        n_jobs (int): The number of parallel jobs to be used for
            computation (defaults to 1). If -1 all CPUs are used. If 0 or
            1, no parallel computation is used at all, which is useful for
            debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used
            (where n_cpus are the total number of CPUs in the machine).
            Thus, for n_jobs = -2, all CPUs but one are used. If
            (n_cpus + 1 + n_jobs) is less than 1, then no parallel
            computation is used (i.e., equivalent to the default).

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `verbose` is not of type boolean.
        AssertionError: If `n_jobs` is not of type int.
        AssertionError: If `show_progress` is not of type boolean.
        AssertionError: If there are no rules to apply.

    Examples:
        >>> import py_entitymatching as em
        >>> rb = em.RuleBasedBlocker()
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='id')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='id')
        >>> block_f = em.get_features_for_blocking(A, B)
        >>> rule = ['name_name_lev(ltuple, rtuple) > 3']
        >>> rb.add_rule(rule, feature_table=block_f)
        >>> D = rb.block_candset(C) # C is the candidate set.
    """
    # validate data types of input parameters
    self.validate_types_params_candset(candset, verbose, show_progress,
                                       n_jobs)

    # get and validate metadata
    log_info(logger,
             'Required metadata: cand.set key, fk ltable, '
             'fk rtable, ltable, rtable, ltable key, rtable key',
             verbose)

    # # get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # # validate metadata
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # validate rules. Raised explicitly (not via `assert`) so the check
    # still runs when Python is started with -O.
    if len(self.rules.keys()) == 0:
        raise AssertionError('There are no rules to apply')

    # do blocking

    # # initialize the progress bar
    # NOTE(review): this bar is never updated in this method; presumably
    # block_candset_excluding_rule reports progress on its own. Kept as is
    # because constructing it may have visible output - confirm before
    # removing.
    if show_progress:
        bar = pyprind.ProgBar(len(candset))

    # # set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # get attributes to project
    l_proj_attrs, r_proj_attrs = self.get_attrs_to_project(l_key, r_key,
                                                           [], [])
    l_df, r_df = l_df[l_proj_attrs], r_df[r_proj_attrs]

    # # apply the rules; None is passed in the slot that, going by the
    # # helper's name, identifies a rule to exclude
    c_df = self.block_candset_excluding_rule(candset, l_df, r_df,
                                             l_key, r_key,
                                             fk_ltable, fk_rtable, None,
                                             show_progress, n_jobs)

    # update catalog
    cm.set_candset_properties(c_df, key, fk_ltable, fk_rtable,
                              ltable, rtable)

    # return candidate set
    return c_df
def impute_table(table, exclude_attrs=None, missing_val='NaN',
                 strategy='mean', axis=0, val_all_nans=0, verbose=True):
    """
    Impute table containing missing values.

    The input table is not modified: imputation is done on a copy, and the
    catalog properties of the input are copied to the returned table.

    Args:
        table (DataFrame): DataFrame which values should be imputed.
        exclude_attrs (List) : list of attribute names to be excluded from
            imputing (defaults to None). A single non-list value is
            wrapped into a one-element list.
        missing_val (string or int): The placeholder for the missing
            values. All occurrences of `missing_values` will be imputed.
            For missing values encoded as np.nan, use the string value
            'NaN' (defaults to 'NaN').
        strategy (string): String that specifies on how to impute values.
            Valid strings: 'mean', 'median', 'most_frequent'
            (defaults to 'mean').
        axis (int): axis=1 along rows, and axis=0 along columns
            (defaults to 0).
        val_all_nans (float): Value to fill in if all the values in the
            column are NaN (defaults to 0).
        verbose (boolean): A flag passed to the metadata logging and
            validation calls (defaults to True).

    Returns:
        Imputed DataFrame.

    Raises:
        AssertionError: If `table` is not of type pandas DataFrame.
        AssertionError: If the attributes in `exclude_attrs` are not
            present in the input table.

    Examples:
        >>> import py_entitymatching as em
        >>> # H is the feature vector which should be imputed. Specifically, impute the missing values
        >>> # in each column, with the mean of that column
        >>> H = em.impute_table(H, exclude_attrs=['_id', 'ltable_id', 'rtable_id'], strategy='mean')
    """
    # Validate input parameters
    # # We expect the input table to be of type pandas DataFrame
    if not isinstance(table, pd.DataFrame):
        logger.error('Input table is not of type DataFrame')
        raise AssertionError('Input table is not of type DataFrame')

    ch.log_info(logger,
                'Required metadata: cand.set key, fk ltable, '
                'fk rtable, '
                'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # # Validate metadata
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    fv_columns = table.columns

    # Identity test: `== None` would be evaluated element-wise (and fail
    # in a boolean context) for array-like arguments.
    if exclude_attrs is None:
        feature_names = fv_columns
    else:
        # Check if the exclude attributes are present in the input table
        if not ch.check_attrs_present(table, exclude_attrs):
            missing_attrs_msg = ('The attributes mentioned in exclude_attrs '
                                 'is not present '
                                 'in the input table')
            logger.error(missing_attrs_msg)
            raise AssertionError(missing_attrs_msg)

        # We expect exclude attributes to be of type list. If not convert
        # it into a list.
        if not isinstance(exclude_attrs, list):
            exclude_attrs = [exclude_attrs]

        # Drop the duplicates from the exclude attributes
        exclude_attrs = gh.list_drop_duplicates(exclude_attrs)

        # Boolean mask selecting the columns that are not excluded
        cols = [c not in exclude_attrs for c in fv_columns]
        feature_names = fv_columns[cols]

    # Work on a copy so that the caller's table is left untouched
    table_copy = table.copy()
    projected_table = table_copy[feature_names]
    projected_table_values = projected_table.values

    # NOTE(review): `Imputer` is presumably sklearn.preprocessing.Imputer,
    # which newer scikit-learn releases no longer provide - confirm the
    # supported version before upgrading.
    imp = Imputer(missing_values=missing_val, strategy=strategy, axis=axis)
    imp.fit(projected_table_values)

    # Where the fitted statistic is NaN, fall back to val_all_nans
    imp.statistics_[pd.np.isnan(imp.statistics_)] = val_all_nans
    projected_table_values = imp.transform(projected_table_values)
    table_copy[feature_names] = projected_table_values

    # Update catalog
    cm.init_properties(table_copy)
    cm.copy_properties(table, table_copy)

    return table_copy
def block_candset(self, candset, verbose=True, show_progress=True, n_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.

    Blocks an input candidate set of tuple pairs based on a black box
    blocking function specified by the user.

    Finds tuple pairs from an input candidate set of tuple pairs that
    survive the black box function. A tuple pair survives the black box
    blocking function if the function returns False for that pair,
    otherwise the tuple pair is dropped.

    Args:
        candset (DataFrame): The input candidate set of tuple pairs.
        verbose (boolean): A flag to indicate whether logging should be
            done (defaults to True).
        show_progress (boolean): A flag to indicate whether progress
            should be displayed to the user (defaults to True).
        n_chunks (int): The number of partitions to split the candidate
            set. If it is set to -1, the number of partitions will be set
            to the number of cores in the machine.

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `verbose` is not of type boolean.
        AssertionError: If `n_chunks` is not of type int.
        AssertionError: If `show_progress` is not of type boolean.
        AssertionError: If the black box function is not set.

    Examples:
        >>> def match_last_name(ltuple, rtuple):
        ...     # assume that there is a 'name' attribute in the input
        ...     # tables and each value in it has two words
        ...     l_last_name = ltuple['name'].split()[1]
        ...     r_last_name = rtuple['name'].split()[1]
        ...     if l_last_name != r_last_name:
        ...         return True
        ...     else:
        ...         return False
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_black_box_blocker import DaskBlackBoxBlocker
        >>> bb = DaskBlackBoxBlocker()
        >>> bb.set_black_box_function(match_last_name)
        >>> D = bb.block_candset(C) # C is an output from block_tables
    """
    # Imported locally: `pd.np` was only an alias for numpy, deprecated in
    # pandas 1.0 and removed in pandas 2.0.
    import numpy as np

    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. "
        "USE AT YOUR OWN RISK.")

    # validate data types of standard input parameters
    self.validate_types_params_candset(candset, verbose, show_progress,
                                       n_chunks)

    # validate black box function (explicit raise so the check is not
    # stripped when Python runs with -O)
    if self.black_box_function is None:
        raise AssertionError('Black box function is not set')

    # get and validate metadata
    log_info(logger,
             'Required metadata: cand.set key, fk ltable, fk rtable, '
             'ltable, rtable, ltable key, rtable key', verbose)

    # # get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # # validate metadata
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    validate_object_type(n_chunks, int, 'Parameter n_chunks')
    validate_chunks(n_chunks)

    # do blocking

    # # set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # project candset to keep only the ID attributes
    c_df = candset[[key, fk_ltable, fk_rtable]]

    # # determine the number of partitions to process in parallel
    n_chunks = get_num_partitions(n_chunks, len(candset))

    # # pickle the black-box function before passing it as an arg to
    # # _block_candset_split to be executed by each child process
    black_box_function_pkl = cp.dumps(self.black_box_function)

    if n_chunks == 1:
        # single process
        valid = _block_candset_split(c_df, l_df, r_df, l_key, r_key,
                                     fk_ltable, fk_rtable,
                                     black_box_function_pkl, show_progress)
    else:
        # multiprocessing: one delayed task per split. The per-split
        # progress flag is False because the Dask ProgressBar below is
        # used for progress display instead.
        c_splits = np.array_split(c_df, n_chunks)
        valid_splits = [
            delayed(_block_candset_split)(c_split, l_df, r_df, l_key, r_key,
                                          fk_ltable, fk_rtable,
                                          black_box_function_pkl, False)
            for c_split in c_splits
        ]
        valid_splits = delayed(wrap)(valid_splits)
        if show_progress:
            with ProgressBar():
                valid_splits = valid_splits.compute(
                    scheduler="processes", num_workers=get_num_cores())
        else:
            valid_splits = valid_splits.compute(
                scheduler="processes", num_workers=get_num_cores())
        # concatenate the per-split results in split order
        valid = sum(valid_splits, [])

    # construct output table
    if len(c_df) > 0:
        # `valid` is used to select rows of the full candset
        c_df = candset[valid]
    else:
        c_df = pd.DataFrame(columns=candset.columns)

    # update catalog
    cm.set_candset_properties(c_df, key, fk_ltable, fk_rtable, ltable,
                              rtable)

    # return candidate set
    return c_df
def block_candset(self, candset, l_block_attr, r_block_attr,
                  allow_missing=False, verbose=False, show_progress=True,
                  n_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.

    Blocks an input candidate set of tuple pairs based on attribute
    equivalence.

    Keeps the tuple pairs of the input candidate set for which the value
    of `l_block_attr` in the left tuple exactly matches the value of
    `r_block_attr` in the right tuple.

    Args:
        candset (DataFrame): The input candidate set of tuple pairs.
        l_block_attr (string): The blocking attribute in left table.
        r_block_attr (string): The blocking attribute in right table.
        allow_missing (boolean): A flag to indicate whether tuple pairs
            with missing value in at least one of the blocking attributes
            should be included in the output candidate set (defaults to
            False). If this flag is set to True, a tuple pair with
            missing value in either blocking attribute will be retained
            in the output candidate set.
        verbose (boolean): A flag to indicate whether the debug
            information should be logged (defaults to False).
        show_progress (boolean): A flag to indicate whether progress
            should be displayed to the user (defaults to True).
        n_chunks (int): The number of partitions to split the candidate
            set. If it is set to -1, the number of partitions will be set
            to the number of cores in the machine.

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `l_block_attr` is not of type string.
        AssertionError: If `r_block_attr` is not of type string.
        AssertionError: If `verbose` is not of type boolean.
        AssertionError: If `n_chunks` is not of type int.
        AssertionError: If `l_block_attr` is not in the ltable columns.
        AssertionError: If `r_block_attr` is not in the rtable columns.

    Examples:
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_attr_equiv_blocker import DaskAttrEquivalenceBlocker
        >>> ab = DaskAttrEquivalenceBlocker()
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> C = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name'])
        >>> D1 = ab.block_candset(C, 'age', 'age')
        >>> # Include all possible tuple pairs with missing values
        >>> D2 = ab.block_candset(C, 'age', 'age', allow_missing=True)
        >>> # Execute blocking using multiple cores
        >>> D3 = ab.block_candset(C, 'age', 'age', n_chunks=-1)
    """
    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. "
        "USE AT YOUR OWN RISK.")

    # Type checks on the generic parameters and on the blocking attributes.
    self.validate_types_params_candset(candset, verbose, show_progress,
                                       n_chunks)
    self.validate_types_block_attrs(l_block_attr, r_block_attr)

    # Fetch the candidate-set metadata from the catalog and validate it.
    log_info(logger,
             'Required metadata: cand.set key, fk ltable, fk rtable, '
             'ltable, rtable, ltable key, rtable key',
             verbose)
    metadata = cm.get_metadata_for_candset(candset, logger, verbose)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = metadata
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # The blocking attributes must exist in the base tables.
    self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr)

    # Validate the partition-count parameter.
    validate_object_type(n_chunks, int, 'Parameter n_chunks')
    validate_chunks(n_chunks)

    # Keep only the key and blocking attribute of each base table, indexed
    # by key (the key column itself is retained).
    left_view = ltable[[l_key, l_block_attr]].set_index(l_key, drop=False)
    right_view = rtable[[r_key, r_block_attr]].set_index(r_key, drop=False)

    num_partitions = get_num_partitions(n_chunks, len(candset))

    if num_partitions == 1:
        # Single partition: evaluate the whole candidate set in-process.
        survivors = _block_candset_split(candset, left_view, right_view,
                                         l_key, r_key,
                                         l_block_attr, r_block_attr,
                                         fk_ltable, fk_rtable,
                                         allow_missing, show_progress)
    else:
        # One delayed task per partition. Per-task progress is disabled
        # because the Dask ProgressBar below reports progress instead.
        pending = [
            delayed(_block_candset_split)(chunk, left_view, right_view,
                                          l_key, r_key,
                                          l_block_attr, r_block_attr,
                                          fk_ltable, fk_rtable,
                                          allow_missing, False)
            for chunk in np.array_split(candset, num_partitions)
        ]
        combined = delayed(wrap)(pending)
        if show_progress:
            with ProgressBar():
                per_chunk = combined.compute(scheduler="processes",
                                             num_workers=get_num_cores())
        else:
            per_chunk = combined.compute(scheduler="processes",
                                         num_workers=get_num_cores())
        # Flatten the per-partition results, preserving partition order.
        survivors = sum(per_chunk, [])

    # Build the output: selected rows, or an empty frame with the same
    # columns when the candidate set has no rows.
    if len(candset) > 0:
        out_table = candset[survivors]
    else:
        out_table = pd.DataFrame(columns=candset.columns)

    # Register the output candidate set in the catalog.
    cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable,
                              ltable, rtable)

    return out_table
def block_candset(self, candset, verbose=False, show_progress=True, n_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK

    Blocks an input candidate set of tuple pairs based on a sequence of
    blocking rules supplied by the user.

    Finds tuple pairs from an input candidate set of tuple pairs that
    survive the sequence of blocking rules. A tuple pair survives the
    sequence of blocking rules if none of the rules in the sequence
    returns True for that pair. If any of the rules returns True, then
    the pair is blocked (dropped).

    Args:
        candset (DataFrame): The input candidate set of tuple pairs.
        verbose (boolean): A flag to indicate whether the debug
            information should be logged (defaults to False).
        show_progress (boolean): A flag to indicate whether progress
            should be displayed to the user (defaults to True).
        n_chunks (int): The number of partitions to split the candidate
            set. If it is set to -1, the number of partitions will be set
            to the number of cores in the machine.

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `verbose` is not of type boolean.
        AssertionError: If `n_chunks` is not of type int.
        AssertionError: If `show_progress` is not of type boolean.
        AssertionError: If there are no rules to apply.

    Examples:
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_rule_based_blocker import DaskRuleBasedBlocker
        >>> rb = DaskRuleBasedBlocker()
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='id')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='id')
        >>> block_f = em.get_features_for_blocking(A, B)
        >>> rule = ['name_name_lev(ltuple, rtuple) > 3']
        >>> rb.add_rule(rule, feature_table=block_f)
        >>> D = rb.block_candset(C) # C is the candidate set.
    """
    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. "
        "USE AT YOUR OWN RISK.")

    # validate data types of input parameters
    self.validate_types_params_candset(candset, verbose, show_progress,
                                       n_chunks)

    # get and validate metadata
    log_info(logger,
             'Required metadata: cand.set key, fk ltable, '
             'fk rtable, ltable, rtable, ltable key, rtable key', verbose)

    # # get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # # validate metadata
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # validate rules (explicit raise so the check is not stripped when
    # Python runs with -O)
    if len(self.rules.keys()) == 0:
        raise AssertionError('There are no rules to apply')

    # validate n_chunks parameter
    validate_object_type(n_chunks, int, 'Parameter n_chunks')
    validate_chunks(n_chunks)

    n_chunks = get_num_partitions(n_chunks, len(candset))

    # do blocking

    # # set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # get attributes to project; no extra output attributes are
    # # requested here (both lists are empty)
    l_proj_attrs, r_proj_attrs = self.get_attrs_to_project(
        l_key, r_key, [], [])
    l_df, r_df = l_df[l_proj_attrs], r_df[r_proj_attrs]

    # # apply the rules; None is passed as the rule to exclude
    c_df = self.block_candset_excluding_rule(candset, l_df, r_df, l_key,
                                             r_key, fk_ltable, fk_rtable,
                                             None, show_progress, n_chunks)

    # update catalog
    cm.set_candset_properties(c_df, key, fk_ltable, fk_rtable, ltable,
                              rtable)

    # return candidate set
    return c_df
def block_candset(self, candset, l_block_attr, r_block_attr,
                  allow_missing=False, verbose=False, show_progress=True,
                  n_jobs=1):
    """Blocks an input candidate set of tuple pairs based on attribute equivalence.

    Finds tuple pairs from an input candidate set of tuple pairs
    such that the value of attribute l_block_attr of the left tuple in a
    tuple pair exactly matches the value of attribute r_block_attr of the
    right tuple in the tuple pair.

    Args:
        candset (DataFrame): The input candidate set of tuple pairs.
        l_block_attr (string): The blocking attribute in left table.
        r_block_attr (string): The blocking attribute in right table.
        allow_missing (boolean): A flag to indicate whether tuple pairs
            with missing value in at least one of the blocking attributes
            should be included in the output candidate set (defaults to
            False). If this flag is set to True, a tuple pair with
            missing value in either blocking attribute will be retained
            in the output candidate set.
        verbose (boolean): A flag to indicate whether the debug
            information should be logged (defaults to False).
        show_progress (boolean): A flag to indicate whether progress
            should be displayed to the user (defaults to True).
        n_jobs (int): The number of parallel jobs to be used for
            computation (defaults to 1). If -1 all CPUs are used. If 0 or
            1, no parallel computation is used at all, which is useful
            for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are
            used (where n_cpus is the total number of CPUs in the
            machine). Thus, for n_jobs = -2, all CPUs but one are used.
            If (n_cpus + 1 + n_jobs) is less than 1, then no parallel
            computation is used (i.e., equivalent to the default).

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `l_block_attr` is not of type string.
        AssertionError: If `r_block_attr` is not of type string.
        AssertionError: If `verbose` is not of type boolean.
        AssertionError: If `n_jobs` is not of type int.
        AssertionError: If `l_block_attr` is not in the ltable columns.
        AssertionError: If `r_block_attr` is not in the rtable columns.
    """
    # Imported locally: `pd.np` was only an alias for numpy, deprecated in
    # pandas 1.0 and removed in pandas 2.0.
    import numpy as np

    # validate data types of input parameters
    self.validate_types_params_candset(candset, verbose, show_progress,
                                       n_jobs)

    # validate data types of input blocking attributes
    self.validate_types_block_attrs(l_block_attr, r_block_attr)

    # get and validate metadata
    log_info(logger,
             'Required metadata: cand.set key, fk ltable, '
             'fk rtable, ltable, rtable, ltable key, rtable key', verbose)

    # # get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # # validate metadata
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # validate input parameters
    self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr)

    # do blocking

    # # do projection before merge
    l_df = ltable[[l_key, l_block_attr]]
    r_df = rtable[[r_key, r_block_attr]]

    # # set index for convenience
    l_df = l_df.set_index(l_key, drop=False)
    r_df = r_df.set_index(r_key, drop=False)

    # # determine number of processes to launch in parallel
    n_procs = self.get_num_procs(n_jobs, len(candset))

    if n_procs <= 1:
        # single process
        valid = _block_candset_split(candset, l_df, r_df, l_key, r_key,
                                     l_block_attr, r_block_attr,
                                     fk_ltable, fk_rtable,
                                     allow_missing, show_progress)
    else:
        c_splits = np.array_split(candset, n_procs)
        # progress display is requested only for the last split
        last_split = len(c_splits) - 1
        valid_splits = Parallel(n_jobs=n_procs)(
            delayed(_block_candset_split)(c_split, l_df, r_df, l_key, r_key,
                                          l_block_attr, r_block_attr,
                                          fk_ltable, fk_rtable,
                                          allow_missing,
                                          show_progress and i == last_split)
            for i, c_split in enumerate(c_splits))
        # concatenate the per-split results in split order
        valid = sum(valid_splits, [])

    # construct output table
    if len(candset) > 0:
        out_table = candset[valid]
    else:
        out_table = pd.DataFrame(columns=candset.columns)

    # update the catalog
    cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable,
                              ltable, rtable)

    # return the output table
    return out_table
def get_false_negatives_as_df(table, eval_summary, verbose=False):
    """
    Return the false negatives of an evaluated table as a DataFrame.

    The rows are selected from `table` using the 'false_neg_ls' entry of
    `eval_summary`, and the catalog properties of `table` are copied onto
    the returned DataFrame.

    Args:
        table (DataFrame): The input table (pandas DataFrame) that was
            used for evaluation.
        eval_summary (dictionary): A Python dictionary containing
            evaluation results, typically from 'eval_matches' command.
        verbose (boolean): A flag to indicate whether the debug
            information should be logged (defaults to False).

    Returns:
        A pandas DataFrame containing only the false negatives from
        the input table.

        Further, this function sets the output DataFrame's properties
        same as input DataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> # G is the labeled data used for development purposes, match_f is the feature table
        >>> H = em.extract_feat_vecs(G, feat_table=match_f, attrs_after='gold_labels')
        >>> dt = em.DTMatcher()
        >>> dt.fit(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels')
        >>> pred_table = dt.predict(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], append=True, target_attr='predicted_labels')
        >>> eval_summary = em.eval_matches(pred_table, 'gold_labels', 'predicted_labels')
        >>> false_neg_df = em.get_false_negatives_as_df(H, eval_summary)
    """
    # The input must be a pandas DataFrame.
    validate_object_type(table, pd.DataFrame, error_prefix='Input cand.set')

    # Tell the user which catalog metadata this command relies on.
    required_metadata_msg = ('Required metadata: cand.set key, fk ltable, '
                             'fk rtable, '
                             'ltable, rtable, ltable key, rtable key')
    ch.log_info(logger, required_metadata_msg, verbose)

    # Read the candidate-set metadata from the catalog.
    ch.log_info(logger, 'Getting metadata from the catalog', verbose)
    metadata = cm.get_metadata_for_candset(table, logger, verbose)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = metadata

    # Make sure the metadata is consistent with the table.
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Select the false negatives recorded in the evaluation summary.
    false_negatives = _get_dataframe(table, eval_summary['false_neg_ls'])

    # Give the result the same catalog properties as the input table.
    ch.log_info(logger, 'Updating catalog', verbose)
    cm.init_properties(false_negatives)
    cm.copy_properties(table, false_negatives)

    ch.log_info(logger, 'Returning the dataframe', verbose)
    return false_negatives
def extract_feature_vecs(candset, attrs_before=None, feature_table=None,
                         attrs_after=None, verbose=False,
                         show_progress=True, n_jobs=1):
    """
    This function extracts feature vectors from a DataFrame (typically a
    labeled candidate set).

    Specifically, this function uses feature table, ltable and rtable
    (that is present in the `candset`'s metadata) to extract feature
    vectors.

    Args:
        candset (DataFrame): The input candidate set for which the
            features vectors should be extracted.
        attrs_before (list): The list of attributes from the input
            candset, that should be added before the feature vectors
            (defaults to None).
        feature_table (DataFrame): A DataFrame containing a list of
            features that should be used to compute the feature vectors
            (defaults to None, which is rejected with an AssertionError).
        attrs_after (list): The list of attributes from the input candset
            that should be added after the feature vectors (defaults to
            None).
        verbose (boolean): A flag to indicate whether the debug
            information should be displayed (defaults to False).
        show_progress (boolean): A flag to indicate whether the progress
            of extracting feature vectors must be displayed (defaults to
            True).
        n_jobs (int): The requested number of parallel jobs (defaults to
            1). It is passed to `get_num_procs` together with the size of
            the candset to decide how many splits/processes are used.

    Returns:
        A pandas DataFrame containing feature vectors.

        The DataFrame will have metadata ltable and rtable, pointing
        to the same ltable and rtable as the input candset.

        Also, the output DataFrame will have three columns: key, foreign
        key ltable, foreign key rtable copied from input candset to the
        output DataFrame. These three columns precede the columns
        mentioned in `attrs_before`.

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `attrs_before` has attributes that
            are not present in the input candset.
        AssertionError: If `attrs_after` has attributes that
            are not present in the input candset.
        AssertionError: If `feature_table` is set to None.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_f = em.get_features_for_matching(A, B)
        >>> # G is the labeled dataframe which should be converted into feature vectors
        >>> H = em.extract_feature_vecs(G, feature_table=match_f, attrs_before=['title'], attrs_after=['gold_labels'])
    """
    # Imported locally: `pd.np` was only an alias for numpy, deprecated in
    # pandas 1.0 and removed in pandas 2.0.
    import numpy as np

    # Validate input parameters

    # # We expect the input candset to be of type pandas DataFrame.
    validate_object_type(candset, pd.DataFrame, error_prefix='Input cand.set')

    # # If the attrs_before is given, check if the attrs_before are present
    # # in the input candset
    if attrs_before is not None:
        if not ch.check_attrs_present(candset, attrs_before):
            logger.error(
                'The attributes mentioned in attrs_before is not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_before is not present '
                'in the input table')

    # # If the attrs_after is given, check if the attrs_after are present
    # # in the input candset
    if attrs_after is not None:
        if not ch.check_attrs_present(candset, attrs_after):
            logger.error(
                'The attributes mentioned in attrs_after is not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_after is not present '
                'in the input table')

    # We expect the feature table to be a valid object
    if feature_table is None:
        logger.error('Feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    # Do metadata checking
    # # Mention what metadata is required to the user
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, '
                        'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    ch.log_info(logger, 'Getting metadata from catalog', verbose)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # # Validate metadata
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Extract features

    # # Set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # Apply feature functions
    ch.log_info(logger, 'Applying feature functions', verbose)
    col_names = list(candset.columns)
    fk_ltable_idx = col_names.index(fk_ltable)
    fk_rtable_idx = col_names.index(fk_rtable)

    n_procs = get_num_procs(n_jobs, len(candset))

    c_splits = np.array_split(candset, n_procs)

    # The feature table is serialized with cloudpickle before being handed
    # to the worker function.
    pickled_obj = cloudpickle.dumps(feature_table)

    # Progress display is requested only for the last split.
    last_split = len(c_splits) - 1
    feat_vals_by_splits = Parallel(n_jobs=n_procs)(
        delayed(get_feature_vals_by_cand_split)(
            pickled_obj, fk_ltable_idx, fk_rtable_idx, l_df, r_df,
            c_split, show_progress and i == last_split)
        for i, c_split in enumerate(c_splits))

    # Concatenate the per-split results in split order.
    feat_vals = sum(feat_vals_by_splits, [])

    # Construct output table
    feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values)
    # # Rearrange the feature names in the input feature table order
    feature_names = list(feature_table['feature_name'])
    feature_vectors = feature_vectors[feature_names]

    ch.log_info(logger, 'Constructing output table', verbose)

    # # Insert attrs_before. The list is reversed and each attribute is
    # # inserted at position 0, so the columns end up in list order.
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        attrs_before.reverse()
        for a in attrs_before:
            feature_vectors.insert(0, a, candset[a])

    # # Insert keys
    feature_vectors.insert(0, fk_rtable, candset[fk_rtable])
    feature_vectors.insert(0, fk_ltable, candset[fk_ltable])
    feature_vectors.insert(0, key, candset[key])

    # # Insert attrs_after
    # NOTE(review): the list is reversed and then inserted at increasing
    # positions, so these columns are appended in the reverse of the order
    # returned by gh.list_diff. Kept as-is to preserve the existing output
    # column order -- confirm whether that is intended.
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        attrs_after.reverse()
        col_pos = len(feature_vectors.columns)
        for a in attrs_after:
            feature_vectors.insert(col_pos, a, candset[a])
            col_pos += 1

    # # Update the catalog
    cm.init_properties(feature_vectors)
    cm.copy_properties(candset, feature_vectors)

    # Finally, return the feature vectors
    return feature_vectors
def block_candset(self, candset, verbose=False, show_progress=True, n_jobs=1):
    """
    Blocks an input candidate set of tuple pairs based on a sequence of
    blocking rules supplied by the user.

    Finds tuple pairs from an input candidate set of tuple pairs that
    survive the sequence of blocking rules. A tuple pair survives the
    sequence of blocking rules if none of the rules in the sequence
    returns True for that pair. If any of the rules returns True, then
    the pair is blocked (dropped).

    Args:
        candset (DataFrame): The input candidate set of tuple pairs.
        verbose (boolean): A flag to indicate whether the debug
            information should be logged (defaults to False).
        show_progress (boolean): A flag to indicate whether progress
            should be displayed to the user (defaults to True).
        n_jobs (int): The number of parallel jobs to be used for
            computation (defaults to 1). If -1 all CPUs are used. If 0 or
            1, no parallel computation is used at all, which is useful
            for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are
            used (where n_cpus is the total number of CPUs in the
            machine). Thus, for n_jobs = -2, all CPUs but one are used.
            If (n_cpus + 1 + n_jobs) is less than 1, then no parallel
            computation is used (i.e., equivalent to the default).

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `verbose` is not of type boolean.
        AssertionError: If `n_jobs` is not of type int.
        AssertionError: If `show_progress` is not of type boolean.
        AssertionError: If there are no rules to apply.

    Examples:
        >>> import py_entitymatching as em
        >>> rb = em.RuleBasedBlocker()
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='id')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='id')
        >>> block_f = em.get_features_for_blocking(A, B)
        >>> rule = ['name_name_lev(ltuple, rtuple) > 3']
        >>> rb.add_rule(rule, feature_table=block_f)
        >>> D = rb.block_candset(C) # C is the candidate set.
    """
    # validate data types of input parameters
    self.validate_types_params_candset(candset, verbose, show_progress,
                                       n_jobs)

    # get and validate metadata
    log_info(logger,
             'Required metadata: cand.set key, fk ltable, '
             'fk rtable, ltable, rtable, ltable key, rtable key', verbose)

    # # get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # # validate metadata
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # validate rules (explicit raise so the check is not stripped when
    # Python runs with -O)
    if len(self.rules.keys()) == 0:
        raise AssertionError('There are no rules to apply')

    # do blocking

    # # initialize the progress bar
    # NOTE(review): `bar` is never used again in this method; progress is
    # delegated via `show_progress` below. The construction is kept because
    # creating a ProgBar may itself write to the output stream -- confirm
    # whether it can be dropped.
    if show_progress:
        bar = pyprind.ProgBar(len(candset))

    # # set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # get attributes to project; no extra output attributes are
    # # requested here (both lists are empty)
    l_proj_attrs, r_proj_attrs = self.get_attrs_to_project(
        l_key, r_key, [], [])
    l_df, r_df = l_df[l_proj_attrs], r_df[r_proj_attrs]

    # # apply the rules; None is passed as the rule to exclude
    c_df = self.block_candset_excluding_rule(candset, l_df, r_df, l_key,
                                             r_key, fk_ltable, fk_rtable,
                                             None, show_progress, n_jobs)

    # update catalog
    cm.set_candset_properties(c_df, key, fk_ltable, fk_rtable, ltable,
                              rtable)

    # return candidate set
    return c_df
def execute(self, input_table, label_column, inplace=True, verbose=False):
    """
    Executes the rules of the match trigger for a table of matcher
    results.

    Every row whose label differs from `self.value_to_set` is evaluated
    with `self.apply_rules` on the corresponding ltable/rtable tuples;
    when the result equals `self.cond_status`, the row's label is
    overwritten with `self.value_to_set`.

    Args:
        input_table (DataFrame): The input table of type pandas DataFrame
            containing tuple pairs and labels from matching.
        label_column (string): The attribute name where the predictions
            are stored in the input table.
        inplace (boolean): A flag to indicate whether the update needs to
            be done inplace (defaults to True). If False, a copy of
            `input_table` is updated and returned instead.
        verbose (boolean): A flag to indicate whether the debug
            information should be logged (defaults to False).

    Returns:
        A DataFrame with predictions updated (`input_table` itself when
        `inplace` is True, otherwise a copy).

    Raises:
        AssertionError: If `label_column` is given and is not a string.
        AssertionError: If there are no rules to apply.
        AssertionError: If the left or right table is not set in the
            metadata of `input_table`.
        AssertionError: If `label_column` is not a column of
            `input_table`.

    Examples:
        >>> import py_entitymatching as em
        >>> mt = em.MatchTrigger()
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='id')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='id')
        >>> match_f = em.get_features_for_matching(A, B)
        >>> rule = ['title_title_lev_sim(ltuple, rtuple) > 0.7']
        >>> mt.add_cond_rule(rule, match_f)
        >>> mt.add_cond_status(True)
        >>> mt.add_action(1)
        >>> # The table H is a table with prediction labels generated from matching
        >>> mt.execute(input_table=H, label_column='predicted_labels', inplace=False)
    """
    # Validate input parameters
    # # We expect the table to be of type pandas DataFrame
    validate_object_type(input_table, pd.DataFrame, 'Input table')

    # # We expect the label_column to be of type string if not None
    if label_column is not None and not isinstance(label_column, str):
        logger.error('Input target_attr must be a string.')
        raise AssertionError('Input target_attr must be a string.')

    # # We expect the inplace to be of type boolean
    validate_object_type(inplace, bool, 'Input inplace')

    # # We expect the verbose to be of type boolean. The third argument is
    # # used as the error prefix (it is passed as `error_prefix=` elsewhere),
    # # so it names the parameter actually being checked.
    validate_object_type(verbose, bool, 'Input verbose')

    # Validate that there are some rules. Explicit raises are used for the
    # checks below so they are not stripped when Python runs with -O.
    if len(self.rules.keys()) == 0:
        raise AssertionError('There are no rules to apply')

    # # get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(input_table, logger, verbose)

    # # validate metadata
    cm._validate_metadata_for_candset(input_table, key, fk_ltable,
                                      fk_rtable, ltable, rtable, l_key,
                                      r_key, logger, verbose)

    if ltable is None:
        raise AssertionError('Left table is not set')
    if rtable is None:
        raise AssertionError('Right table is not set')
    if label_column not in input_table.columns:
        raise AssertionError('Label column not in the input table')

    # Parse conjuncts to validate that the features are in the feature table
    for rule in self.rule_conjunct_list:
        for conjunct in self.rule_conjunct_list[rule]:
            parse_conjunct(conjunct, self.rule_ft[rule])

    # Work on a copy unless the caller asked for an in-place update.
    table = input_table if inplace else input_table.copy()

    # set the index and store it in l_tbl/r_tbl
    l_tbl = ltable.set_index(l_key, drop=False)
    r_tbl = rtable.set_index(r_key, drop=False)

    # positional indexes of the columns read from each row tuple
    column_names = list(input_table.columns)
    lid_idx = column_names.index(fk_ltable)
    rid_idx = column_names.index(fk_rtable)
    label_idx = column_names.index(label_column)

    # `idx` is the positional row number, used with `.iat` so the update
    # does not depend on the table's index labels.
    for idx, row in enumerate(input_table.itertuples(index=False)):
        # rows that already carry the target value need no evaluation
        if row[label_idx] != self.value_to_set:
            l_row = l_tbl.loc[row[lid_idx]]
            r_row = r_tbl.loc[row[rid_idx]]
            res = self.apply_rules(l_row, r_row)
            if res == self.cond_status:
                table.iat[idx, label_idx] = self.value_to_set

    return table
def block_candset(self, candset, verbose=True, show_progress=True, n_jobs=1):
    """
    Blocks an input candidate set of tuple pairs based on a black box
    blocking function specified by the user.

    Finds tuple pairs from an input candidate set of tuple pairs that
    survive the black box function. A tuple pair survives the black box
    blocking function if the function returns False for that pair,
    otherwise the tuple pair is dropped.

    Args:
        candset (DataFrame): The input candidate set of tuple pairs.
        verbose (boolean): A flag to indicate whether logging should be
            done (defaults to True).
        show_progress (boolean): A flag to indicate whether progress
            should be displayed to the user (defaults to True).
        n_jobs (int): The number of parallel jobs to be used for
            computation (defaults to 1). If -1 all CPUs are used. If 0 or
            1, no parallel computation is used at all, which is useful for
            debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used
            (where n_cpus is the total number of CPUs in the machine).
            Thus, for n_jobs = -2, all CPUs but one are used. If
            (n_cpus + 1 + n_jobs) is less than 1, then no parallel
            computation is used (i.e., equivalent to the default).

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `verbose` is not of type boolean.
        AssertionError: If `n_jobs` is not of type int.
        AssertionError: If `show_progress` is not of type boolean.
        AssertionError: If the black box function has not been set.

    Examples:
        >>> def match_last_name(ltuple, rtuple):
        ...     # assume that there is a 'name' attribute in the input tables
        ...     # and each value in it has two words
        ...     l_last_name = ltuple['name'].split()[1]
        ...     r_last_name = rtuple['name'].split()[1]
        ...     if l_last_name != r_last_name:
        ...         return True
        ...     else:
        ...         return False
        >>> import py_entitymatching as em
        >>> bb = em.BlackBoxBlocker()
        >>> bb.set_black_box_function(match_last_name)
        >>> D = bb.block_candset(C) # C is an output from block_tables
    """
    # validate data types of standard input parameters
    self.validate_types_params_candset(candset, verbose, show_progress, n_jobs)

    # validate black box function; an explicit raise (rather than an assert
    # statement) keeps the check active under `python -O`
    if self.black_box_function is None:
        raise AssertionError('Black box function is not set')

    # get and validate metadata
    log_info(logger, 'Required metadata: cand.set key, fk ltable, fk rtable, '
                     'ltable, rtable, ltable key, rtable key', verbose)

    # # get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # # validate metadata
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # do blocking

    # # set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # project candset to keep only the ID attributes
    c_df = candset[[key, fk_ltable, fk_rtable]]

    # # determine the number of processes to launch parallely
    n_procs = self.get_num_procs(n_jobs, len(c_df))

    # # pickle the black-box function before passing it as an arg to
    # # _block_candset_split to be executed by each child process
    black_box_function_pkl = cp.dumps(self.black_box_function)

    if n_procs <= 1:
        # single process
        valid = _block_candset_split(c_df, l_df, r_df, l_key, r_key,
                                     fk_ltable, fk_rtable,
                                     black_box_function_pkl, show_progress)
    else:
        # multiprocessing
        # `pd.np` was deprecated in pandas 1.0 and removed in 2.0, so numpy
        # is imported directly. Splitting row positions (instead of the
        # DataFrame itself) yields the same contiguous, near-equal chunks
        # without depending on how numpy slices a DataFrame.
        import numpy as np
        row_chunks = np.array_split(np.arange(len(c_df)), n_procs)
        c_splits = [c_df.iloc[rows] for rows in row_chunks]

        # only the last split reports progress, so a single bar is shown
        last = len(c_splits) - 1
        valid_splits = Parallel(n_jobs=n_procs)(
            delayed(_block_candset_split)(c_split, l_df, r_df, l_key, r_key,
                                          fk_ltable, fk_rtable,
                                          black_box_function_pkl,
                                          show_progress and i == last)
            for i, c_split in enumerate(c_splits))

        # flatten the per-split results, preserving row order
        valid = [flag for split in valid_splits for flag in split]

    # construct output table
    if len(c_df) > 0:
        # NOTE(review): `valid` is presumably a per-row boolean mask returned
        # by _block_candset_split -- confirm against that helper.
        c_df = candset[valid]
    else:
        c_df = pd.DataFrame(columns=candset.columns)

    # update catalog
    cm.set_candset_properties(c_df, key, fk_ltable, fk_rtable, ltable, rtable)

    # return candidate set
    return c_df