def add_key_column(table, key):
    """Insert a new key column named `key` at position 0 of `table`.

    The column is filled with consecutive integers 0..len(table)-1.
    Note that the input dataframe is modified in place and also returned.
    """
    # Both inputs must be of the expected types before touching the table.
    validate_object_type(table, pd.DataFrame)
    validate_object_type(key, six.string_types, error_prefix='Input key')
    row_count = len(table)
    table.insert(0, key, range(0, row_count))
    return table
def validate_types_other_params(self, l_overlap_attr, r_overlap_attr,
                                rem_stop_words, q_val,
                                word_level, overlap_size):
    """Validate the types of the non-table parameters of overlap blocking.

    Raises:
        AssertionError: If any parameter is not of the expected type.
    """
    validate_object_type(l_overlap_attr, six.string_types,
                         error_prefix='Overlap attribute name of left table')
    validate_object_type(r_overlap_attr, six.string_types,
                         error_prefix='Overlap attribute name of right table')
    validate_object_type(rem_stop_words, bool,
                         error_prefix='Parameter rem_stop_words')
    # Fix: compare against None with identity (`is not None`), not `!=`,
    # per standard Python practice (PEP 8). q_val may legitimately be None.
    if q_val is not None and not isinstance(q_val, int):
        logger.error('Parameter q_val is not of type int')
        raise AssertionError('Parameter q_val is not of type int')
    validate_object_type(word_level, bool, error_prefix='Parameter word_level')
    validate_object_type(overlap_size, int,
                         error_prefix='Parameter overlap_size')
def does_contain_missing_vals(df, attr):
    """
    Check if the attribute contains missing values in the dataframe

    Args:
        df (pandas dataframe): Input dataframe
        attr (str): Attribute in the pandas dataframe

    Returns:
        result (bool). Returns True, if the attribute contains missing values,
        else returns False

    Notes:
        This is an internal helper function
    """
    validate_object_type(df, pd.DataFrame)
    validate_object_type(attr, six.string_types, error_prefix='Input attr.')
    # Idiom fix: any() already yields a bool, so return it directly instead
    # of the `if not flag: return False else: return True` detour.
    return any(pd.isnull(df[attr]))
def does_contain_missing_vals(df, attr):
    """
    Check whether column `attr` of `df` has at least one missing (null) value.

    Args:
        df (pandas dataframe): Input dataframe
        attr (str): Attribute in the pandas dataframe

    Returns:
        result (bool). True when the attribute contains missing values,
        False otherwise.

    Notes:
        This is an internal helper function
    """
    validate_object_type(df, pd.DataFrame)
    validate_object_type(attr, six.string_types, error_prefix='Input attr.')
    has_missing = any(pd.isnull(df[attr]))
    if has_missing:
        return True
    return False
def add_key_column(table, key):
    """
    Add a key column to the input table.

    Args:
        table (pandas dataframe): Input dataframe (modified in place)
        key (str): Name of the new key column

    Returns:
        The same dataframe, with the key column inserted at position 0.

    Notes:
        This is an internal helper function. The key values are the
        consecutive integers 0..len(table)-1.
    """
    validate_object_type(table, pd.DataFrame)
    validate_object_type(key, six.string_types, error_prefix='Input key')
    # Insert as the first column so the key is always leftmost.
    table.insert(0, key, range(0, len(table)))
    return table
def test_validate_object_type_with_invalid_type(self):
    # validate_object_type must raise AssertionError whenever the value's
    # type does not match the expected type. Use assertRaises as a context
    # manager rather than wrapping each call in a lambda.
    with self.assertRaises(AssertionError):
        vh.validate_object_type('ABC', int)
    with self.assertRaises(AssertionError):
        vh.validate_object_type(123, str)
    with self.assertRaises(AssertionError):
        vh.validate_object_type(list(), dict)
    with self.assertRaises(AssertionError):
        vh.validate_object_type(dict(), list)
def validate_types_block_attrs(self, l_block_attr, r_block_attr):
    """Check that both blocking attribute names are strings."""
    # Validate left then right, matching the original check order.
    checks = ((l_block_attr, 'Blocking attribute name of left table'),
              (r_block_attr, 'Blocking attribute name of right table'))
    for attr_name, prefix in checks:
        validate_object_type(attr_name, six.string_types, error_prefix=prefix)
def check_attrs_present(table, attrs):
    """
    Check whether the given attribute(s) are present in the dataframe.

    Args:
        table (pandas dataframe): Input dataframe
        attrs (str or list): Attribute name, or list of attribute names

    Returns:
        result (bool). True if all attributes are present, else False.
    """
    validate_object_type(table, pd.DataFrame)
    if attrs is None:
        logger.warning('Input attr. list is null')
        return False
    # Idiom fix: `isinstance(...) is False` replaced with `not isinstance(...)`.
    if not isinstance(attrs, list):
        attrs = [attrs]
    status = are_all_attrs_in_df(table, attrs, verbose=True)
    return status
def check_attrs_present(table, attrs):
    """
    Return True when every attribute in `attrs` exists in `table`.

    `attrs` may be a single attribute name or a list of names; a None
    value logs a warning and yields False.
    """
    validate_object_type(table, pd.DataFrame)
    # Guard clause: nothing to check.
    if attrs is None:
        logger.warning('Input attr. list is null')
        return False
    # Normalize a scalar attribute name into a one-element list.
    attr_list = attrs if isinstance(attrs, list) else [attrs]
    return are_all_attrs_in_df(table, attr_list, verbose=True)
def debug_decisiontree_matcher(decision_tree, tuple_1, tuple_2, feature_table,
                               table_columns, exclude_attrs=None):
    """
    This function is used to debug a decision tree matcher using two input
    tuples.

    Specifically, this function takes in two tuples, gets the feature vector
    using the feature table and finally passes it to the decision tree and
    displays the path that the feature vector takes in the decision tree.

    Args:
        decision_tree (DTMatcher): The input decision tree object that should
            be debugged.
        tuple_1,tuple_2 (Series): Input tuples that should be debugged.
        feature_table (DataFrame): Feature table containing the functions
            for the features.
        table_columns (list): List of all columns that will be outputted
            after generation of feature vectors.
        exclude_attrs (list): List of attributes that should be removed from
            the table columns.

    Raises:
        AssertionError: If the input feature table is not of type pandas
            DataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> # devel is the labeled data used for development purposes, match_f is the feature table
        >>> H = em.extract_feat_vecs(devel, feat_table=match_f, attrs_after='gold_labels')
        >>> dt = em.DTMatcher()
        >>> dt.fit(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels')
        >>> # F is the feature vector got from evaluation set of the labeled data.
        >>> out = dt.predict(table=F, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels')
        >>> # A and B are input tables
        >>> em.debug_decisiontree_matcher(dt, A.loc[1], B.loc[2], match_f, H.columns, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels')
    """
    # Only the feature table's type is validated here; the remaining
    # arguments are checked downstream by the shared debugger helper.
    validate_object_type(feature_table, pd.DataFrame, 'The input feature table')
    # Delegate to the shared helper; ensemble_flag=False selects the
    # single-tree (non-ensemble) debugging code path.
    _debug_decisiontree_matcher(decision_tree, tuple_1, tuple_2, feature_table,
                                table_columns, exclude_attrs,
                                ensemble_flag=False)
def _validate_types(ltable, rtable, candidate_set, output_size,
                    attr_corres, verbose):
    """
    Validate the types of the inputs to the debug-blocker routine.

    Raises:
        AssertionError: If any input is not of the expected type.
    """
    validate_object_type(ltable, pd.DataFrame, 'Input left table')
    validate_object_type(rtable, pd.DataFrame, 'Input right table')
    validate_object_type(candidate_set, pd.DataFrame, 'Input candidate set')
    validate_object_type(output_size, int, 'Output size')
    if attr_corres is not None:
        if not isinstance(attr_corres, list):
            # Consistency fix: log via the module-level `logger` (the rest of
            # this function and its siblings do), not the root `logging`.
            logger.error('Input attribute correspondence is not of'
                         ' type list')
            raise AssertionError('Input attribute correspondence is'
                                 ' not of type list')
        for pair in attr_corres:
            if not isinstance(pair, tuple):
                logger.error('Pair in attribute correspondence list is not'
                             ' of type tuple')
                raise AssertionError('Pair in attribute correspondence list'
                                     ' is not of type tuple')
    if not isinstance(verbose, bool):
        logger.error('Parameter verbose is not of type bool')
        raise AssertionError('Parameter verbose is not of type bool')
def validate_types_other_params(self, l_overlap_attr, r_overlap_attr,
                                rem_stop_words, q_val,
                                word_level, overlap_size):
    """Validate the types of the non-table overlap-blocker parameters."""
    # Type checks before the q_val special case, in the original order.
    for value, expected_type, prefix in (
            (l_overlap_attr, six.string_types,
             'Overlap attribute name of left table'),
            (r_overlap_attr, six.string_types,
             'Overlap attribute name of right table'),
            (rem_stop_words, bool, 'Parameter rem_stop_words')):
        validate_object_type(value, expected_type, error_prefix=prefix)
    # q_val is optional: only validate its type when it was supplied.
    if q_val != None and not isinstance(q_val, int):
        logger.error('Parameter q_val is not of type int')
        raise AssertionError('Parameter q_val is not of type int')
    for value, expected_type, prefix in (
            (word_level, bool, 'Parameter word_level'),
            (overlap_size, int, 'Parameter overlap_size')):
        validate_object_type(value, expected_type, error_prefix=prefix)
def _validate_types(ltable, rtable, candidate_set, output_size,
                    attr_corres, verbose):
    """Validate the types of the inputs to the debug-blocker routine."""
    # All three tables must be pandas DataFrames.
    for frame, prefix in ((ltable, 'Input left table'),
                          (rtable, 'Input right table'),
                          (candidate_set, 'Input candidate set')):
        validate_object_type(frame, pd.DataFrame, prefix)
    validate_object_type(output_size, int, 'Output size')
    # attr_corres is optional; when given it must be a list of tuples.
    if attr_corres is not None:
        if not isinstance(attr_corres, list):
            logging.error('Input attribute correspondence is not of'
                          ' type list')
            raise AssertionError('Input attribute correspondence is'
                                 ' not of type list')
        for pair in attr_corres:
            if not isinstance(pair, tuple):
                logging.error('Pair in attribute correspondence list is not'
                              ' of type tuple')
                raise AssertionError('Pair in attribute correspondence list'
                                     ' is not of type tuple')
    if not isinstance(verbose, bool):
        logger.error('Parameter verbose is not of type bool')
        raise AssertionError('Parameter verbose is not of type bool')
def are_all_attrs_in_df(df, col_names, verbose=False):
    """
    Check whether every name in `col_names` is a column of `df`.

    Returns False (optionally logging the first missing name when
    `verbose` is True) as soon as a name is not found; True otherwise.
    """
    validate_object_type(df, pd.DataFrame)
    if col_names is None:
        logger.warning('Input col_names is null')
        return False
    existing_columns = list(df.columns)
    for name in col_names:
        if name in existing_columns:
            continue
        if verbose:
            logger.warning('Column name (' + name + ') is not present in dataframe')
        return False
    return True
def are_all_attrs_in_df(df, col_names, verbose=False):
    """
    Return True when all of `col_names` are present as columns in `df`.

    A None `col_names` logs a warning and yields False. When `verbose`
    is set, the first missing column name is logged before returning.
    """
    validate_object_type(df, pd.DataFrame)
    # Guard clause: nothing to look up.
    if col_names is None:
        logger.warning('Input col_names is null')
        return False
    known = list(df.columns)
    for candidate in col_names:
        if candidate not in known:
            if verbose:
                logger.warning('Column name (' + candidate + ') is not present in dataframe')
            return False
    return True
def _check_table_order(ltable, rtable, l_attr_types, r_attr_types, attr_corres): """ Check whether the order of tables matches with what is mentioned in l_attr_types, r_attr_type and attr_corres. """ # Validate the input parameters # We expect the input object ltable to be of type pandas DataFrame validate_object_type(ltable, pd.DataFrame, 'Input left table') # # We expect the rtable to be of type pandas DataFrame validate_object_type(rtable, pd.DataFrame, 'Input right table') # Get the ids of the input tables. This is used to validate the order # of tables present in the given data structures. # Note: This kind of checking is bit too aggressive, the reason is this # checking needs the ltable and rtable to point to exact memory location # across the given dictionaries and the input. Ideally, we just need to # check whether the contents of those DataFrames are same. ltable_id = id(ltable) rtable_id = id(rtable) # Check whether ltable id matches with id of table mentioned in l_attr_types if ltable_id != id(l_attr_types['_table']): logger.error( 'ltable is not the same as table mentioned in left attr types') return False # Check whether rtable id matches with id of table mentioned in r_attr_types if rtable_id != id(r_attr_types['_table']): logger.error( 'rtable is not the same as table mentioned in right attr types') return False # Check whether ltable matches with ltable mentioned in attr_corres if ltable_id != id(attr_corres['ltable']): logger.error( 'ltable is not the same as table mentioned in attr correspondence') return False # Check whether rtable matches with rtable mentioned in attr_corres if rtable_id != id(attr_corres['rtable']): logger.error( 'rtable is not the same as table mentioned in attr correspondence') return False # Finally, return True. return True
def _check_table_order(ltable, rtable, l_attr_types, r_attr_types, attr_corres):
    """
    Check whether the order of tables matches with what is mentioned in
    l_attr_types, r_attr_type and attr_corres.

    The comparison is by object identity (id), i.e. the metadata
    structures must reference the exact same DataFrame objects.
    """
    # The two input tables must be pandas DataFrames.
    validate_object_type(ltable, pd.DataFrame, 'Input left table')
    validate_object_type(rtable, pd.DataFrame, 'Input right table')

    ltable_id = id(ltable)
    rtable_id = id(rtable)

    # Each entry pairs the expected id with (container, key) to look up and
    # the error message to log on mismatch. Lookups happen lazily inside
    # the loop so a failed earlier check short-circuits the later ones.
    expected = (
        (ltable_id, l_attr_types, '_table',
         'ltable is not the same as table mentioned in left attr types'),
        (rtable_id, r_attr_types, '_table',
         'rtable is not the same as table mentioned in right attr types'),
        (ltable_id, attr_corres, 'ltable',
         'ltable is not the same as table mentioned in attr correspondence'),
        (rtable_id, attr_corres, 'rtable',
         'rtable is not the same as table mentioned in attr correspondence'),
    )
    for table_id, container, key_name, message in expected:
        if table_id != id(container[key_name]):
            logger.error(message)
            return False
    return True
def debug_decisiontree_matcher(decision_tree, tuple_1, tuple_2, feature_table,
                               table_columns, exclude_attrs=None):
    """
    This function is used to debug a decision tree matcher using two input
    tuples.

    Specifically, this function takes in two tuples, gets the feature vector
    using the feature table and finally passes it to the decision tree and
    displays the path that the feature vector takes in the decision tree.

    Args:
        decision_tree (DTMatcher): The input decision tree object that should
            be debugged.
        tuple_1,tuple_2 (Series): Input tuples that should be debugged.
        feature_table (DataFrame): Feature table containing the functions
            for the features.
        table_columns (list): List of all columns that will be outputted
            after generation of feature vectors.
        exclude_attrs (list): List of attributes that should be removed from
            the table columns.

    Raises:
        AssertionError: If the input feature table is not of type pandas
            DataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> # devel is the labeled data used for development purposes, match_f is the feature table
        >>> H = em.extract_feat_vecs(devel, feat_table=match_f, attrs_after='gold_labels')
        >>> dt = em.DTMatcher()
        >>> dt.fit(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels')
        >>> # F is the feature vector got from evaluation set of the labeled data.
        >>> out = dt.predict(table=F, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels')
        >>> # A and B are input tables
        >>> em.debug_decisiontree_matcher(dt, A.loc[1], B.loc[2], match_f, H.columns, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels')
    """
    # Doc fix: the example previously used `A.ix[1]` / `B.ix[2]`;
    # `DataFrame.ix` was deprecated and removed from pandas, and the twin
    # copy of this docstring elsewhere in this file already uses `.loc`.
    validate_object_type(feature_table, pd.DataFrame, 'The input feature table')
    # Delegate to the shared helper; ensemble_flag=False selects the
    # single-tree (non-ensemble) debugging code path.
    _debug_decisiontree_matcher(decision_tree, tuple_1, tuple_2, feature_table,
                                table_columns, exclude_attrs,
                                ensemble_flag=False)
def load_object(file_path):
    """
    Loads a Python object from disk.

    This function loads py_entitymatching objects from disk such as
    blockers, matchers, feature table, etc.

    Args:
        file_path (string): The file path to load the object from.

    Returns:
        A Python object read from the file path.

    Raises:
        AssertionError: If `file_path` is not of type string.
        AssertionError: If a file does not exist at the given `file_path`.

    Examples:
        >>> rb = em.load_object('./rule_blocker.pkl')

    See Also:
        :meth:`~save_object`
    """
    # Validate input parameters
    validate_object_type(file_path, six.string_types,
                         error_prefix='Input file path')

    # Check if a file exists at the given file path.
    if not os.path.exists(file_path):
        logger.error('File does not exist at path %s', file_path)
        # Fix: format the path into the message instead of passing it as a
        # second positional argument (which made str(exc) render as a tuple).
        raise AssertionError('File does not exist at path %s' % file_path)

    # Read the object from the file, opening it in binary mode.
    # NOTE(review): pickle.load on an untrusted file can execute arbitrary
    # code — callers should only load files they produced themselves.
    with open(file_path, 'rb') as file_handler:
        object_to_return = pickle.load(file_handler)

    return object_to_return
def _get_xy_data_ex(table, exclude_attrs, target_attr):
    """
    Split `table` into feature matrix x and label vector y for scikit-learn,
    dropping `exclude_attrs` and using `target_attr` as the label column.
    """
    # Validate the input parameters
    # # We expect the input table to be of type pandas DataFrame
    validate_object_type(table, pd.DataFrame)
    # We expect exclude attributes to be of type list. If not convert it into
    # a list.
    if not isinstance(exclude_attrs, list):
        exclude_attrs = [exclude_attrs]
    # Check if the exclude attributes are present in the input table
    if not check_attrs_present(table, exclude_attrs):
        logger.error('The attributes mentioned in exclude_attrs '
                     'is not present '
                     'in the input table')
        raise AssertionError(
            'The attributes mentioned in exclude_attrs '
            'is not present '
            'in the input table')
    # Check if the target attribute is present in the input table
    if not check_attrs_present(table, target_attr):
        logger.error('The target_attr is not present in the input table')
        raise AssertionError(
            'The target_attr is not present in the input table')
    # Drop the duplicates from the exclude attributes
    exclude_attrs = list_drop_duplicates(exclude_attrs)
    # Explicitly add the target attribute to exclude attribute (if it is not
    # already present)
    if target_attr not in exclude_attrs:
        exclude_attrs.append(target_attr)
    # Project the list of attributes that should be used for scikit-learn's
    # functions.
    attrs_to_project = list_diff(list(table.columns), exclude_attrs)
    # Get the values for x
    x = table[attrs_to_project].values
    # Get the values for y (comment fix: this extracts the target labels,
    # the previous comment wrongly said "x" again)
    y = table[target_attr].values
    y = y.ravel()  # to mute warnings from svm and cross validation
    # Return x and y
    return x, y
def _get_xy_data_ex(table, exclude_attrs, target_attr):
    """
    Build the (x, y) arrays consumed by scikit-learn from `table`,
    excluding `exclude_attrs` and taking labels from `target_attr`.
    """
    # The input table must be a pandas DataFrame.
    validate_object_type(table, pd.DataFrame)

    # Normalize a scalar exclude attribute into a list.
    if not isinstance(exclude_attrs, list):
        exclude_attrs = [exclude_attrs]

    # Both the exclude attributes and the target must exist in the table.
    if not check_attrs_present(table, exclude_attrs):
        logger.error('The attributes mentioned in exclude_attrs '
                     'is not present '
                     'in the input table')
        raise AssertionError('The attributes mentioned in exclude_attrs '
                             'is not present '
                             'in the input table')
    if not check_attrs_present(table, target_attr):
        logger.error('The target_attr is not present in the input table')
        raise AssertionError(
            'The target_attr is not present in the input table')

    # De-duplicate, then make sure the target itself is excluded from x.
    exclude_attrs = list_drop_duplicates(exclude_attrs)
    if target_attr not in exclude_attrs:
        exclude_attrs.append(target_attr)

    # Everything that is not excluded becomes a feature column.
    feature_columns = list_diff(list(table.columns), exclude_attrs)
    x = table[feature_columns].values
    # Flatten the labels to mute warnings from svm and cross validation.
    y = table[target_attr].values.ravel()
    return x, y
def check_fk_constraint(df_foreign, attr_foreign, df_base, attr_base):
    """
    Check if the foreign key constraint is satisfied.

    Args:
        df_foreign (pandas dataframe): Foreign dataframe
        attr_foreign (str): Attribute in the foreign dataframe
        df_base (pandas dataframe): Base dataframe
        attr_base (str): Attribute in the base dataframe

    Returns:
        result (bool). Returns True if the foreign key constraint is
        satisfied, else returns False

    Notes:
        This is an internal helper function
    """
    validate_object_type(df_foreign, pd.DataFrame,
                         error_prefix='Input object (df_foreign)')
    validate_object_type(attr_foreign, six.string_types,
                         error_prefix='Input attr (attr_foreign)')
    validate_object_type(df_base, pd.DataFrame,
                         error_prefix='Input object (df_base)')
    validate_object_type(attr_base, six.string_types,
                         error_prefix='Input attr (attr_base)')

    # Both attributes must exist in their respective tables.
    if not check_attrs_present(df_base, attr_base):
        logger.warning('The attribute %s is not in df_base' % attr_base)
        return False
    if not check_attrs_present(df_foreign, attr_foreign):
        logger.error('Input attr (attr_foreign) is not in df_foreign')
        return False

    # A foreign key may not contain nulls.
    if any(pd.isnull(df_foreign[attr_foreign])):
        logger.warning('The attribute %s in foreign table contains null values' % attr_foreign)
        return False

    # Every foreign-key value must appear in the base attribute.
    uniq_fk_vals = set(pd.unique(df_foreign[attr_foreign]))
    base_attr_vals = df_base[attr_base].values
    missing = uniq_fk_vals.difference(base_attr_vals)
    if len(missing) > 0:
        logger.warning('For some attr. values in (%s) in the foreign table there are no values in '
                       '(%s) in the base table' % (attr_foreign, attr_base))
        return False

    # Check whether the referenced values are unique in the base table.
    t = df_base[df_base[attr_base].isin(pd.unique(df_foreign[attr_foreign]))]
    status = is_key_attribute(t, attr_base)
    # Idiom fix: `status == False` → `not status`. Message fix: the two
    # concatenated literals produced "…%s)in %s is not satisifed" (missing
    # space and typo).
    if not status:
        logger.warning('Key attr. constraint for the subset of values (derived from. %s) '
                       'in %s is not satisfied' % (attr_foreign, attr_base))
        return False
    return True
def _validate_inputs(table, label_column_name, verbose):
    """
    Validate the inputs for the label_table function.

    Returns True when the table, the new label column name and the
    candidate-set metadata are all valid; raises otherwise.
    """
    # The input table must be a pandas DataFrame and the label column
    # name must be a string.
    validate_object_type(table, pd.DataFrame)
    validate_object_type(label_column_name, six.string_types,
                         error_prefix='Input attr.')

    # The label column must not clash with an existing column.
    if ch.check_attrs_present(table, label_column_name):
        logger.error(
            'The label column name (%s) is already present in the '
            'input table', label_column_name)
        raise AssertionError(
            'The label column name (%s) is already present '
            'in the input table', label_column_name)

    # Validate the candidate-set metadata, which will later be copied to
    # the output DataFrame. First log what metadata this function needs.
    ch.log_info(
        logger, 'Required metadata: cand.set key, fk ltable, '
                'fk rtable, ltable, rtable, ltable key, rtable key',
        verbose)
    # Fetch the metadata and then validate it.
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Everything checked out.
    return True
def data_explore_pandastable(df):
    """
    Wrapper function for pandastable. Gives user a GUI to examine and edit
    the dataframe passed in using pandastable.

    Args:
        df (Dataframe): The pandas dataframe to be explored with pandastable.

    Raises:
        AssertionError: If `df` is not of type pandas DataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table.csv', key='ID')
        >>> em.data_explore_pandastable(A)
    """
    # Reject anything that is not a pandas DataFrame up front, then hand
    # the frame to the pandastable-backed GUI explorer.
    validate_object_type(df, pd.DataFrame, 'Input df')
    DataExplorePandastable(df)
def is_key_attribute(df, attr, verbose=False):
    """
    Check if an attribute is a key attribute.

    Args:
        df (pandas dataframe): Input dataframe
        attr (str): Attribute in the pandas dataframe
        verbose (bool): Flag to indicate whether warnings should be printed out

    Returns:
        result (bool). True if the attribute is a key attribute (unique and
        without missing values), else False.
    """
    validate_object_type(df, pd.DataFrame)
    validate_object_type(attr, six.string_types, error_prefix='Input attr.')
    # An empty dataframe trivially satisfies the key property.
    if len(df) == 0:
        return True
    column = df[attr]
    # Key attributes must be unique ...
    if len(pd.unique(column)) != len(df):
        if verbose:
            logger.warning('Attribute ' + attr + ' does not contain unique values')
        return False
    # ... and must not contain missing values.
    if any(pd.isnull(column)):
        if verbose:
            logger.warning('Attribute ' + attr + ' contains missing values')
        return False
    return True
def _validate_inputs(table, label_column_name, verbose): """ This function validates the inputs for the label_table function """ # Validate the input parameters # # The input table table is expected to be of type pandas DataFrame validate_object_type(table, pd.DataFrame) # # The label column name is expected to be of type string validate_object_type(label_column_name, six.string_types, error_prefix='Input attr.') # # Check if the label column name is already present in the input table if ch.check_attrs_present(table, label_column_name): logger.error('The label column name (%s) is already present in the ' 'input table', label_column_name) raise AssertionError('The label column name (%s) is already present ' 'in the input table', label_column_name) # Now, validate the metadata for the input DataFrame as we have to copy # these properties to the output DataFrame # # First, display what metadata is required for this function ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, ' 'fk rtable, ltable, rtable, ltable key, rtable key', verbose) # # Second, get the metadata key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \ cm.get_metadata_for_candset(table, logger, verbose) # # Third, validate the metadata cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, logger, verbose) # Return True if everything was successful return True
def is_key_attribute(df, attr, verbose=False):
    """
    Check if an attribute is a key attribute

    Args:
        df (pandas dataframe): Input dataframe
        attr (str): Attribute in the pandas dataframe
        verbose (bool): Flag to indicate whether warnings should be printed out

    Returns:
        result (bool). Returns True, if the attribute is a key attribute
        (unique and without missing values), else returns False
    """
    validate_object_type(df, pd.DataFrame)
    validate_object_type(attr, six.string_types, error_prefix='Input attr.')
    # check if the length is > 0; an empty table trivially has a key
    if len(df) > 0:
        # check for uniqueness
        uniq_flag = len(pd.unique(df[attr])) == len(df)
        if not uniq_flag:
            if verbose:
                logger.warning('Attribute ' + attr + ' does not contain unique values')
            return False
        # check if there are missing or null values
        nan_flag = not any(pd.isnull(df[attr]))
        if not nan_flag:
            if verbose:
                logger.warning('Attribute ' + attr + ' contains missing values')
            return False
        # both flags are True here, so this always returns True
        return uniq_flag and nan_flag
    else:
        return True
def data_explore_openrefine(df, server='http://127.0.0.1:3333', name=None):
    """
    Wrapper function for using OpenRefine. Gives user a GUI to examine and
    edit the dataframe passed in using OpenRefine.

    Args:
        df (Dataframe): The pandas dataframe to be explored with pandastable.
        server (String): The address of the OpenRefine server
            (defaults to http://127.0.0.1:3333).
        name (String): The name given to the file and project in OpenRefine.

    Raises:
        AssertionError: If `df` is not of type pandas DataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table.csv', key='ID')
        >>> em.data_explore_openrefine(A, name='Table')
    """
    # Only the dataframe's type is validated here; server reachability is
    # handled by the OpenRefine explorer itself.
    validate_object_type(df, pd.DataFrame, 'Input df')
    explorer = DataExploreOpenRefine(df, server, name)
    return explorer
def is_attr_unique(df, attr):
    """
    Check if the attribute is unique in a dataframe

    Args:
        df (pandas dataframe): Input dataframe
        attr (str): Attribute in the pandas dataframe

    Returns:
        result (bool). Returns True, if the attribute contains unique values,
        else returns False

    Notes:
        This is an internal helper function
    """
    validate_object_type(df, pd.DataFrame)
    validate_object_type(attr, six.string_types, error_prefix='Input attr.')
    # Idiom fix: the comparison is already the boolean answer; drop the
    # `if not flag: return False else: return True` detour.
    return len(pd.unique(df[attr])) == len(df)
def is_attr_unique(df, attr):
    """
    Return True when every value of column `attr` in `df` is distinct.

    Args:
        df (pandas dataframe): Input dataframe
        attr (str): Attribute in the pandas dataframe

    Returns:
        result (bool). True when the attribute contains unique values,
        False otherwise.

    Notes:
        This is an internal helper function
    """
    validate_object_type(df, pd.DataFrame)
    validate_object_type(attr, six.string_types, error_prefix='Input attr.')
    # Unique iff the number of distinct values equals the row count.
    distinct_count = len(pd.unique(df[attr]))
    if distinct_count == len(df):
        return True
    return False
def sample_table(table, sample_size, replace=False, verbose=False):
    """
    Samples a candidate set of tuple pairs (for labeling purposes).

    This function samples a DataFrame, typically used for labeling
    purposes. This function expects the input DataFrame containing the
    metadata of a candidate set (such as key, fk_ltable, fk_rtable, ltable,
    rtable). Specifically, this function creates a copy of the input
    DataFrame, samples the data using uniform random sampling (uses numpy's
    `random.choice` to sample) and returns the sampled DataFrame.
    Further, also copies the properties from the input DataFrame to the
    output DataFrame.

    Args:
        table (DataFrame): The input DataFrame to be sampled.
            Specifically, a DataFrame containing the metadata of a candidate
            set (such as key, fk_ltable, fk_rtable, ltable, rtable) in the
            catalog.
        sample_size (int): The number of samples to be picked from the input
            DataFrame.
        replace (boolean): A flag to indicate whether sampling should be
            done with replacement or not (defaults to False).
        verbose (boolean): A flag to indicate whether more detailed
            information about the execution steps should be printed out
            (defaults to False).

    Returns:
        A new DataFrame with 'sample_size' number of rows. Further, this
        function sets the output DataFrame's properties same as input
        DataFrame.

    Raises:
        AssertionError: If `table` is not of type pandas DataFrame.
        AssertionError: If the size of `table` is 0.
        AssertionError: If the `sample_size` is greater than the input
            DataFrame size.

    Examples:
        >>> import py_entitymatching as em
        >>> S = em.sample_table(C, sample_size=450) # C is the candidate set to be sampled from.

    Note:
        As mentioned in the above description, the output DataFrame is
        updated (in the catalog) with the properties from the input
        DataFrame. A subtle point to note here is, when the replace flag is
        set to True, then the output DataFrame can contain duplicate keys.
        In that case, this function will not set the key and it is up to
        the user to fix it after the function returns.
    """
    # Fix: `pd.np` was deprecated and removed in recent pandas releases;
    # use numpy directly instead of `pd.np.random.choice`.
    import numpy as np

    # Validate input parameters.
    # # The input DataFrame is expected to be of type pandas DataFrame.
    validate_object_type(table, pd.DataFrame)
    # # There should be at least one row to sample from.
    if len(table) == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')
    # # The sample size should be less than or equal to the number of rows
    # # in the input DataFrame.
    if len(table) < sample_size:
        logger.error('Sample size is larger than the input table size')
        raise AssertionError('Sample size is larger than the input table size')

    # Validate the metadata for the input DataFrame as we have to copy
    # these properties to the output DataFrame.
    # # First, display what metadata is required for this function.
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, ltable, rtable, ltable key, rtable key',
                verbose)
    # # Second, get the metadata.
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)
    # # Third, validate the metadata.
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Draw the sample indices (uniform random), then sort them so the
    # sampled rows keep the original relative order.
    sample_indices = np.random.choice(len(table), sample_size,
                                      replace=replace)
    sample_indices = sorted(sample_indices)  # sorted() already returns a list
    sampled_table = table.iloc[sample_indices]

    # Copy the properties to the sampled table.
    cm.init_properties(sampled_table)
    # # If replace is True, duplicate keys are possible, so the key must be
    # # re-validated before being set.
    if replace:
        properties = cm.get_all_properties(table)
        for property_name, property_value in six.iteritems(properties):
            if property_name == 'key':
                # Check for the validity of key before setting it
                cm.set_key(sampled_table, property_value)
            else:
                # Copy the other properties as is
                cm.set_property(sampled_table, property_name, property_value)
    else:
        cm.copy_properties(table, sampled_table)

    # Return the sampled table.
    return sampled_table
def _parse_feat_str(feature_string, tokenizers, similarity_functions):
    """
    Parse a feature expression string and extract its components.

    Given a feature string such as ``jaccard(qgm_3(ltuple['name']),
    qgm_3(rtuple['name']))``, this pulls out the left attribute, the right
    attribute, the tokenizer applied on each side, and the similarity
    function. Components that cannot be parsed are reported as the sentinel
    string 'PARSE_EXP'.

    Args:
        feature_string (string): The feature expression to parse.
        tokenizers (dictionary): Maps tokenizer names to tokenizer functions;
            used to recognize tokenizer tokens in the parsed string.
        similarity_functions (dictionary): Maps similarity function names to
            functions; used to recognize the similarity function token.

    Returns:
        A dictionary with keys 'left_attribute', 'right_attribute',
        'left_attr_tokenizer', 'right_attr_tokenizer', 'simfunction' and
        'is_auto_generated' (always False for hand-written features).

    Raises:
        AssertionError: If `feature_string` is not of type string.
        AssertionError: If `tokenizers` is not of type dictionary.
        AssertionError: If `similarity_functions` is not of type dictionary.
    """
    # Validate the input parameters
    # # We expect the input feature string to be of type string
    validate_object_type(feature_string, six.string_types,
                         error_prefix='Input feature')
    # # We expect the input object tokenizers to be of type python dictionary
    validate_object_type(tokenizers, dict, error_prefix='Input object (tokenizers)')
    # # We expect the input object similarity functions to be of type python
    # dictionary
    validate_object_type(similarity_functions, dict,
                         error_prefix='Input object (similarity_functions)')

    # We will have to parse the feature string. Specifically we use pyparsing
    # module for the parsing purposes
    from pyparsing import Word, alphanums, ParseException

    # Initialize attributes, tokenizers and similarity function parsing
    # results to a sentinel indicating "could not be parsed".
    left_attribute = 'PARSE_EXP'
    right_attribute = 'PARSE_EXP'
    left_attr_tokenizer = 'PARSE_EXP'
    right_attr_tokenizer = 'PARSE_EXP'
    sim_function = 'PARSE_EXP'
    exception_flag = False

    # Grammar: an attribute reference such as ltuple["name"], a tokenizer
    # call wrapping an attribute, and a similarity call either with or
    # without tokenizers on its two arguments.
    attr_name = Word(alphanums + "_" + "." + "[" + "]" + '"' + "'")
    tok_fn = Word(alphanums + "_") + "(" + attr_name + ")"
    wo_tok = Word(alphanums + "_") + "(" + attr_name + "," + attr_name + ")"
    wi_tok = Word(alphanums + "_") + "(" + tok_fn + "," + tok_fn + ")"
    feat = wi_tok | wo_tok

    # Try to parse the string; on failure we fall through and return the
    # sentinel values so callers can detect the parse error.
    try:
        parsed_string = feat.parseString(feature_string)
    except ParseException:
        exception_flag = True

    if not exception_flag:
        # Parse the tokenizers (tokens that name a known tokenizer)
        parsed_tokenizers = [value for value in parsed_string
                             if value in tokenizers.keys()]
        # BUG FIX: the original used `is 2`, an identity comparison that only
        # works because CPython caches small integers; use equality instead.
        if len(parsed_tokenizers) == 2:
            left_attr_tokenizer = parsed_tokenizers[0]
            right_attr_tokenizer = parsed_tokenizers[1]

        # Parse the similarity function (token naming a known sim. function)
        parsed_similarity_function = [value for value in parsed_string
                                      if value in similarity_functions.keys()]
        if len(parsed_similarity_function) == 1:
            sim_function = parsed_similarity_function[0]

        # Parse the left attribute: strip the leading "ltuple[" (7 chars),
        # the trailing "]" and any quote characters around the name.
        attribute = [value for value in parsed_string
                     if value.startswith('ltuple[')]
        if len(attribute) == 1:
            attribute = attribute[0]
            left_attribute = attribute[7:len(attribute) - 1].strip('"').strip(
                "'")

        # Parse the right attribute (same stripping as the left attribute)
        attribute = [val for val in parsed_string if val.startswith('rtuple[')]
        if len(attribute) == 1:
            attribute = attribute[0]
            right_attribute = attribute[7:len(attribute) - 1].strip('"').strip(
                "'")

    # Return the parsed information in a dictionary format.
    parsed_dict = {'left_attribute': left_attribute,
                   'right_attribute': right_attribute,
                   'left_attr_tokenizer': left_attr_tokenizer,
                   'right_attr_tokenizer': right_attr_tokenizer,
                   'simfunction': sim_function,
                   'is_auto_generated': False}

    return parsed_dict
def add_feature(feature_table, feature_name, feature_dict):
    """ Adds a feature to the feature table.

        Specifically, this function is used in combination with
        :meth:`~py_entitymatching.get_feature_fn`: the user first builds a
        feature dictionary with :meth:`~py_entitymatching.get_feature_fn`
        and then appends it to the feature table with this function.

        Args:
            feature_table (DataFrame): A DataFrame containing features.
            feature_name (string): The name that should be given to the feature.
            feature_dict (dictionary): A Python dictionary, that is typically
                returned by executing :meth:`~py_entitymatching.get_feature_fn`.

        Returns:
            A Boolean value of True is returned if the addition was successful.

        Raises:
            AssertionError: If the input `feature_table` is not of type pandas
                DataFrame.
            AssertionError: If `feature_name` is not of type string.
            AssertionError: If `feature_dict` is not of type Python dictionary.
            AssertionError: If the `feature_table` does not have necessary
                columns such as 'feature_name', 'left_attribute',
                'right_attribute', 'left_attr_tokenizer',
                'right_attr_tokenizer', 'simfunction', 'function', and
                'function_source' in the DataFrame.
            AssertionError: If the `feature_name` is already present in the
                feature table.

        Examples:
            >>> import py_entitymatching as em
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
            >>> block_t = em.get_tokenizers_for_blocking()
            >>> block_s = em.get_sim_funs_for_blocking()
            >>> block_f = em.get_features_for_blocking(A, B)
            >>> r = get_feature_fn('jaccard(qgm_3(ltuple.name), qgm_3(rtuple.name)', block_t, block_s)
            >>> em.add_feature(block_f, 'name_name_jac_qgm3_qgm3', r)
            >>> match_t = em.get_tokenizers_for_matching()
            >>> match_s = em.get_sim_funs_for_matching()
            >>> match_f = em.get_features_for_matching(A, B)
            >>> r = get_feature_fn('jaccard(qgm_3(ltuple.name), qgm_3(rtuple.name)', match_t, match_s)
            >>> em.add_feature(match_f, 'name_name_jac_qgm3_qgm3', r)
    """
    # Type-check each of the three inputs before touching the table.
    validate_object_type(feature_table, pd.DataFrame, 'Input feature table')
    validate_object_type(feature_name, six.string_types, 'Input feature name')
    validate_object_type(feature_dict, dict, 'Input feature dictionary')

    # The feature table must carry the full set of required columns.
    absent_cols = get_missing_column_values(feature_table.columns)
    if absent_cols:
        message = "Feature table does not have all required columns\n The following columns are missing: {0}".format(", ".join(absent_cols))
        raise AssertionError(message)

    # Reject duplicate feature names.
    existing_names = list(feature_table['feature_name'])
    if feature_name in existing_names:
        logger.error('Input feature name is already present in feature table')
        raise AssertionError(
            'Input feature name is already present in feature table')

    # Record the name inside the dictionary and append it as the last row.
    feature_dict['feature_name'] = feature_name
    if len(feature_table) == 0:
        # An empty table may lack proper column labels; (re)establish them
        # before the first row is written.
        feature_table.columns = ['feature_name', 'left_attribute',
                                 'right_attribute', 'left_attr_tokenizer',
                                 'right_attr_tokenizer', 'simfunction',
                                 'function', 'function_source',
                                 'is_auto_generated']
    feature_table.loc[len(feature_table)] = feature_dict

    # Finally, return True if everything was fine
    return True
def block_tables(self, ltable, rtable, l_block_attr, r_block_attr,
                 l_output_attrs=None, r_output_attrs=None,
                 l_output_prefix='ltable_', r_output_prefix='rtable_',
                 allow_missing=False, verbose=False, n_ltable_chunks=1,
                 n_rtable_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK

    Blocks two tables based on attribute equivalence.
    Conceptually, this will check `l_block_attr=r_block_attr` for each tuple
    pair from the Cartesian product of tables `ltable` and `rtable`. It outputs a
    Pandas dataframe object with tuple pairs that satisfy the equality condition.
    The dataframe will include attributes '_id', key attribute from
    ltable, key attributes from rtable, followed by lists `l_output_attrs` and
    `r_output_attrs` if they are specified. Each of these output and key attributes will be
    prefixed with given `l_output_prefix` and `r_output_prefix`. If `allow_missing` is set
    to `True` then all tuple pairs with missing value in at least one of the tuples will be
    included in the output dataframe.
    Further, this will update the following metadata in the catalog for the output table:
    (1) key, (2) ltable, (3) rtable, (4) fk_ltable, and (5) fk_rtable.

    Args:
        ltable (DataFrame): The left input table.
        rtable (DataFrame): The right input table.
        l_block_attr (string): The blocking attribute in left table.
        r_block_attr (string): The blocking attribute in right table.
        l_output_attrs (list): A list of attribute names from the left
                               table to be included in the
                               output candidate set (defaults to None).
        r_output_attrs (list): A list of attribute names from the right
                               table to be included in the
                               output candidate set (defaults to None).
        l_output_prefix (string): The prefix to be used for the attribute names
                                  coming from the left table in the output
                                  candidate set (defaults to 'ltable\\_').
        r_output_prefix (string): The prefix to be used for the attribute names
                                  coming from the right table in the output
                                  candidate set (defaults to 'rtable\\_').
        allow_missing (boolean): A flag to indicate whether tuple pairs
                                 with missing value in at least one of the
                                 blocking attributes should be included in
                                 the output candidate set (defaults to
                                 False). If this flag is set to True, a
                                 tuple in ltable with missing value in the
                                 blocking attribute will be matched with
                                 every tuple in rtable and vice versa.
        verbose (boolean): A flag to indicate whether the debug information
                           should be logged (defaults to False).
        n_ltable_chunks (int): The number of partitions to split the left table (
                                defaults to 1). If it is set to -1, then the number of
                                partitions is set to the number of cores in the
                                machine.
        n_rtable_chunks (int): The number of partitions to split the right table (
                                defaults to 1). If it is set to -1, then the number of
                                partitions is set to the number of cores in the
                                machine.

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `ltable` is not of type pandas
            DataFrame.
        AssertionError: If `rtable` is not of type pandas
            DataFrame.
        AssertionError: If `l_block_attr` is not of type string.
        AssertionError: If `r_block_attr` is not of type string.
        AssertionError: If `l_output_attrs` is not of type of
            list.
        AssertionError: If `r_output_attrs` is not of type of
            list.
        AssertionError: If the values in `l_output_attrs` is not of type
            string.
        AssertionError: If the values in `r_output_attrs` is not of type
            string.
        AssertionError: If `l_output_prefix` is not of type
            string.
        AssertionError: If `r_output_prefix` is not of type
            string.
        AssertionError: If `verbose` is not of type
            boolean.
        AssertionError: If `allow_missing` is not of type boolean.
        AssertionError: If `n_ltable_chunks` is not of type
            int.
        AssertionError: If `n_rtable_chunks` is not of type
            int.
        AssertionError: If `l_block_attr` is not in the ltable columns.
        AssertionError: If `r_block_attr` is not in the rtable columns.
        AssertionError: If `l_out_attrs` are not in the ltable.
        AssertionError: If `r_out_attrs` are not in the rtable.

    Examples:
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_attr_equiv_blocker import DaskAttrEquivalenceBlocker
        >>> ab = DaskAttrEquivalenceBlocker()
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> C1 = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name'])
        # Include all possible tuple pairs with missing values
        >>> C2 = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name'], allow_missing=True)
    """
    logger.warning("WARNING THIS BLOCKER IS EXPERIMENTAL AND NOT TESTED. "
                   "USE AT YOUR OWN RISK.")

    # validate data types of input parameters
    self.validate_types_params_tables(ltable, rtable, l_output_attrs,
                                      r_output_attrs, l_output_prefix,
                                      r_output_prefix, verbose, 1)
    # last arg is set to 1 just to reuse the function from the old blocker.

    # validate data types of input blocking attributes
    self.validate_types_block_attrs(l_block_attr, r_block_attr)

    # validate data type of allow_missing
    self.validate_allow_missing(allow_missing)

    # validate input parameters
    self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr)
    self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs)

    # validate number of ltable and rtable chunks
    validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks')
    validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks')
    validate_chunks(n_ltable_chunks)
    validate_chunks(n_rtable_chunks)

    # get and validate required metadata
    log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

    # # get metadata
    l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger,
                                                 verbose)

    # # validate metadata
    cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose)
    cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose)

    # do blocking

    # # do projection of required attributes from the tables so only the
    # columns needed for blocking/output travel to the workers
    l_proj_attrs = self.get_attrs_to_project(l_key, l_block_attr,
                                             l_output_attrs)
    ltable_proj = ltable[l_proj_attrs]
    r_proj_attrs = self.get_attrs_to_project(r_key, r_block_attr,
                                             r_output_attrs)
    rtable_proj = rtable[r_proj_attrs]

    # # remove records with nans in the blocking attribute
    l_df = rem_nan(ltable_proj, l_block_attr)
    r_df = rem_nan(rtable_proj, r_block_attr)

    # # determine the number of chunks
    n_ltable_chunks = get_num_partitions(n_ltable_chunks, len(ltable))
    n_rtable_chunks = get_num_partitions(n_rtable_chunks, len(rtable))

    if n_ltable_chunks == 1 and n_rtable_chunks == 1:
        # single process
        candset = _block_tables_split(l_df, r_df, l_key, r_key,
                                      l_block_attr, r_block_attr,
                                      l_output_attrs, r_output_attrs,
                                      l_output_prefix, r_output_prefix,
                                      allow_missing)
    else:
        l_splits = np.array_split(l_df, n_ltable_chunks)
        r_splits = np.array_split(r_df, n_rtable_chunks)
        c_splits = []

        # Build the full cross product of chunk pairs as delayed tasks.
        for l in l_splits:
            for r in r_splits:
                partial_result = delayed(_block_tables_split)(
                    l, r, l_key, r_key,
                    l_block_attr, r_block_attr,
                    l_output_attrs, r_output_attrs,
                    l_output_prefix, r_output_prefix,
                    allow_missing)
                c_splits.append(partial_result)
        c_splits = delayed(wrap)(c_splits)
        # BUG FIX: dask's multiprocessing scheduler takes `num_workers`,
        # not `n_jobs` (matches the usage in DaskBlackBoxBlocker).
        c_splits = c_splits.compute(scheduler="processes",
                                    num_workers=get_num_cores())
        candset = pd.concat(c_splits, ignore_index=True)

    # if allow_missing flag is True, then compute
    # all pairs with missing value in left table, and
    # all pairs with missing value in right table
    if allow_missing:
        missing_pairs = self.get_pairs_with_missing_value(
            ltable_proj, rtable_proj, l_key, r_key,
            l_block_attr, r_block_attr,
            l_output_attrs, r_output_attrs,
            l_output_prefix, r_output_prefix)
        candset = pd.concat([candset, missing_pairs], ignore_index=True)

    # update catalog
    key = get_name_for_key(candset.columns)
    candset = add_key_column(candset, key)
    cm.set_candset_properties(candset, key, l_output_prefix + l_key,
                              r_output_prefix + r_key, ltable, rtable)

    # return candidate set
    return candset
def _vis_debug_dt(matcher, train, test, exclude_attrs, target_attr,
                  show_window=True):
    """
    Wrapper function for debugging the Decision Tree matcher visually.

    Fits the given Decision Tree matcher on `train`, predicts on `test`,
    computes the evaluation summary, and (optionally) opens a PyQt5 GUI
    window showing the metric along with the false positives and false
    negatives.

    Args:
        matcher (DTMatcher): The Decision Tree matcher to debug.
        train (DataFrame): Labeled data used to fit the matcher.
        test (DataFrame): Labeled data used to evaluate the matcher.
        exclude_attrs (list or string): Attribute(s) to exclude from
            fitting/prediction (e.g. keys and foreign keys).
        target_attr (string): The attribute holding the gold labels.
        show_window (boolean): Whether to display the GUI window
            (defaults to True).

    Raises:
        ImportError: If PyQt5 is not installed.
        AssertionError: If `matcher` is not of type DTMatcher.
        AssertionError: If `target_attr` is not of type string.
        AssertionError: If `exclude_attrs` or `target_attr` are not present
            in the train/test DataFrames.
    """
    try:
        from PyQt5 import QtWidgets
        from py_entitymatching.gui.debug_gui_base import MainWindowManager
    except ImportError:
        raise ImportError('PyQt5 is not installed. Please install PyQt5 to use '
                          'GUI related functions in py_entitymatching.')

    # Validate the input parameters
    # # We expect the matcher to be of type DTMatcher
    if not isinstance(matcher, DTMatcher):
        logger.error('Input matcher is not of type Decision Tree matcher')
        raise AssertionError('Input matcher is not of type '
                             'Decision Tree matcher')

    # # We expect the target attribute to be of type string.
    validate_object_type(target_attr, six.string_types,
                         error_prefix='Target attribute')

    # # Check whether the exclude attributes are indeed present in the train
    # DataFrame.
    if not ch.check_attrs_present(train, exclude_attrs):
        logger.error('The exclude attrs are not in train table columns')
        raise AssertionError('The exclude attrs are not in the '
                             'train table columns')

    # # Check whether the target attribute is indeed present in the train
    # DataFrame.
    if not ch.check_attrs_present(train, target_attr):
        logger.error('The target attr is not in train table columns')
        raise AssertionError('The target attr is not in the '
                             'train table columns')

    # # Check whether the exclude attributes are indeed present in the test
    # DataFrame.
    if not ch.check_attrs_present(test, exclude_attrs):
        logger.error('The exclude attrs are not in test table columns')
        raise AssertionError('The exclude attrs are not in the '
                             'test table columns')

    # The exclude attributes is expected to be of type list, if not
    # explicitly convert this into a list.
    if not isinstance(exclude_attrs, list):
        exclude_attrs = [exclude_attrs]

    # Drop the duplicates from the exclude attributes
    exclude_attrs = gh.list_drop_duplicates(exclude_attrs)

    # If the target attribute is not present in the exclude attributes,
    # then explicitly add it to the exclude attributes.
    if target_attr not in exclude_attrs:
        exclude_attrs.append(target_attr)

    # Now, fit using training data
    matcher.fit(table=train, exclude_attrs=exclude_attrs,
                target_attr=target_attr)

    # Get a column name to store the predictions.
    predict_attr_name = get_name_for_predict_column(test.columns)

    # Predict using the test data
    predicted = matcher.predict(table=test, exclude_attrs=exclude_attrs,
                                target_attr=predict_attr_name, append=True,
                                inplace=False)

    # Get the evaluation summary.
    eval_summary = eval_matches(predicted, target_attr, predict_attr_name)

    # Get metric in a form that can be displayed from the evaluation summary
    metric = _get_metric(eval_summary)

    # Get false negatives and false positives as a DataFrame
    fp_dataframe = _get_dataframe(predicted, eval_summary['false_pos_ls'])
    fn_dataframe = _get_dataframe(predicted, eval_summary['false_neg_ls'])

    # Reuse the running Qt application instance if one exists; otherwise
    # create one (Qt permits only a single QApplication per process).
    em._viewapp = QtWidgets.QApplication.instance()
    if em._viewapp is None:
        em._viewapp = QtWidgets.QApplication([])
    # Get the main window application
    # (BUG FIX: the original assigned `app = em._viewapp` twice.)
    app = em._viewapp

    m = MainWindowManager(matcher, "dt", exclude_attrs, metric, predicted,
                          fp_dataframe, fn_dataframe)

    # If the show window is true, then display the window.
    if show_window:
        m.show()
        app.exec_()
def get_attr_corres(ltable, rtable):
    """
    This function gets the attribute correspondences between the attributes
    of ltable and rtable.

    The user may need to get the correspondences so
    that he/she can generate features based on those correspondences.

    Args:
        ltable,rtable (DataFrame): Input DataFrames for which the attribute
            correspondences must be obtained.

    Returns:
        A Python dictionary is returned containing the attribute
        correspondences. Specifically, this returns a dictionary with the
        following key-value pairs:

        corres: points to the list correspondences as tuples. Each
        correspondence is a tuple with two attributes: one from ltable
        and the other from rtable.

        ltable: points to ltable.

        rtable: points to rtable.

        Currently, 'corres' contains only pairs of attributes with exact
        names in ltable and rtable.

    Raises:
        AssertionError: If `ltable` is not of type
            pandas DataFrame.
        AssertionError: If `rtable` is not of type
            pandas DataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_c = em.get_attr_corres(A, B)
    """
    # Both inputs must be pandas DataFrames.
    validate_object_type(ltable, pd.DataFrame, error_prefix='Input ltable')
    validate_object_type(rtable, pd.DataFrame, error_prefix='Input rtable')

    # Pair up every ltable column that also appears (by exact name) in
    # rtable. Note: this may not be the fastest way to implement this; we
    # could refactor it later.
    shared_pairs = [(column, column)
                    for column in ltable.columns
                    if column in rtable.columns]

    # Package the correspondences together with both input tables.
    return {'corres': shared_pairs,
            'ltable': ltable,
            'rtable': rtable}
def get_false_negatives_as_df(table, eval_summary, verbose=False): """ Select only the false negatives from the input table and return as a DataFrame based on the evaluation results. Args: table (DataFrame): The input table (pandas DataFrame) that was used for evaluation. eval_summary (dictionary): A Python dictionary containing evaluation results, typically from 'eval_matches' command. Returns: A pandas DataFrame containing only the false negatives from the input table. Further, this function sets the output DataFrame's properties same as input DataFrame. Examples: >>> import py_entitymatching as em >>> # G is the labeled data used for development purposes, match_f is the feature table >>> H = em.extract_feat_vecs(G, feat_table=match_f, attrs_after='gold_labels') >>> dt = em.DTMatcher() >>> dt.fit(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels') >>> pred_table = dt.predict(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], append=True, target_attr='predicted_labels') >>> eval_summary = em.eval_matches(pred_table, 'gold_labels', 'predicted_labels') >>> false_neg_df = em.get_false_negatives_as_df(H, eval_summary) """ # Validate input parameters # # We expect the input candset to be of type pandas DataFrame. 
validate_object_type(table, pd.DataFrame, error_prefix='Input cand.set') # Do metadata checking # # Mention what metadata is required to the user ch.log_info( logger, 'Required metadata: cand.set key, fk ltable, ' 'fk rtable, ' 'ltable, rtable, ltable key, rtable key', verbose) # # Get metadata ch.log_info(logger, 'Getting metadata from the catalog', verbose) key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \ cm.get_metadata_for_candset( table, logger, verbose) # # Validate metadata ch.log_info(logger, 'Validating metadata', verbose) cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, logger, verbose) data_frame = _get_dataframe(table, eval_summary['false_neg_ls']) # # Update catalog ch.log_info(logger, 'Updating catalog', verbose) cm.init_properties(data_frame) cm.copy_properties(table, data_frame) # # Update catalog ch.log_info(logger, 'Returning the dataframe', verbose) return data_frame
def eval_matches(data_frame, gold_label_attr, predicted_label_attr):
    """
    Evaluates the matches from the matcher.

    Specifically, given a DataFrame containing golden labels and predicted
    labels, this function would evaluate the matches and return the accuracy
    results such as precision, recall and F1.

    Args:
        data_frame (DataFrame): The input pandas DataFrame containing "gold"
            labels and "predicted" labels.
        gold_label_attr (string): An attribute in the input DataFrame
            containing "gold" labels.
        predicted_label_attr (string): An attribute in the input DataFrame
            containing "predicted" labels.

    Returns:
        A Python dictionary containing the accuracy measures such as
        precision, recall, F1.

    Raises:
        AssertionError: If `data_frame` is not of type
            pandas DataFrame.
        AssertionError: If `gold_label_attr` is not of
            type string.
        AssertionError: If `predicted_label_attr` is not of
            type string.
        AssertionError: If the `gold_label_attr` is not in
            the input dataFrame.
        AssertionError: If the `predicted_label_attr` is not in
            the input dataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> # G is the labeled data used for development purposes, match_f is the feature table
        >>> H = em.extract_feat_vecs(G, feat_table=match_f, attrs_after='gold_labels')
        >>> dt = em.DTMatcher()
        >>> dt.fit(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels')
        >>> pred_table = dt.predict(table=H,  exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'],  append=True, target_attr='predicted_labels')
        >>> eval_summary = em.eval_matches(pred_table, 'gold_labels', 'predicted_labels')
    """
    # Validate input parameters
    # # We expect the input object to be of type pandas DataFrame
    validate_object_type(data_frame, pd.DataFrame, 'The input table')

    # # We expect the input attribute (gold_label_attr) to be of type string
    validate_object_type(gold_label_attr, six.string_types,
                         'The input gold_label_attr')

    # # We expect the input attribute (predicted_label_attr) to be of type
    # string
    validate_object_type(predicted_label_attr, six.string_types,
                         'The input predicted_label_attr')

    # Check whether the gold label attribute is present in the input table
    if not ch.check_attrs_present(data_frame, gold_label_attr):
        logger.error(
            'The gold_label_attr is not present in the input DataFrame')
        raise AssertionError(
            'The gold_label_attr is not present in the input DataFrame')

    # Check whether the predicted label attribute is present in the input table
    if not ch.check_attrs_present(data_frame, predicted_label_attr):
        logger.error(
            'The predicted_label_attr is not present in the input DataFrame')
        raise AssertionError(
            'The predicted_label_attr is not present in the input DataFrame')

    # Reset the index to get the indices set as 0..len(table). All the
    # positional index sets computed below are relative to this 0-based
    # index, which is what makes the later `.iloc` lookups valid.
    new_data_frame = data_frame.reset_index(drop=False, inplace=False)

    # Project out the gold and label attributes.
    gold = new_data_frame[gold_label_attr]
    predicted = new_data_frame[predicted_label_attr]

    # Get gold negatives, positives (as arrays of 0-based row positions)
    gold_negative = gold[gold == 0].index.values
    gold_positive = gold[gold == 1].index.values

    # Get predicted negatives, positives
    predicted_negative = predicted[predicted == 0].index.values
    predicted_positive = predicted[predicted == 1].index.values

    # get false positive indices (gold says non-match, prediction says match)
    false_positive_indices = \
        list(set(gold_negative).intersection(predicted_positive))

    # get true positive indices
    true_positive_indices = \
        list(set(gold_positive).intersection(predicted_positive))

    # get false negative indices (gold says match, prediction says non-match)
    false_negative_indices = \
        list(set(gold_positive).intersection(predicted_negative))

    # get true negative indices
    true_negative_indices = \
        list(set(gold_negative).intersection(predicted_negative))

    # Get the number of TP, FP, FN, TN (as floats so the divisions below
    # are true division under Python 2 as well)
    num_true_positives = float(len(true_positive_indices))
    num_false_positives = float(len(false_positive_indices))
    num_false_negatives = float(len(false_negative_indices))
    num_true_negatives = float(len(true_negative_indices))

    # Precision = num_tp/ (num_tp + num_fp)
    # Get precision numerator, denominator
    precision_numerator = num_true_positives
    precision_denominiator = num_true_positives + num_false_positives

    # Recall = num_tp/ (num_tp + num_fn)
    # Get recall numerator, denominator
    recall_numerator = num_true_positives
    recall_denominator = num_true_positives + num_false_negatives

    # Compute precision (guard against division by zero)
    if precision_denominiator == 0.0:
        precision = 0.0
    else:
        precision = precision_numerator / precision_denominiator

    # Compute recall (guard against division by zero)
    if recall_denominator == 0.0:
        recall = 0.0
    else:
        recall = recall_numerator / recall_denominator

    # Compute F1 (harmonic mean; defined as 0 when both prec. and recall are 0)
    if precision == 0.0 and recall == 0.0:
        F1 = 0.0
    else:
        F1 = (2.0 * precision * recall) / (precision + recall)

    # Get the fk_ltable and fk_rtable from the catalog
    fk_ltable = cm.get_property(data_frame, 'fk_ltable')
    fk_rtable = cm.get_property(data_frame, 'fk_rtable')

    # Check if the fk_ltable contains any missing values; the (fk_ltable,
    # fk_rtable) pair is used as the row identifier below, so missing
    # values would make the identifiers ambiguous.
    if ch.does_contain_missing_vals(data_frame, fk_ltable):
        logger.error('The fk_ltable (%s) contains missing values' %
                     fk_ltable)
        raise AssertionError('The fk_ltable (%s) contains missing values' %
                             fk_ltable)

    # Check if the fk_rtable contains any missing values
    if ch.does_contain_missing_vals(data_frame, fk_rtable):
        logger.error('The fk_rtable (%s) contains missing values' %
                     fk_rtable)
        raise AssertionError('The fk_rtable (%s) contains missing values' %
                             fk_rtable)

    # Set the index values to fk_ltable and fk_rtable. This only changes
    # the labels, not row positions, so the positional FP/FN index lists
    # computed above can still be resolved with `.iloc`.
    new_data_frame.set_index([fk_ltable, fk_rtable], drop=False, inplace=True)

    # Get the list of false positives and false negatives as
    # (fk_ltable, fk_rtable) id pairs.
    false_pos_ls = list(
        new_data_frame.iloc[false_positive_indices].index.values)
    false_neg_ls = list(
        new_data_frame.iloc[false_negative_indices].index.values)

    # Store and return the accuracy results in a fixed key order.
    accuracy_results = collections.OrderedDict()
    accuracy_results['prec_numerator'] = precision_numerator
    accuracy_results['prec_denominator'] = precision_denominiator
    accuracy_results['precision'] = precision
    accuracy_results['recall_numerator'] = recall_numerator
    accuracy_results['recall_denominator'] = recall_denominator
    accuracy_results['recall'] = recall
    accuracy_results['f1'] = F1
    accuracy_results['pred_pos_num'] = num_true_positives + \
                                       num_false_positives
    accuracy_results['false_pos_num'] = num_false_positives
    accuracy_results['false_pos_ls'] = false_pos_ls
    accuracy_results['pred_neg_num'] = num_false_negatives + \
                                       num_true_negatives
    accuracy_results['false_neg_num'] = num_false_negatives
    accuracy_results['false_neg_ls'] = false_neg_ls
    return accuracy_results
def block_tables(self, ltable, rtable, l_output_attrs=None,
                 r_output_attrs=None, l_output_prefix='ltable_',
                 r_output_prefix='rtable_', verbose=False, show_progress=True,
                 n_ltable_chunks=1, n_rtable_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.

    Blocks two tables based on a black box blocking function specified
    by the user.
    Finds tuple pairs from left and right tables that survive the black
    box function. A tuple pair survives the black box blocking function if
    the function returns False for that pair, otherwise the tuple pair is
    dropped.

    Args:
        ltable (DataFrame): The left input table.
        rtable (DataFrame): The right input table.
        l_output_attrs (list): A list of attribute names from the left
                               table to be included in the
                               output candidate set (defaults to None).
        r_output_attrs (list): A list of attribute names from the right
                               table to be included in the
                               output candidate set (defaults to None).
        l_output_prefix (string): The prefix to be used for the attribute names
                                  coming from the left table in the output
                                  candidate set (defaults to 'ltable\\_').
        r_output_prefix (string): The prefix to be used for the attribute names
                                  coming from the right table in the output
                                  candidate set (defaults to 'rtable\\_').
        verbose (boolean): A flag to indicate whether the debug
            information  should be logged (defaults to False).
        show_progress (boolean): A flag to indicate whether progress should
                                 be displayed to the user (defaults to True).
        n_ltable_chunks (int): The number of partitions to split the left table (
                                defaults to 1). If it is set to -1, then the number of
                                partitions is set to the number of cores in the
                                machine.
        n_rtable_chunks (int): The number of partitions to split the right table (
                                defaults to 1). If it is set to -1, then the number of
                                partitions is set to the number of cores in the
                                machine.

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `ltable` is not of type pandas
            DataFrame.
        AssertionError: If `rtable` is not of type pandas
            DataFrame.
        AssertionError: If `l_output_attrs` is not of type of
            list.
        AssertionError: If `r_output_attrs` is not of type of
            list.
        AssertionError: If values in `l_output_attrs` is not of type
            string.
        AssertionError: If values in `r_output_attrs` is not of type
            string.
        AssertionError: If `l_output_prefix` is not of type
            string.
        AssertionError: If `r_output_prefix` is not of type
            string.
        AssertionError: If `verbose` is not of type
            boolean.
        AssertionError: If `show_progress` is not of type boolean.
        AssertionError: If `n_ltable_chunks` is not of type
            int.
        AssertionError: If `n_rtable_chunks` is not of type
            int.
        AssertionError: If `l_out_attrs` are not in the ltable.
        AssertionError: If `r_out_attrs` are not in the rtable.

    Examples:
        >>> def match_last_name(ltuple, rtuple):
            # assume that there is a 'name' attribute in the input tables
            # and each value in it has two words
            l_last_name = ltuple['name'].split()[1]
            r_last_name = rtuple['name'].split()[1]
            if l_last_name != r_last_name:
                return True
            else:
                return False
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_black_box_blocker DaskBlackBoxBlocker
        >>> bb = DaskBlackBoxBlocker()
        >>> bb.set_black_box_function(match_last_name)
        >>> C = bb.block_tables(A, B, l_output_attrs=['name'], r_output_attrs=['name'] )
    """
    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.")

    # validate data types of standard input parameters
    self.validate_types_params_tables(ltable, rtable, l_output_attrs,
                                      r_output_attrs, l_output_prefix,
                                      r_output_prefix, verbose, 1)

    # validate data type of show_progress
    self.validate_show_progress(show_progress)

    # validate black box function (BUG FIX: identity check, not `!= None`)
    assert self.black_box_function is not None, 'Black box function is not set'

    # validate output attributes
    self.validate_output_attrs(ltable, rtable, l_output_attrs, r_output_attrs)

    # get and validate metadata
    log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

    # # get metadata
    l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger,
                                                 verbose)

    # # validate metadata
    cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose)
    cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose)

    # validate number of ltable and rtable chunks
    validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks')
    validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks')
    validate_chunks(n_ltable_chunks)
    validate_chunks(n_rtable_chunks)

    # # determine the number of chunks
    n_ltable_chunks = get_num_partitions(n_ltable_chunks, len(ltable))
    n_rtable_chunks = get_num_partitions(n_rtable_chunks, len(rtable))

    # do blocking

    # # set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # remove l_key from l_output_attrs and r_key from r_output_attrs,
    # since the keys are always included in the output
    l_output_attrs_1 = []
    if l_output_attrs:
        l_output_attrs_1 = [x for x in l_output_attrs if x != l_key]
    r_output_attrs_1 = []
    if r_output_attrs:
        r_output_attrs_1 = [x for x in r_output_attrs if x != r_key]

    # # pickle the black-box function before passing it as an arg to
    # # _block_tables_split to be executed by each child process
    black_box_function_pkl = cp.dumps(self.black_box_function)

    if n_ltable_chunks == 1 and n_rtable_chunks == 1:
        # single process
        candset = _block_tables_split(l_df, r_df, l_key, r_key,
                                      l_output_attrs_1, r_output_attrs_1,
                                      l_output_prefix, r_output_prefix,
                                      black_box_function_pkl, show_progress)
    else:
        # multiprocessing
        # BUG FIX: `pd.np` was removed in pandas 2.0; use numpy directly.
        l_splits = np.array_split(l_df, n_ltable_chunks)
        r_splits = np.array_split(r_df, n_rtable_chunks)

        c_splits = []
        for i in range(len(l_splits)):
            for j in range(len(r_splits)):
                partial_result = delayed(_block_tables_split)(
                    l_splits[i], r_splits[j],
                    l_key, r_key,
                    l_output_attrs_1, r_output_attrs_1,
                    l_output_prefix, r_output_prefix,
                    black_box_function_pkl, False)
                c_splits.append(partial_result)

        c_splits = delayed(wrap)(c_splits)
        if show_progress:
            with ProgressBar():
                c_splits = c_splits.compute(scheduler="processes",
                                            num_workers=get_num_cores())
        else:
            c_splits = c_splits.compute(scheduler="processes",
                                        num_workers=get_num_cores())

        candset = pd.concat(c_splits, ignore_index=True)

    # # determine the attributes to retain in the output candidate set
    retain_cols = self.get_attrs_to_retain(l_key, r_key,
                                           l_output_attrs, r_output_attrs,
                                           l_output_prefix, r_output_prefix)
    if len(candset) > 0:
        candset = candset[retain_cols]
    else:
        candset = pd.DataFrame(columns=retain_cols)

    # update catalog
    key = get_name_for_key(candset.columns)
    candset = add_key_column(candset, key)
    cm.set_candset_properties(candset, key, l_output_prefix + l_key,
                              r_output_prefix + r_key, ltable, rtable)

    # return candidate set
    return candset
def check_fk_constraint(df_foreign, attr_foreign, df_base, attr_base):
    """
    Check if the foreign key constraint is satisfied between two tables.

    Specifically, verifies that (1) the foreign key column contains no
    missing values, (2) every value of `attr_foreign` in `df_foreign`
    occurs in `attr_base` of `df_base`, and (3) the referenced values are
    unique (form a key) in the base table.

    Args:
        df_foreign (pandas dataframe): Foreign dataframe
        attr_foreign (str): Attribute in the foreign dataframe
        df_base (pandas dataframe): Base dataframe
        attr_base (str): Attribute in the base dataframe

    Returns:
        result (bool). Returns True if the foreign key constraint is
        satisfied, else returns False

    Notes:
        This is an internal helper function
    """
    # Validate input types; these raise AssertionError on mismatch.
    validate_object_type(df_foreign, pd.DataFrame,
                         error_prefix='Input object (df_foreign)')
    validate_object_type(attr_foreign, six.string_types,
                         error_prefix='Input attr (attr_foreign)')
    validate_object_type(df_base, pd.DataFrame,
                         error_prefix='Input object (df_base)')
    validate_object_type(attr_base, six.string_types,
                         error_prefix='Input attr (attr_base)')

    # Both attributes must actually exist in their respective tables.
    if not check_attrs_present(df_base, attr_base):
        logger.warning('The attribute %s is not in df_base' % attr_base)
        return False

    if not check_attrs_present(df_foreign, attr_foreign):
        logger.error('Input attr (attr_foreign) is not in df_foreign')
        return False

    # A foreign key must not contain missing values.
    if df_foreign[attr_foreign].isnull().any():
        logger.warning(
            'The attribute %s in foreign table contains null values' %
            attr_foreign)
        return False

    # Every distinct foreign-key value must appear in the base attribute.
    uniq_fk_vals = set(pd.unique(df_foreign[attr_foreign]))
    base_attr_vals = df_base[attr_base].values
    missing_vals = uniq_fk_vals.difference(base_attr_vals)
    if missing_vals:
        logger.warning(
            'For some attr. values in (%s) in the foreign table there are no values in '
            '(%s) in the base table' % (attr_foreign, attr_base))
        return False

    # Check whether the referenced values are unique in the base table.
    referenced = df_base[df_base[attr_base].isin(
        pd.unique(df_foreign[attr_foreign]))]
    if not is_key_attribute(referenced, attr_base):
        logger.warning(
            'Key attr. constraint for the subset of values (derived from. %s)'
            'in %s is not satisfied' % (attr_foreign, attr_base))
        return False
    return True
def block_candset(self, candset, verbose=False, show_progress=True, n_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK

    Blocks an input candidate set of tuple pairs based on a sequence of
    blocking rules supplied by the user.

    Finds tuple pairs from an input candidate set of tuple pairs that
    survive the sequence of blocking rules. A tuple pair survives the
    sequence of blocking rules if none of the rules in the sequence returns
    True for that pair. If any of the rules returns True, then the pair is
    blocked (dropped).

    Args:
        candset (DataFrame): The input candidate set of tuple pairs.

        verbose (boolean): A flag to indicate whether the debug
            information  should be logged (defaults to False).

        show_progress (boolean): A flag to indicate whether progress should
            be displayed to the user (defaults to True).

        n_chunks (int): The number of partitions to split the candidate set.
            If it is set to -1, the number of partitions will be set to the
            number of cores in the machine.

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `candset` is not of type pandas
            DataFrame.
        AssertionError: If `verbose` is not of type
            boolean.
        AssertionError: If `n_chunks` is not of type
            int.
        AssertionError: If `show_progress` is not of type boolean.
        AssertionError: If there are no rules to apply.

    Examples:
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_rule_based_blocker import DaskRuleBasedBlocker
        >>> rb = DaskRuleBasedBlocker()
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='id')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='id')
        >>> block_f = em.get_features_for_blocking(A, B)
        >>> rule = ['name_name_lev(ltuple, rtuple) > 3']
        >>> rb.add_rule(rule, feature_table=block_f)
        >>> D = rb.block_candset(C) # C is the candidate set.
    """
    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK."
    )

    # validate data types of input parameters
    self.validate_types_params_candset(candset, verbose, show_progress,
                                       n_chunks)

    # get and validate metadata
    log_info(
        logger, 'Required metadata: cand.set key, fk ltable, ' +
        'fk rtable, ltable, rtable, ltable key, rtable key', verbose)

    # # get metadata: the candset's key, its foreign keys into ltable/rtable,
    # # and the two base tables with their keys, all from the catalog.
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(
        candset, logger, verbose)

    # # validate metadata
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # validate rules: there must be at least one rule registered.
    assert len(self.rules.keys()) > 0, 'There are no rules to apply'

    # validate n_chunks parameter
    validate_object_type(n_chunks, int, 'Parameter n_chunks')
    validate_chunks(n_chunks)
    # Resolve -1 (or oversized values) against the candset length.
    n_chunks = get_num_partitions(n_chunks, len(candset))

    # do blocking

    # # initialize the progress bar
    # if show_progress:
    #     bar = pyprind.ProgBar(len(candset))

    # # set index for convenience: lets rule evaluation look up tuples by key.
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # get attributes to project — only the columns the rules need.
    l_proj_attrs, r_proj_attrs = self.get_attrs_to_project(l_key, r_key,
                                                           [], [])
    l_df, r_df = l_df[l_proj_attrs], r_df[r_proj_attrs]

    # Apply every rule (rule_to_exclude=None means exclude nothing).
    c_df = self.block_candset_excluding_rule(candset, l_df, r_df, l_key,
                                             r_key, fk_ltable, fk_rtable,
                                             None, show_progress, n_chunks)

    # update catalog so the output candset keeps its metadata.
    cm.set_candset_properties(c_df, key, fk_ltable, fk_rtable, ltable,
                              rtable)

    # return candidate set
    return c_df
def execute(self, input_table, label_column, inplace=True, verbose=False):
    """ Executes the rules of the match trigger for a table of matcher
        results.

        For every tuple pair whose current label differs from the trigger's
        target value, the rules are evaluated against the original left and
        right tuples; when the rule outcome equals the configured condition
        status, the label is overwritten with the trigger's value.

        Args:
            input_table (DataFrame): The input table of type pandas DataFrame
                containing tuple pairs and labels from matching (defaults to None).
            label_column (string): The attribute name where the predictions
                are stored in the input table (defaults to None).
            inplace (boolean): A flag to indicate whether the append needs to be
                done inplace (defaults to True).
            verbose (boolean): A flag to indicate whether the debug information
                should be logged (defaults to False).

        Returns:
            A DataFrame with predictions updated.

        Examples:
            >>> import py_entitymatching as em
            >>> mt = em.MatchTrigger()
            >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='id')
            >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='id')
            >>> match_f = em.get_features_for_matching(A, B)
            >>> rule = ['title_title_lev_sim(ltuple, rtuple) > 0.7']
            >>> mt.add_cond_rule(rule, match_f)
            >>> mt.add_cond_status(True)
            >>> mt.add_action(1)
            >>> # The table H is a table with prediction labels generated from matching
            >>> mt.execute(input_table=H, label_column='predicted_labels', inplace=False)
    """
    # Validate input parameters
    # # We expect the table to be of type pandas DataFrame
    validate_object_type(input_table, pd.DataFrame, 'Input table')

    # # We expect the label_column to be of type string if not None
    if label_column is not None and not isinstance(label_column, str):
        logger.error('Input target_attr must be a string.')
        raise AssertionError('Input target_attr must be a string.')

    # # We expect the inplace to be of type boolean
    validate_object_type(inplace, bool, 'Input inplace')

    # # We expect the verbose to be of type boolean
    # # BUG FIX: the error prefix previously said 'Input append' (copied
    # # from a sibling method) although the parameter validated is verbose.
    validate_object_type(verbose, bool, 'Input verbose')

    # Validate that there are some rules
    assert len(self.rules.keys()) > 0, 'There are no rules to apply'

    # # get metadata (candset key, foreign keys, base tables and their keys)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(input_table, logger, verbose)

    # # validate metadata
    cm._validate_metadata_for_candset(input_table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    assert ltable is not None, 'Left table is not set'
    assert rtable is not None, 'Right table is not set'
    assert label_column in input_table.columns, \
        'Label column not in the input table'

    # Parse conjuncts to validate that the features are in the feature table
    for rule in self.rule_conjunct_list:
        for conjunct in self.rule_conjunct_list[rule]:
            parse_conjunct(conjunct, self.rule_ft[rule])

    # Work on a copy unless the caller asked for in-place modification.
    table = input_table if inplace else input_table.copy()

    # set the index and store it in l_tbl/r_tbl so tuples can be fetched
    # by their key values.
    l_tbl = ltable.set_index(l_key, drop=False)
    r_tbl = rtable.set_index(r_key, drop=False)

    # Resolve positional indices once, outside the loop.
    column_names = list(input_table.columns)
    lid_idx = column_names.index(fk_ltable)
    rid_idx = column_names.index(fk_rtable)
    label_idx = column_names.index(label_column)

    idx = 0
    for row in input_table.itertuples(index=False):
        # Only pairs whose label differs from the trigger value need to be
        # re-examined; matching pairs are already in the target state.
        if row[label_idx] != self.value_to_set:
            l_row = l_tbl.loc[row[lid_idx]]
            r_row = r_tbl.loc[row[rid_idx]]
            res = self.apply_rules(l_row, r_row)
            if res == self.cond_status:
                table.iat[idx, label_idx] = self.value_to_set
        idx += 1

    return table
def block_tables(self, ltable, rtable, l_output_attrs=None,
                 r_output_attrs=None,
                 l_output_prefix='ltable_', r_output_prefix='rtable_',
                 verbose=False, show_progress=True, n_ltable_chunks=1,
                 n_rtable_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK

    Blocks two tables based on the sequence of rules supplied by the user.

    Finds tuple pairs from left and right tables that survive the sequence
    of blocking rules. A tuple pair survives the sequence of blocking rules
    if none of the rules in the sequence returns True for that pair. If any
    of the rules returns True, then the pair is blocked.

    Args:
        ltable (DataFrame): The left input table.

        rtable (DataFrame): The right input table.

        l_output_attrs (list): A list of attribute names from the left
            table to be included in the output candidate set (defaults
            to None).

        r_output_attrs (list): A list of attribute names from the right
            table to be included in the output candidate set (defaults
            to None).

        l_output_prefix (string): The prefix to be used for the attribute names
            coming from the left table in the output candidate set
            (defaults to 'ltable\_').

        r_output_prefix (string): The prefix to be used for the attribute names
            coming from the right table in the output candidate set
            (defaults to 'rtable\_').

        verbose (boolean): A flag to indicate whether the debug
            information  should be logged (defaults to False).

        show_progress (boolean): A flag to indicate whether progress should
            be displayed to the user (defaults to True).

        n_ltable_chunks (int): The number of partitions to split the left table (
            defaults to 1). If it is set to -1, then the number of
            partitions is set to the number of cores in the machine.

        n_rtable_chunks (int): The number of partitions to split the right table (
            defaults to 1). If it is set to -1, then the number of
            partitions is set to the number of cores in the machine.

    Returns:
        A candidate set of tuple pairs that survived the sequence of
        blocking rules (DataFrame).

    Raises:
        AssertionError: If `ltable` is not of type pandas
            DataFrame.
        AssertionError: If `rtable` is not of type pandas
            DataFrame.
        AssertionError: If `l_output_attrs` is not of type of
            list.
        AssertionError: If `r_output_attrs` is not of type of
            list.
        AssertionError: If the values in `l_output_attrs` is not of type
            string.
        AssertionError: If the values in `r_output_attrs` is not of type
            string.
        AssertionError: If the input `l_output_prefix` is not of type
            string.
        AssertionError: If the input `r_output_prefix` is not of type
            string.
        AssertionError: If `verbose` is not of type
            boolean.
        AssertionError: If `show_progress` is not of type boolean.
        AssertionError: If `n_ltable_chunks` is not of type int.
        AssertionError: If `n_rtable_chunks` is not of type int.
        AssertionError: If `l_out_attrs` are not in the ltable.
        AssertionError: If `r_out_attrs` are not in the rtable.
        AssertionError: If there are no rules to apply.

    Examples:
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_rule_based_blocker import DaskRuleBasedBlocker
        >>> rb = DaskRuleBasedBlocker()
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='id')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='id')
        >>> block_f = em.get_features_for_blocking(A, B)
        >>> rule = ['name_name_lev(ltuple, rtuple) > 3']
        >>> rb.add_rule(rule, feature_table=block_f)
        >>> C = rb.block_tables(A, B)
    """
    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK."
    )

    # validate data types of input parameters
    self.validate_types_params_tables(ltable, rtable, l_output_attrs,
                                      r_output_attrs, l_output_prefix,
                                      r_output_prefix, verbose, 1)

    # validate data type of show_progress
    self.validate_show_progress(show_progress)

    # validate input parameters: output attrs must exist in their tables.
    self.validate_output_attrs(ltable, rtable, l_output_attrs,
                               r_output_attrs)

    # get and validate metadata
    log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

    # # get metadata (each table's key attribute) from the catalog
    l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger,
                                                 verbose)

    # # validate metadata
    cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger,
                                    verbose)
    cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger,
                                    verbose)

    # validate rules: there must be at least one rule registered.
    assert len(self.rules.keys()) > 0, 'There are no rules to apply'

    # validate number of ltable and rtable chunks
    validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks')
    validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks')
    validate_chunks(n_ltable_chunks)
    validate_chunks(n_rtable_chunks)

    # # determine the number of chunks (resolves -1 / oversized values)
    n_ltable_chunks = get_num_partitions(n_ltable_chunks, len(ltable))
    n_rtable_chunks = get_num_partitions(n_rtable_chunks, len(rtable))

    # # set index for convenience so tuples can be fetched by key.
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # remove l_key from l_output_attrs and r_key from r_output_attrs
    # # (keys are always carried separately and re-added on output).
    l_output_attrs_1 = []
    if l_output_attrs:
        l_output_attrs_1 = [x for x in l_output_attrs if x != l_key]
    r_output_attrs_1 = []
    if r_output_attrs:
        r_output_attrs_1 = [x for x in r_output_attrs if x != r_key]

    # # get attributes to project — keys, output attrs, and any attrs the
    # # rules reference.
    l_proj_attrs, r_proj_attrs = self.get_attrs_to_project(l_key, r_key,
                                                           l_output_attrs_1,
                                                           r_output_attrs_1)
    l_df, r_df = l_df[l_proj_attrs], r_df[r_proj_attrs]

    # Fast path: try to apply one filterable rule using join filters.
    # The number of splits passed here is the number of cores in the machine.
    candset, rule_applied = self.block_tables_with_filters(
        l_df, r_df, l_key, r_key, l_output_attrs_1, r_output_attrs_1,
        l_output_prefix, r_output_prefix, verbose, show_progress,
        get_num_cores())

    if candset is None:
        # no filterable rule was applied; fall back to the cartesian-style
        # blocking over table chunk pairs.
        candset = self.block_tables_without_filters(
            l_df, r_df, l_key, r_key, l_output_attrs_1, r_output_attrs_1,
            l_output_prefix, r_output_prefix, verbose, show_progress,
            n_ltable_chunks, n_rtable_chunks)
    elif len(self.rules) > 1:
        # one filterable rule was applied but other rules are left
        # block candset by applying other rules and excluding the applied rule
        candset = self.block_candset_excluding_rule(
            candset, l_df, r_df, l_key, r_key, l_output_prefix + l_key,
            r_output_prefix + r_key, rule_applied, show_progress,
            get_num_cores())

    # Keep only the key columns and requested output attributes.
    retain_cols = self.get_attrs_to_retain(l_key, r_key, l_output_attrs_1,
                                           r_output_attrs_1,
                                           l_output_prefix, r_output_prefix)
    if len(candset) > 0:
        candset = candset[retain_cols]
    else:
        candset = pd.DataFrame(columns=retain_cols)

    # update catalog: give the candset a fresh key column and register its
    # metadata (key, foreign keys, base tables).
    key = get_name_for_key(candset.columns)
    candset = add_key_column(candset, key)
    cm.set_candset_properties(candset, key, l_output_prefix + l_key,
                              r_output_prefix + r_key, ltable, rtable)

    # return candidate set
    return candset
def debug_randomforest_matcher(random_forest, tuple_1, tuple_2, feature_table,
                               table_columns, exclude_attrs=None):
    """
    This function is used to debug a random forest matcher using two input
    tuples.

    Specifically, this function takes in two tuples, gets the feature vector
    using the feature table and finally passes it to the random forest and
    displays the path that the feature vector takes in each of the decision
    trees that make up the random forest matcher.

    Args:
        random_forest (RFMatcher): The input random forest object that should
            be debugged.
        tuple_1,tuple_2 (Series): Input tuples that should be debugged.
        feature_table (DataFrame): Feature table containing the functions
            for the features.
        table_columns (list): List of all columns that will be outputted
            after generation of feature vectors.
        exclude_attrs (list): List of attributes that should be removed from
            the table columns.

    Raises:
        AssertionError: If the input feature table is not of type pandas
            DataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> # devel is the labeled data used for development purposes, match_f is the feature table
        >>> H = em.extract_feat_vecs(devel, feat_table=match_f, attrs_after='gold_labels')
        >>> rf = em.RFMatcher()
        >>> rf.fit(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels')
        >>> # F is the feature vector got from evaluation set of the labeled data.
        >>> out = rf.predict(table=F, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels')
        >>> # A and B are input tables
        >>> em.debug_randomforest_matcher(rf, A.ix[1], B.ix[2], match_f, H.columns, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels')
    """
    # Validate input parameters.
    # # We expect the feature table to be of type pandas DataFrame
    validate_object_type(feature_table, pd.DataFrame,
                         error_prefix='The input feature')

    # Get the underlying scikit-learn classifier: unwrap an RFMatcher,
    # otherwise assume the object is already a classifier.
    if isinstance(random_forest, RFMatcher):
        clf = random_forest.clf
    else:
        clf = random_forest

    # Get the feature names based on the table columns and the exclude
    # attributes given.
    if exclude_attrs is None:
        feature_names = table_columns
    else:
        # NOTE(review): boolean-mask indexing assumes table_columns is a
        # pandas Index (e.g. H.columns, as in the example above); a plain
        # Python list would not support table_columns[cols] — confirm callers.
        cols = [c not in exclude_attrs for c in table_columns]
        feature_names = table_columns[cols]

    # Get the probability pair (non-match, match) for the tuple pair.
    prob = _get_prob(clf, tuple_1, tuple_2, feature_table, feature_names)

    # Decide prediction based on the probability (i.e num. of trees that said
    # match over total number of trees).
    prediction = False
    if prob[1] > prob[0]:
        prediction = True

    # Print the result summary.
    print("Summary: Num trees = {0}; Mean Prob. for non-match = {1}; "
          "Mean Prob for match = {2}; "
          "Match status = {3}".format(str(len(clf.estimators_)),
                                      str(prob[0]),
                                      str(prob[1]), str(prediction)))
    print("")

    # Now, for each estimator (i.e the decision tree) call the decision tree
    # matcher's debugger. enumerate replaces the previous manual counter.
    for tree_number, estimator in enumerate(clf.estimators_, start=1):
        print("Tree " + str(tree_number))
        _debug_decisiontree_matcher(estimator, tuple_1, tuple_2,
                                    feature_table, table_columns,
                                    exclude_attrs, ensemble_flag=True)
        print("")
def block_candset(self, candset, l_block_attr, r_block_attr,
                  allow_missing=False, verbose=False, show_progress=True,
                  n_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.

    Blocks an input candidate set of tuple pairs based on attribute equivalence.

    Finds tuple pairs from an input candidate set of tuple pairs
    such that the value of attribute l_block_attr of the left tuple in a
    tuple pair exactly matches the value of attribute r_block_attr of the
    right tuple in the tuple pair.

    Args:
        candset (DataFrame): The input candidate set of tuple pairs.

        l_block_attr (string): The blocking attribute in left table.

        r_block_attr (string): The blocking attribute in right table.

        allow_missing (boolean): A flag to indicate whether tuple pairs
             with missing value in at least one of the blocking attributes
             should be included in the output candidate set (defaults to
             False). If this flag is set to True, a tuple pair with missing
             value in either blocking attribute will be retained in the
             output candidate set.

        verbose (boolean): A flag to indicate whether the debug
            information should be logged (defaults to False).

        show_progress (boolean): A flag to indicate whether progress should
            be displayed to the user (defaults to True).

        n_chunks (int): The number of partitions to split the candidate set.
            If it is set to -1, the number of partitions will be set to the
            number of cores in the machine.

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `candset` is not of type pandas
            DataFrame.
        AssertionError: If `l_block_attr` is not of type string.
        AssertionError: If `r_block_attr` is not of type string.
        AssertionError: If `verbose` is not of type
            boolean.
        AssertionError: If `n_chunks` is not of type
            int.
        AssertionError: If `l_block_attr` is not in the ltable columns.
        AssertionError: If `r_block_attr` is not in the rtable columns.

    Examples:
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_attr_equiv_blocker import DaskAttrEquivalenceBlocker
        >>> ab = DaskAttrEquivalenceBlocker()
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> C = ab.block_tables(A, B, 'zipcode', 'zipcode', l_output_attrs=['name'], r_output_attrs=['name'])
        >>> D1 = ab.block_candset(C, 'age', 'age', allow_missing=True)
        # Include all possible tuple pairs with missing values
        >>> D2 = ab.block_candset(C, 'age', 'age', allow_missing=True)
        # Execute blocking using multiple cores
        >>> D3 = ab.block_candset(C, 'age', 'age', n_chunks=-1)
    """
    logger.warning("WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN "
                   "RISK.")

    # validate data types of input parameters
    self.validate_types_params_candset(candset, verbose, show_progress,
                                       n_chunks)

    # validate data types of input blocking attributes
    self.validate_types_block_attrs(l_block_attr, r_block_attr)

    # get and validate metadata
    log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                     'fk rtable, ltable, rtable, ltable key, rtable key',
             verbose)

    # # get metadata: the candset's key, its foreign keys into the base
    # # tables, and the base tables with their keys.
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = cm.get_metadata_for_candset(
        candset, logger, verbose)

    # # validate metadata
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # validate input parameters: blocking attrs must exist in the tables.
    self.validate_block_attrs(ltable, rtable, l_block_attr, r_block_attr)

    # validate n_chunks parameter
    validate_object_type(n_chunks, int, 'Parameter n_chunks')
    validate_chunks(n_chunks)

    # do blocking

    # # do projection before merge — only key + blocking attr are needed.
    l_df = ltable[[l_key, l_block_attr]]
    r_df = rtable[[r_key, r_block_attr]]

    # # set index for convenience so tuples can be looked up by key.
    l_df = l_df.set_index(l_key, drop=False)
    r_df = r_df.set_index(r_key, drop=False)

    # # determine number of processes to launch parallely
    n_chunks = get_num_partitions(n_chunks, len(candset))

    # valid is a boolean mask over candset rows: True = pair survives.
    valid = []
    if n_chunks == 1:
        # single process
        valid = _block_candset_split(candset, l_df, r_df, l_key, r_key,
                                     l_block_attr, r_block_attr,
                                     fk_ltable, fk_rtable, allow_missing,
                                     show_progress)
    else:
        # Build a dask task graph: one delayed call per candset chunk.
        c_splits = np.array_split(candset, n_chunks)
        valid_splits = []
        for i in range(len(c_splits)):
            partial_result = delayed(_block_candset_split)(c_splits[i],
                                                           l_df, r_df,
                                                           l_key, r_key,
                                                           l_block_attr,
                                                           r_block_attr,
                                                           fk_ltable,
                                                           fk_rtable,
                                                           allow_missing,
                                                           False)
            # setting show progress to False as we will use Dask
            # diagnostics to display progress bar
            valid_splits.append(partial_result)

        valid_splits = delayed(wrap)(valid_splits)
        if show_progress:
            with ProgressBar():
                valid_splits = valid_splits.compute(scheduler="processes",
                                                    num_workers=get_num_cores())
        else:
            valid_splits = valid_splits.compute(scheduler="processes",
                                                num_workers=get_num_cores())

        # Flatten the per-chunk boolean lists into one mask.
        valid = sum(valid_splits, [])

    # construct output table from the surviving rows.
    if len(candset) > 0:
        out_table = candset[valid]
    else:
        out_table = pd.DataFrame(columns=candset.columns)

    # update the catalog so the output keeps the candset metadata.
    cm.set_candset_properties(out_table, key, fk_ltable, fk_rtable,
                              ltable, rtable)

    # return the output table
    return out_table
def get_attr_corres(ltable, rtable):
    """
    This function gets the attribute correspondences between the attributes
    of ltable and rtable.

    The user may need to get the correspondences so
    that he/she can generate features based those correspondences.

    Args:
        ltable,rtable (DataFrame): Input DataFrames for which the attribute
            correspondences must be obtained.

    Returns:
        A Python dictionary is returned containing the attribute
        correspondences.

        Specifically, this returns a dictionary with the following key-value
        pairs:

        corres: points to the list correspondences as tuples. Each
        correspondence is a tuple with two attributes: one from ltable
        and the other from rtable.

        ltable: points to ltable.

        rtable: points to rtable.

        Currently, 'corres' contains only pairs of attributes with exact
        names in ltable and rtable.

    Raises:
        AssertionError: If `ltable` is not of type
            pandas DataFrame.
        AssertionError: If `rtable` is not of type
            pandas DataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_c = em.get_attr_corres(A, B)
    """
    # Validate input parameters
    # # We expect the input object (ltable) to be of type pandas DataFrame
    validate_object_type(ltable, pd.DataFrame, error_prefix='Input ltable')
    # # We expect the input object (rtable) to be of type pandas DataFrame
    validate_object_type(rtable, pd.DataFrame, error_prefix='Input rtable')

    # Hoist rtable's columns into a set for O(1) membership tests, then
    # pair up every ltable column that also occurs in rtable (exact name
    # match), preserving ltable's column order.
    rtable_columns = set(rtable.columns)
    correspondence_list = [(column, column) for column in ltable.columns
                           if column in rtable_columns]

    # Package the correspondences together with the two input tables.
    correspondence_dict = {
        'corres': correspondence_list,
        'ltable': ltable,
        'rtable': rtable,
    }

    # Finally, return the correspondence dictionary
    return correspondence_dict
def _vis_debug_rf(matcher, train, test, exclude_attrs, target_attr,
                  show_window=True):
    """
    Wrapper function for debugging the Random Forest matcher visually.

    Fits the matcher on `train`, predicts on `test`, computes an evaluation
    summary, and opens a Qt window showing the metric plus the false
    positives/negatives so individual trees can be inspected.
    """
    # Import GUI dependencies lazily so the library works without PyQt5.
    try:
        from PyQt5 import QtWidgets
        from py_entitymatching.gui.debug_gui_base import MainWindowManager
    except ImportError:
        raise ImportError('PyQt5 is not installed. Please install PyQt5 to use '
                          'GUI related functions in py_entitymatching.')

    # Validate the input parameters
    # # We expect the matcher to be of type RfMatcher
    if not isinstance(matcher, RFMatcher):
        logger.error('Input matcher is not of type '
                     'Random Forest matcher')
        raise AssertionError('Input matcher is not of type '
                             'Random Forest matcher')

    # # We expect the target attribute to be of type string.
    validate_object_type(target_attr, six.string_types,
                         error_prefix='Target attribute')

    # # Check whether the exclude attributes are indeed present in the train
    # DataFrame.
    if not check_attrs_present(train, exclude_attrs):
        logger.error('The exclude attrs are not in train table columns')
        raise AssertionError('The exclude attrs are not in the '
                             'train table columns')

    # # Check whether the target attribute is indeed present in the train
    # DataFrame.
    if not check_attrs_present(train, target_attr):
        logger.error('The target attr is not in train table columns')
        raise AssertionError('The target attr is not in the '
                             'train table columns')

    # # Check whether the exclude attributes are indeed present in the test
    # DataFrame.
    if not check_attrs_present(test, exclude_attrs):
        logger.error('The exclude attrs are not in test table columns')
        raise AssertionError('The exclude attrs are not in the '
                             'test table columns')

    # The exclude attributes is expected to be of type list, if not
    # explicitly convert this into a list.
    if not isinstance(exclude_attrs, list):
        exclude_attrs = [exclude_attrs]

    # Drop the duplicates from the exclude attributes
    exclude_attrs = list_drop_duplicates(exclude_attrs)

    # If the target attribute is not present in the exclude attributes,
    # then explicitly add it to the exclude attributes.
    if target_attr not in exclude_attrs:
        exclude_attrs.append(target_attr)

    # Now, fit using training data
    matcher.fit(table=train, exclude_attrs=exclude_attrs,
                target_attr=target_attr)

    # Get a fresh column name (not clashing with test's columns) to store
    # the predictions.
    predict_attr_name = get_name_for_predict_column(test.columns)

    # Predict using the test data
    predicted = matcher.predict(table=test, exclude_attrs=exclude_attrs,
                                target_attr=predict_attr_name, append=True,
                                inplace=False)

    # Get the evaluation summary.
    eval_summary = em.eval_matches(predicted, target_attr,
                                   predict_attr_name)

    # Reuse the process-wide QApplication if one exists; Qt allows only a
    # single QApplication instance per process.
    em._viewapp = QtWidgets.QApplication.instance()
    if em._viewapp is None:
        em._viewapp = QtWidgets.QApplication([])

    # Get metric in a form that can be displayed from the evaluation summary
    metric = _get_metric(eval_summary)

    # Get false negatives and false positives as a DataFrame
    fp_dataframe = _get_dataframe(predicted, eval_summary['false_pos_ls'])
    fn_dataframe = _get_dataframe(predicted, eval_summary['false_neg_ls'])

    # Get the main window application
    app = em._viewapp
    m = MainWindowManager(matcher, "rf", exclude_attrs, metric, predicted,
                          fp_dataframe, fn_dataframe)

    # If the show window is true, then display the window.
    if show_window:
        m.show()
        app.exec_()
def predict(self, table=None, target_attr=None, append=False, inplace=True):
    """Predict interface for the matcher.

    A point to note is all the input parameters have a default value of
    None.

    Args:
        table (DataFrame): The input candidate set of type pandas DataFrame
            containing tuple pairs (defaults to None).
        target_attr (string): The attribute name where the predictions
            need to be stored in the input table (defaults to None).
        append (boolean): A flag to indicate whether the predictions need
            to be appended in the input DataFrame (defaults to False).
        inplace (boolean): A flag to indicate whether the append needs to be
            done inplace (defaults to True).

    Returns:
        An array of predictions or a DataFrame with predictions updated.

    Examples:
        >>> import py_entitymatching as em
        >>> brm = em.BooleanRuleMatcher()
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='id')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='id')
        >>> match_f = em.get_features_for_matching(A, B)
        >>> rule = ['address_address_lev(ltuple, rtuple) > 6']
        >>> brm.add_rule(rule, match_f)
        >>> # The table S is a cand set generated by the blocking and then labeling phases
        >>> brm.predict(S, target_attr='pred_label', append=True)
    """
    # Validate input parameters
    # # We expect the table to be of type pandas DataFrame
    validate_object_type(table, pd.DataFrame, 'Input table')

    # # We expect the target_attr to be of type string if not None
    if target_attr is not None and not isinstance(target_attr, str):
        logger.error('Input target_attr must be a string.')
        raise AssertionError('Input target_attr must be a string.')

    # # We expect the append to be of type boolean
    validate_object_type(append, bool, 'Input append')

    # # We expect the inplace to be of type boolean
    validate_object_type(inplace, bool, 'Input inplace')

    # # get metadata (candset key, foreign keys, base tables and their keys)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, False)

    # # validate metadata
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, False)

    # Validate that there are some rules
    assert len(self.rules.keys()) > 0, 'There are no rules to apply'

    # Parse conjuncts to validate that the features are in the feature table
    for rule in self.rule_conjunct_list:
        for conjunct in self.rule_conjunct_list[rule]:
            parse_conjunct(conjunct, self.rule_ft[rule])

    # NOTE(review): validate_object_type above already rejects a None table,
    # so the else-branch below is effectively unreachable; kept as a
    # defensive guard mirroring the other matcher interfaces.
    if table is not None:
        y = self._predict_candset(table)
        if target_attr is not None and append is True:
            if inplace:
                # Write predictions directly into the caller's table.
                table[target_attr] = y
                return table
            else:
                # Leave the caller's table untouched; return a copy.
                tbl = table.copy()
                tbl[target_attr] = y
                return tbl
        else:
            return y
    else:
        raise SyntaxError('The arguments supplied does not match the signatures supported !!!')
def extract_feature_vecs(candset, attrs_before=None, feature_table=None,
                         attrs_after=None, verbose=False,
                         show_progress=True, n_jobs=1):
    """
    This function extracts feature vectors from a DataFrame (typically a
    labeled candidate set).

    Specifically, this function uses feature table, ltable and rtable (that
    is present in the `candset`'s metadata) to extract feature vectors.

    Args:
        candset (DataFrame): The input candidate set for which the features
            vectors should be extracted.
        attrs_before (list): The list of attributes from the input candset,
            that should be added before the feature vectors (defaults to
            None).
        feature_table (DataFrame): A DataFrame containing a list of
            features that should be used to compute the feature vectors (
            defaults to None).
        attrs_after (list): The list of attributes from the input candset
            that should be added after the feature vectors (defaults to
            None).
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
        show_progress (boolean): A flag to indicate whether the progress of
            extracting feature vectors must be displayed (defaults to True).
        n_jobs (int): The number of parallel jobs to be used for feature
            extraction (defaults to 1).

    Returns:
        A pandas DataFrame containing feature vectors.

        The DataFrame will have metadata ltable and rtable, pointing
        to the same ltable and rtable as the input candset.

        Also, the output DataFrame will have three columns: key, foreign key
        ltable, foreign key rtable copied from input candset to the output
        DataFrame. These three columns precede the columns mentioned in
        `attrs_before`.

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `attrs_before` has attributes that are not
            present in the input candset.
        AssertionError: If `attrs_after` has attribtues that are not present
            in the input candset.
        AssertionError: If `feature_table` is set to None.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_f = em.get_features_for_matching(A, B)
        >>> # G is the labeled dataframe which should be converted into feature vectors
        >>> H = em.extract_feature_vecs(G, features=match_f, attrs_before=['title'], attrs_after=['gold_labels'])
    """
    # Imported locally: the `pd.np` alias used previously was removed in
    # pandas >= 2.0; numpy provides the same array_split.
    import numpy as np

    # Validate input parameters
    # # We expect the input candset to be of type pandas DataFrame.
    validate_object_type(candset, pd.DataFrame, error_prefix='Input cand.set')

    # # If attrs_before is given, check that those attributes are present
    # in the input candset.
    if attrs_before is not None:
        if not ch.check_attrs_present(candset, attrs_before):
            logger.error(
                'The attributes mentioned in attrs_before is not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_before is not present '
                'in the input table')

    # # If attrs_after is given, check that those attributes are present
    # in the input candset.
    if attrs_after is not None:
        if not ch.check_attrs_present(candset, attrs_after):
            raise AssertionError(
                'The attributes mentioned in attrs_after is not present '
                'in the input table')

    # We expect the feature table to be a valid object
    if feature_table is None:
        logger.error('Feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    # Do metadata checking
    # # Mention what metadata is required to the user
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, '
                        'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    ch.log_info(logger, 'Getting metadata from catalog', verbose)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # # Validate metadata
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Extract features
    # # Set index on the key columns for O(1) tuple lookup during
    # feature computation.
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # Apply feature functions
    ch.log_info(logger, 'Applying feature functions', verbose)
    col_names = list(candset.columns)
    fk_ltable_idx = col_names.index(fk_ltable)
    fk_rtable_idx = col_names.index(fk_rtable)

    n_procs = get_num_procs(n_jobs, len(candset))

    # Replaces the removed `pd.np.array_split`.
    c_splits = np.array_split(candset, n_procs)

    pickled_obj = cloudpickle.dumps(feature_table)

    feat_vals_by_splits = Parallel(n_jobs=n_procs)(
        delayed(get_feature_vals_by_cand_split)(
            pickled_obj, fk_ltable_idx, fk_rtable_idx, l_df, r_df,
            c_splits[i],
            # Show the progress bar only for the last chunk.
            show_progress and i == len(c_splits) - 1)
        for i in range(len(c_splits)))

    feat_vals = sum(feat_vals_by_splits, [])

    # Construct output table
    feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values)
    # # Rearrange the feature names in the input feature table order
    feature_names = list(feature_table['feature_name'])
    feature_vectors = feature_vectors[feature_names]

    ch.log_info(logger, 'Constructing output table', verbose)

    # # Insert attrs_before (reversed, so inserting at position 0 keeps
    # the caller's original order).
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        attrs_before.reverse()
        for a in attrs_before:
            feature_vectors.insert(0, a, candset[a])

    # # Insert the key and foreign-key columns at the front.
    feature_vectors.insert(0, fk_rtable, candset[fk_rtable])
    feature_vectors.insert(0, fk_ltable, candset[fk_ltable])
    feature_vectors.insert(0, key, candset[key])

    # # Insert attrs_after at the end.
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        attrs_after.reverse()
        col_pos = len(feature_vectors.columns)
        for a in attrs_after:
            feature_vectors.insert(col_pos, a, candset[a])
            col_pos += 1

    # # Update the catalog so the output carries the candset's metadata.
    cm.init_properties(feature_vectors)
    cm.copy_properties(candset, feature_vectors)

    # Finally, return the feature vectors
    return feature_vectors
def dask_down_sample(ltable, rtable, size, y_param, show_progress=True,
                     verbose=False, seed=None, rem_stop_words=True,
                     rem_puncs=True, n_ltable_chunks=1,
                     n_sample_rtable_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.

    This command down samples two tables A and B into smaller tables A' and
    B' respectively. Specifically, first it randomly selects `size` tuples
    from the table B to be table B'. Next, it builds an inverted index I
    (token, tuple_id) on table A. For each tuple x ∈ B', the algorithm
    finds a set P of k/2 tuples from I that match x, and a set Q of k/2
    tuples randomly selected from A - P. The idea is for A' and B' to share
    some matches yet be as representative of A and B as possible.

    Args:
        ltable (DataFrame): The left input table, i.e., table A.
        rtable (DataFrame): The right input table, i.e., table B.
        size (int): The size that table B should be down sampled to.
        y_param (int): The parameter to control the down sample size of
            table A. Specifically, the down sampled size of table A should
            be close to size * y_param.
        show_progress (boolean): A flag to indicate whether a progress bar
            should be displayed (defaults to True).
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
        seed (int): The seed for the pseudo random number generator to
            select the tuples from A and B (defaults to None).
        rem_stop_words (boolean): A flag to indicate whether a default set
            of stop words must be removed.
        rem_puncs (boolean): A flag to indicate whether the punctuations
            must be removed from the strings.
        n_ltable_chunks (int): The number of partitions for ltable
            (defaults to 1). If it is set to -1, the number of partitions
            will be set to the number of cores in the machine.
        n_sample_rtable_chunks (int): The number of partitions for the
            sampled rtable (defaults to 1).

    Returns:
        Down sampled tables A and B as pandas DataFrames.

    Raises:
        AssertionError: If any of the input tables are empty or not a
            DataFrame.
        AssertionError: If `size` or `y_param` is empty or 0 or not a
            valid integer value.
        AssertionError: If `seed` is not a valid integer value.
        AssertionError: If `verbose` is not of type bool.
        AssertionError: If `show_progress` is not of type bool.
        AssertionError: If `n_ltable_chunks` is not of type int.
        AssertionError: If `n_sample_rtable_chunks` is not of type int.

    Examples:
        >>> from py_entitymatching.dask.dask_down_sample import dask_down_sample
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> sample_A, sample_B = dask_down_sample(A, B, 500, 1, n_ltable_chunks=-1, n_sample_rtable_chunks=-1)
        >>> # Example with seed = 0. The same sample data set will be
        >>> # returned each time this function is run.
        >>> sample_A, sample_B = dask_down_sample(A, B, 500, 1, seed=0, n_ltable_chunks=-1, n_sample_rtable_chunks=-1)
    """
    # Imported locally: the `pd.np` alias used previously was removed in
    # pandas >= 2.0; numpy provides the same array_split.
    import numpy as np

    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN "
        "RISK.")

    # validation checks
    if not isinstance(ltable, pd.DataFrame):
        logger.error('Input table A (ltable) is not of type pandas DataFrame')
        raise AssertionError(
            'Input table A (ltable) is not of type pandas DataFrame')

    if not isinstance(rtable, pd.DataFrame):
        logger.error('Input table B (rtable) is not of type pandas DataFrame')
        raise AssertionError(
            'Input table B (rtable) is not of type pandas DataFrame')

    if len(ltable) == 0 or len(rtable) == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')

    if size == 0 or y_param == 0:
        logger.error(
            'size or y cannot be zero (3rd and 4th parameter of downsample)')
        raise AssertionError(
            'size or y_param cannot be zero (3rd and 4th parameter of downsample)')

    if seed is not None and not isinstance(seed, int):
        logger.error('Seed is not of type integer')
        raise AssertionError('Seed is not of type integer')

    if len(rtable) < size:
        logger.warning(
            'Size of table B is less than b_size parameter - using entire table B')

    validate_object_type(verbose, bool, 'Parameter verbose')
    validate_object_type(show_progress, bool, 'Parameter show_progress')
    validate_object_type(rem_stop_words, bool, 'Parameter rem_stop_words')
    validate_object_type(rem_puncs, bool, 'Parameter rem_puncs')
    validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks')
    validate_object_type(n_sample_rtable_chunks, int,
                         'Parameter n_sample_rtable_chunks')

    rtable_sampled = sample_right_table(rtable, size, seed)

    # Project ltable onto its string columns for tokenization.
    ltbl_str_cols = _get_str_cols_list(ltable)
    proj_ltable = ltable[ltable.columns[ltbl_str_cols]]

    if n_ltable_chunks == -1:
        n_ltable_chunks = get_num_cores()

    # Replaces the removed `pd.np.array_split`.
    ltable_chunks = np.array_split(proj_ltable, n_ltable_chunks)
    preprocessed_tokenized_tbl = []

    # Use Dask to preprocess and tokenize strings.
    start_row_id = 0
    for i in range(len(ltable_chunks)):
        # start_row_id is internally used by process_tokenize_concat_strings
        # to map each string to its row id in the ltable.
        result = delayed(process_tokenize_concat_strings)(ltable_chunks[i],
                                                          start_row_id,
                                                          rem_puncs,
                                                          rem_stop_words)
        preprocessed_tokenized_tbl.append(result)
        # update start_row_id
        start_row_id += len(ltable_chunks[i])
    preprocessed_tokenized_tbl = delayed(wrap)(preprocessed_tokenized_tbl)

    # Now execute the DAG
    if show_progress:
        with ProgressBar():
            logger.info('Preprocessing/tokenizing ltable')
            preprocessed_tokenized_tbl_vals = preprocessed_tokenized_tbl.compute(
                scheduler="processes", num_workers=get_num_cores())
    else:
        preprocessed_tokenized_tbl_vals = preprocessed_tokenized_tbl.compute(
            scheduler="processes", num_workers=get_num_cores())

    ltable_processed_dict = {}
    for i in range(len(preprocessed_tokenized_tbl_vals)):
        ltable_processed_dict.update(preprocessed_tokenized_tbl_vals[i])

    # Build an inverted index over the processed ltable tokens.
    inverted_index = build_inverted_index(ltable_processed_dict)

    # Preprocess/tokenize sampled rtable and probe
    rtbl_str_cols = _get_str_cols_list(rtable_sampled)
    proj_rtable_sampled = rtable_sampled[rtable_sampled.columns[rtbl_str_cols]]

    if n_sample_rtable_chunks == -1:
        n_sample_rtable_chunks = get_num_cores()

    rtable_chunks = np.array_split(proj_rtable_sampled, n_sample_rtable_chunks)
    probe_result = []

    # Create the DAG
    for i in range(len(rtable_chunks)):
        result = delayed(probe)(rtable_chunks[i], y_param, len(proj_ltable),
                                inverted_index, rem_puncs, rem_stop_words,
                                seed)
        probe_result.append(result)
    probe_result = delayed(wrap)(probe_result)

    # Execute the DAG
    if show_progress:
        with ProgressBar():
            logger.info('Probing using rtable')
            probe_result = probe_result.compute(
                scheduler="processes", num_workers=multiprocessing.cpu_count())
    else:
        probe_result = probe_result.compute(
            scheduler="processes", num_workers=multiprocessing.cpu_count())

    # Union the per-chunk ltable row ids and select those rows.
    probe_result = map(list, probe_result)
    l_tbl_indices = set(sum(probe_result, []))
    l_tbl_indices = list(l_tbl_indices)
    ltable_sampled = ltable.iloc[l_tbl_indices]

    # update catalog
    if cm.is_dfinfo_present(ltable):
        cm.copy_properties(ltable, ltable_sampled)

    if cm.is_dfinfo_present(rtable):
        cm.copy_properties(rtable, rtable_sampled)

    return ltable_sampled, rtable_sampled
def get_feature_fn(feature_string, tokenizers, similarity_functions):
    """
    This function creates a feature in a declarative manner.

    Specifically, this function uses the feature string, parses it and
    compiles it into a function using the given tokenizers and similarity
    functions. This compiled function will take in two  tuples and return
    a feature value (typically a number).

    Args:
        feature_string (string): A feature expression
            to be converted into a function.
        tokenizers (dictionary): A Python dictionary containing tokenizers.
            Specifically, the dictionary contains tokenizer names as keys
            and tokenizer functions as values. The tokenizer function
            typically takes in a string and returns a list of tokens.
        similarity_functions (dictionary): A Python dictionary containing
            similarity functions. Specifically, the dictionary contains
            similarity function names as keys and similarity functions as
            values. The similarity function typically takes in a string or
            two lists of tokens and returns a number.

    Returns:
        This function returns a Python dictionary which contains sufficient
        information (such as attributes, tokenizers, function code) to be
        added to the feature table.

        Specifically the Python dictionary contains the following keys:
        'left_attribute', 'right_attribute', 'left_attr_tokenizer',
        'right_attr_tokenizer', 'simfunction', 'function', and
        'function_source'.

        For all the keys except the 'function' and 'function_source' the
        value will be either a valid string (if the input feature string
        is parsed correctly) or PARSE_EXP (if the parsing was not
        successful). The 'function' will have a valid Python function as
        value, and 'function_source' will have the Python function's
        source in string format.

        The created function is a self-contained function
        which means that the tokenizers and sim functions that it calls are
        bundled along with the returned function code.

    Raises:
        AssertionError: If `feature_string` is not of type string.
        AssertionError: If the input `tokenizers` is not of type
            dictionary.
        AssertionError: If the input `similarity_functions` is not of
            type dictionary.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> block_t = em.get_tokenizers_for_blocking()
        >>> block_s = em.get_sim_funs_for_blocking()
        >>> block_f = em.get_features_for_blocking(A, B)
        >>> r = get_feature_fn('jaccard(qgm_3(ltuple.name), qgm_3(rtuple.name)', block_t, block_s)
        >>> em.add_feature(block_f, 'name_name_jac_qgm3_qgm3', r)

    See Also:
        :meth:`py_entitymatching.get_sim_funs_for_blocking`,
        :meth:`py_entitymatching.get_tokenizers_for_blocking`,
        :meth:`py_entitymatching.get_sim_funs_for_matching`,
        :meth:`py_entitymatching.get_tokenizers_for_matching`
    """
    # Validate input parameters: the feature expression must be a string,
    # and both lookup tables must be dictionaries.
    validate_object_type(feature_string, six.string_types,
                         error_prefix='Input feature')
    validate_object_type(tokenizers, dict,
                         error_prefix='Input object (tokenizers)')
    validate_object_type(similarity_functions, dict,
                         error_prefix='Input object (similarity_functions)')

    # Build the namespace in which the generated function is compiled.
    # Similarity functions go in first, then tokenizers, so the compiled
    # function is self-contained (it carries every callable it references).
    compile_namespace = {}
    if len(similarity_functions) > 0:
        compile_namespace.update(similarity_functions)
    if len(tokenizers) > 0:
        compile_namespace.update(tokenizers)

    # Create a python function string based on the input feature string.
    # NOTE: this exact text is also returned as 'function_source'.
    function_string = 'def fn(ltuple, rtuple):\n'
    function_string += ' '
    function_string += 'return ' + feature_string

    # Parse the feature string to recover the tokenizer, sim. function,
    # and the attributes it operates on.
    parsed_dict = _parse_feat_str(feature_string, tokenizers,
                                  similarity_functions)

    # Compile the function string inside the constructed namespace.
    # NOTE(review): exec on a caller-supplied expression — callers must not
    # pass untrusted feature strings here.
    six.exec_(function_string, compile_namespace)

    # Attach the compiled function and its source to the parsed metadata.
    parsed_dict['function'] = compile_namespace['fn']
    parsed_dict['function_source'] = function_string

    # Finally, return the parsed dictionary
    return parsed_dict
def save_table(data_frame, file_path, metadata_ext='.pklmetadata'):
    """
    Saves a DataFrame to disk along with its metadata in a pickle format.

    This function saves a  DataFrame to disk along with its metadata from
    the catalog.

    Specifically, this function saves the DataFrame in the given
    file path, and saves the metadata in the same directory (as the
    file path) but with a different extension. This extension can be
    optionally given by the user (defaults to '.pklmetadata').

    Args:
        data_frame (DataFrame): The DataFrame that should be saved.
        file_path (string): The file path where the DataFrame must be
            stored.
        metadata_ext (string): The metadata extension that should be used
            while storing the metadata information. The default value is
            '.pklmetadata'.

    Returns:
        A Boolean value of True is returned if the DataFrame is
        successfully saved.

    Raises:
        AssertionError: If `data_frame` is not of type pandas DataFrame.
        AssertionError: If `file_path` is not of type string.
        AssertionError: If `metadata_ext` is not of type string.
        AssertionError: If a file cannot written in the given `file_path`.

    Examples:
        >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]})
        >>> em.save_table(A, './A.pkl') # will store two files ./A.pkl and ./A.pklmetadata
        >>> em.save_table(A, './A.pkl', metadata_ext='.pklmeta') # will store two files ./A.pkl and ./A.pklmeta

    See Also:
        :meth:`~py_entitymatching.load_table`

    Note:
        This function is a bit different from to_csv_metadata, where the
        DataFrame is stored in a CSV file format. A DataFrame stored using
        'save_table' is pickled and cannot be viewed with a text editor,
        but for larger DataFrames it is more efficient.
    """
    # Validate the input parameters
    validate_object_type(data_frame, pd.DataFrame)
    validate_object_type(file_path, six.string_types,
                         error_prefix='Input file path')
    validate_object_type(metadata_ext, six.string_types,
                         error_prefix='Input Metadata ext')

    # Get the file_name (without extension) and the extension from the
    # given file path. For example if the file_path was /Users/foo/file.csv
    # then the file_name will be /Users/foo/file and the extension '.csv'.
    file_name, _ = os.path.splitext(file_path)

    # The metadata file name is the same file name but with the extension
    # given by the user.
    metadata_filename = file_name + metadata_ext

    # Check if the file exists in the file_path and whether we have
    # sufficient access privileges to write in that path.
    can_write, file_exists = ps._check_file_path(file_path)

    if can_write:
        # If the file already exists then issue a warning and overwrite it.
        if file_exists:
            logger.warning(
                'File already exists at %s; Overwriting it', file_path)
        # Open in binary mode, as we are writing in binary format.
        # (Previously the exists/doesn't-exist branches duplicated this
        # identical write; merged into one.)
        with open(file_path, 'wb') as file_handler:
            cloudpickle.dump(data_frame, file_handler)
    else:
        # We cannot write in the given path; raise an error.
        logger.error('Cannot write in the file path %s; Exiting', file_path)
        # BUG FIX: format the path into the message instead of passing it
        # as a second AssertionError argument (which yields an unformatted
        # tuple message).
        raise AssertionError('Cannot write in the file path %s' % file_path)

    # Once the DataFrame is written, write the metadata.
    # Initialize a metadata dictionary to hold the metadata of DataFrame
    # from the catalog.
    metadata_dict = collections.OrderedDict()

    # Get all the properties for the input data frame, if the DataFrame
    # information is present in the catalog.
    properties = {}
    if cm.is_dfinfo_present(data_frame):
        properties = cm.get_all_properties(data_frame)

    # If the properties are present in the catalog, then write properties
    # to disk. Only string-valued properties are persisted (non-string
    # metadata such as DataFrame references cannot be restored from disk).
    if len(properties) > 0:
        for property_name, property_value in six.iteritems(properties):
            if isinstance(property_value, six.string_types):
                metadata_dict[property_name] = property_value

    # Try to save metadata
    can_write, file_exists = ps._check_file_path(metadata_filename)

    if can_write:
        # If the file already exists, issue a warning and overwrite it.
        if file_exists:
            logger.warning(
                'Metadata file already exists at %s. Overwriting it',
                metadata_filename)
        # write metadata contents
        with open(metadata_filename, 'wb') as file_handler:
            cloudpickle.dump(metadata_dict, file_handler)
    else:
        # Metadata is best-effort: warn and skip instead of failing.
        logger.warning(
            'Cannot write metadata at the file path %s. Skip writing metadata '
            'file', metadata_filename)

    return True
def validate_types_params_candset(self, candset, verbose, show_progress,
                                  n_jobs):
    """Type-check the common candidate-set parameters.

    Raises an AssertionError (via validate_object_type) when any argument
    has an unexpected type.
    """
    # (value, expected type, error prefix) — checked in declaration order.
    checks = (
        (candset, pd.DataFrame, 'Input candset'),
        (verbose, bool, 'Parameter verbose'),
        (show_progress, bool, 'Parameter show_progress'),
        (n_jobs, int, 'Parameter n_jobs'),
    )
    for value, expected_type, prefix in checks:
        validate_object_type(value, expected_type, prefix)
def save_object(object_to_save, file_path):
    """
    Saves a Python object to disk.

    This function is intended to be used to save py_entitymatching objects
    such as rule-based blocker, feature vectors, etc. A user would like to
    store py_entitymatching objects to disk, when he/she wants to save the
    workflow and resume it later. This function provides a way to save the
    required objects to disk.

    This function takes in the object to save the file path. It pickles
    the object and stores it in the file path specified.

    Args:
        object_to_save (Python object): The Python object to save. This can
            be a rule-based blocker, feature vectors, etc.
        file_path (string): The file path where the object must be saved.

    Returns:
        A Boolean value of True is returned, if the saving was successful.

    Raises:
        AssertionError: If `file_path` is not of type string.
        AssertionError: If a file cannot be written in the given
            `file_path`.

    Examples:
        >>> import pandas as pd
        >>> A = pd.DataFrame({'id' : [1, 2], 'colA':['a', 'b'], 'colB' : [10, 20]})
        >>> B = pd.DataFrame({'id' : [1, 2], 'colA':['c', 'd'], 'colB' : [30, 40]})
        >>> rb = em.RuleBasebBlocker()
        >>> block_f = em.get_features_for_blocking(A, B)
        >>> rule1 = ['colA_colA_lev_dist(ltuple, rtuple) > 3']
        >>> rb.add_rule(rule1)
        >>> em.save_object(rb, './rule_blocker.pkl')

    See Also:
        :meth:`~load_object`
    """
    # Validate input parameters
    validate_object_type(file_path, six.string_types, 'Input file path')

    # Check whether the file path is valid and if a file is already present
    # at that path.
    # noinspection PyProtectedMember
    can_write, file_exists = ps._check_file_path(file_path)

    if can_write:
        # If a file already exists in that location, issue a warning and
        # overwrite the file.
        if file_exists:
            logger.warning(
                'File already exists at %s; Overwriting it', file_path)
        # Open the file in 'wb' mode as we are writing a binary file.
        # (Previously the exists/doesn't-exist branches duplicated this
        # identical write; merged into one.)
        with open(file_path, 'wb') as file_handler:
            cloudpickle.dump(object_to_save, file_handler)
    else:
        # If we cannot write, then raise an error.
        logger.error('Cannot write in the file path %s; Exiting', file_path)
        # BUG FIX: format the path into the message instead of passing it
        # as a second AssertionError argument (which yields an unformatted
        # tuple message).
        raise AssertionError('Cannot write in the file path %s' % file_path)

    # Return True if everything was successful.
    return True
def validate_allow_missing(self, allow_missing):
    """Ensure the `allow_missing` flag is a boolean.

    Raises an AssertionError (via validate_object_type) otherwise.
    """
    validate_object_type(allow_missing, bool, 'Parameter allow_missing')
def load_table(file_path, metadata_ext='.pklmetadata'):
    """
    Loads a pickled DataFrame from a file along with its metadata.

    This function loads a DataFrame from a file stored in pickle format.

    Further, this function looks for a metadata file with the same file
    name but with an extension given by the user (defaults to
    '.pklmetadata'). If the metadata file is present, the function will
    update the metadata for that DataFrame in the catalog.

    Args:
        file_path (string): The file path to load the file from.
        metadata_ext (string): The metadata file extension (defaults to
            '.pklmetadata') that should be used to generate metadata file
            name.

    Returns:
        If the loading is successful, the function will return a pandas
        DataFrame read from the file. The catalog will be updated with the
        metadata read from the metadata file (if the file was present).

    Raises:
        AssertionError: If `file_path` is not of type string.
        AssertionError: If `metadata_ext` is not of type string.

    Examples:
        >>> A = em.load_table('./A.pkl')
        >>> A = em.load_table('./A.pkl', metadata_ext='.pklmeta')

    See Also:
        :meth:`~py_entitymatching.save_table`

    Note:
        This function is different from read_csv_metadata in two aspects.
        First, this function currently does not support reading in
        candidate set tables, where there are more metadata such as ltable,
        rtable than just 'key', and conceptually the user is expected to
        provide ltable and rtable information while calling this function.
        (this support will be added shortly). Second, this function loads
        the table stored in a pickle format.
    """
    # Validate input parameters
    validate_object_type(file_path, six.string_types,
                         error_prefix='Input file path')
    # Consistency fix: use the same error prefix as the paired save_table
    # so validation failures read the same in both directions.
    validate_object_type(metadata_ext, six.string_types,
                         error_prefix='Input Metadata ext')

    # Load the object from the file path. Note that we use a generic load
    # object to load in the DataFrame too.
    data_frame = load_object(file_path)

    # Load metadata from file path
    # # Check if the metadata file is present
    if ps._is_metadata_file_present(file_path, extension=metadata_ext):
        # Construct the metadata file name, and read it from the disk.
        # # Get the file name used to load the DataFrame
        file_name, _ = os.path.splitext(file_path)
        # # Construct the metadata file name
        metadata_filename = file_name + metadata_ext
        # # Load the metadata from the disk
        metadata_dict = load_object(metadata_filename)

        # Update the catalog with the properties read from the disk
        for property_name, property_value in six.iteritems(metadata_dict):
            if property_name == 'key':
                # For 'key', call set_key as the function will check the
                # integrity of the key before setting it in the catalog.
                cm.set_key(data_frame, property_value)
            else:
                cm.set_property(data_frame, property_name, property_value)
    else:
        # If the metadata file is not present then issue a warning
        logger.warning('There is no metadata file')

    # Return the DataFrame
    return data_frame
def validate_show_progress(self, show_progress):
    """Ensure the `show_progress` flag is a boolean.

    Raises an AssertionError (via validate_object_type) otherwise.
    """
    validate_object_type(show_progress, bool, 'Parameter show_progress')
def add_blackbox_feature(feature_table, feature_name, feature_function):
    """
    Adds a black box feature to the feature table.

    Args:
        feature_table (DataFrame): The input DataFrame (typically a feature
            table) to which the feature must be added.
        feature_name (string): The name that should be given to the feature.
        feature_function (Python function): A Python function for the black
            box feature.

    Returns:
        A Boolean value of True is returned if the addition was successful.

    Raises:
        AssertionError: If the input `feature_table` is not of type
            DataFrame.
        AssertionError: If the input `feature_name` is not of type string.
        AssertionError: If the `feature_table` does not have necessary
            columns such as 'feature_name', 'left_attribute',
            'right_attribute', 'left_attr_tokenizer',
            'right_attr_tokenizer', 'simfunction', 'function', and
            'function_source' in the DataFrame.
        AssertionError: If the `feature_name` is already present in the
            feature table.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> block_f = em.get_features_for_blocking(A, B)
        >>> def age_diff(ltuple, rtuple):
        >>>     # assume that the tuples have age attribute and values are valid numbers.
        >>>     return ltuple['age'] - rtuple['age']
        >>> status = em.add_blackbox_feature(block_f, 'age_difference', age_diff)
    """
    # Validate input parameters: the feature table must be a DataFrame and
    # the feature name a string.
    validate_object_type(feature_table, pd.DataFrame, 'Input feature table')
    validate_object_type(feature_name, six.string_types, 'Input feature name')

    # The input table must carry exactly the columns of a canonical
    # feature table.
    dummy_feature_table = create_feature_table()
    if sorted(dummy_feature_table.columns) != sorted(feature_table.columns):
        logger.error('Input feature table does not have the necessary columns')
        raise AssertionError(
            'Input feature table does not have the necessary columns')

    # Reject duplicate feature names.
    existing_names = list(feature_table['feature_name'])
    if feature_name in existing_names:
        logger.error('Input feature name is already present in feature table')
        raise AssertionError(
            'Input feature name is already present in feature table')

    # Assemble the catalog entry for the black box feature. All of the
    # declarative fields are None because the function is opaque.
    feature_dict = {
        'feature_name': feature_name,
        'function': feature_function,
        'left_attribute': None,
        'right_attribute': None,
        'left_attr_tokenizer': None,
        'right_attr_tokenizer': None,
        'simfunction': None,
        'function_source': None,
        'is_auto_generated': False,
    }

    # Append the feature as the last row. For an empty table, pin the
    # column order first so the row aligns with the canonical layout.
    if len(feature_table) > 0:
        feature_table.loc[len(feature_table)] = feature_dict
    else:
        feature_table.columns = ['feature_name', 'left_attribute',
                                 'right_attribute', 'left_attr_tokenizer',
                                 'right_attr_tokenizer', 'simfunction',
                                 'function', 'function_source',
                                 'is_auto_generated']
        feature_table.loc[len(feature_table)] = feature_dict

    # Finally return True if the addition was successful
    return True
def down_sample(table_a, table_b, size, y_param, show_progress=True,
                verbose=False, seed=None, rem_stop_words=True,
                rem_puncs=True, n_jobs=1):
    """
    This function down samples two tables A and B into smaller tables A' and
    B' respectively.

    Specifically, first it randomly selects `size` tuples
    from the table B to be table B'. Next, it builds an inverted index I
    (token, tuple_id) on table A. For each tuple x ∈ B', the algorithm
    finds a set P of k/2 tuples from I that match x,
    and a set Q of k/2 tuples randomly selected from A - P.
    The idea is for A' and B' to share some matches yet be
    as representative of A and B as possible.

    Args:
        table_a,table_b (DataFrame): The input tables A and B.
        size (int): The size that table B should be down sampled to.
        y_param (int): The parameter to control the down sample size of table A.
            Specifically, the down sampled size of table A should be close to
            size * y_param.
        show_progress (boolean): A flag to indicate whether a progress bar
            should be displayed (defaults to True).
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
        seed (int): The seed for the pseudo random number generator to select
            the tuples from A and B (defaults to None).
        rem_stop_words (boolean): A flag to indicate whether a default set of
            stop words must be removed.
        rem_puncs (boolean): A flag to indicate whether the punctuations must
            be removed from the strings.
        n_jobs (int): The number of parallel jobs to be used for computation
            (defaults to 1). If -1 all CPUs are used. If 0 or 1, no parallel
            computation is used at all, which is useful for debugging. For
            n_jobs below -1, (n_cpus + 1 + n_jobs) are used (where n_cpus is
            the total number of CPUs in the machine). Thus, for n_jobs = -2,
            all CPUs but one are used. If (n_cpus + 1 + n_jobs) is less than 1,
            then no parallel computation is used (i.e., equivalent to the
            default).

    Returns:
        Down sampled tables A and B as pandas DataFrames.

    Raises:
        AssertionError: If any of the input tables (`table_a`, `table_b`) are
            empty or not a DataFrame.
        AssertionError: If `size` or `y_param` is empty or 0 or not a
            valid integer value.
        AssertionError: If `seed` is not a valid integer value.
        AssertionError: If `verbose` is not of type bool.
        AssertionError: If `show_progress` is not of type bool.
        AssertionError: If `n_jobs` is not of type int.

    Examples:
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> sample_A, sample_B = em.down_sample(A, B, 500, 1, n_jobs=-1)

        # Example with seed = 0. This means the same sample data set will be
        # returned each time this function is run.
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> sample_A, sample_B = em.down_sample(A, B, 500, 1, seed=0, n_jobs=-1)
    """
    # Validate the input tables.
    if not isinstance(table_a, pd.DataFrame):
        logger.error('Input table A is not of type pandas DataFrame')
        raise AssertionError(
            'Input table A is not of type pandas DataFrame')

    if not isinstance(table_b, pd.DataFrame):
        logger.error('Input table B is not of type pandas DataFrame')
        raise AssertionError(
            'Input table B is not of type pandas DataFrame')

    if len(table_a) == 0 or len(table_b) == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')

    if size == 0 or y_param == 0:
        # BUGFIX: the logged message previously said 'size or y' while the
        # raised message said 'size or y_param'; keep the two consistent.
        logger.error(
            'size or y_param cannot be zero (3rd and 4th parameter of downsample)')
        raise AssertionError(
            'size or y_param cannot be zero (3rd and 4th parameter of downsample)')

    if seed is not None and not isinstance(seed, int):
        logger.error('Seed is not of type integer')
        raise AssertionError('Seed is not of type integer')

    if len(table_b) < size:
        logger.warning(
            'Size of table B is less than b_size parameter - using entire table B')

    validate_object_type(verbose, bool, 'Parameter verbose')
    validate_object_type(show_progress, bool, 'Parameter show_progress')
    validate_object_type(rem_stop_words, bool, 'Parameter rem_stop_words')
    validate_object_type(rem_puncs, bool, 'Parameter rem_puncs')
    validate_object_type(n_jobs, int, 'Parameter n_jobs')

    # Log what metadata this command relies on.
    log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

    # Build an inverted index (token -> tuple positions) over table A.
    # The tuples in all P's and Q's drawn from this index make up A' -
    # the central idea is to get good coverage in the down sampled A' and B'.
    s_inv_index = _inv_index(table_a, rem_stop_words, rem_puncs)

    # Randomly select `size` tuples from table B to be B'.
    # If a seed value has been given, use a RandomState with the given seed
    # so the sampling is reproducible.
    b_sample_size = min(math.floor(size), len(table_b))
    if seed is not None:
        rand = RandomState(seed)
    else:
        rand = RandomState()
    b_tbl_indices = list(rand.choice(len(table_b), int(b_sample_size),
                                     replace=False))

    n_jobs = get_num_procs(n_jobs, len(table_b))

    # BUGFIX: b_tbl_indices holds *positions* in [0, len(table_b)), so rows
    # must be picked with positional .iloc, not label-based .loc. The old
    # .loc lookup only happened to work for a default RangeIndex and broke
    # (wrong rows or KeyError) for any other index; note that r_sampled
    # below already (correctly) uses .iloc with the very same indices.
    sample_table_b = table_b.iloc[b_tbl_indices]

    if n_jobs <= 1:
        # Sequential path: probe the inverted index with all of B' at once
        # to find the tuples in A that share tokens with tuples in B'.
        s_tbl_indices = _probe_index_split(sample_table_b, y_param,
                                           len(table_a), s_inv_index,
                                           show_progress, seed,
                                           rem_stop_words, rem_puncs)
    else:
        # Parallel path: split B' into n_jobs chunks and probe the index
        # per chunk; only the last job displays the progress bar.
        sample_table_splits = np.array_split(sample_table_b, n_jobs)
        results = Parallel(n_jobs=n_jobs)(
            delayed(_probe_index_split)(sample_table_splits[job_index],
                                        y_param,
                                        len(table_a), s_inv_index,
                                        (show_progress and (
                                            job_index == n_jobs - 1)),
                                        seed, rem_stop_words, rem_puncs)
            for job_index in range(n_jobs)
        )
        results = map(list, results)
        # De-duplicate indices found by different jobs.
        s_tbl_indices = set(sum(results, []))

    s_tbl_indices = list(s_tbl_indices)
    l_sampled = table_a.iloc[list(s_tbl_indices)]
    r_sampled = table_b.iloc[list(b_tbl_indices)]

    # Propagate the catalog metadata of the input tables to the samples.
    if cm.is_dfinfo_present(table_a):
        cm.copy_properties(table_a, l_sampled)
    if cm.is_dfinfo_present(table_b):
        cm.copy_properties(table_b, r_sampled)

    return l_sampled, r_sampled
def dask_extract_feature_vecs(candset, attrs_before=None, feature_table=None,
                              attrs_after=None, verbose=False,
                              show_progress=True, n_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK

    This function extracts feature vectors from a DataFrame (typically a
    labeled candidate set).

    Specifically, this function uses feature
    table, ltable and rtable (that is present in the `candset`'s
    metadata) to extract feature vectors.

    Args:
        candset (DataFrame): The input candidate set for which the features
            vectors should be extracted.
        attrs_before (list): The list of attributes from the input candset,
            that should be added before the feature vectors (defaults to None).
        feature_table (DataFrame): A DataFrame containing a list of
            features that should be used to compute the feature vectors (
            defaults to None).
        attrs_after (list): The list of attributes from the input candset
            that should be added after the feature vectors (defaults to None).
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
        show_progress (boolean): A flag to indicate whether the progress of
            extracting feature vectors must be displayed (defaults to True).
        n_chunks (int): The number of partitions to split the candidate set. If
            it is set to -1, the number of partitions will be set to the
            number of cores in the machine.

    Returns:
        A pandas DataFrame containing feature vectors.

        The DataFrame will have metadata ltable and rtable, pointing
        to the same ltable and rtable as the input candset.

        Also, the output
        DataFrame will have three columns: key, foreign key ltable, foreign
        key rtable copied from input candset to the output DataFrame. These
        three columns precede the columns mentioned in `attrs_before`.

    Raises:
        AssertionError: If `candset` is not of type pandas
            DataFrame.
        AssertionError: If `attrs_before` has attributes that
            are not present in the input candset.
        AssertionError: If `attrs_after` has attribtues that
            are not present in the input candset.
        AssertionError: If `feature_table` is set to None.
        AssertionError: If `n_chunks` is not of type
                int.

    Examples:
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_extract_features import dask_extract_feature_vecs
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_f = em.get_features_for_matching(A, B)
        >>> # G is the labeled dataframe which should be converted into feature vectors
        >>> H = dask_extract_feature_vecs(G, features=match_f, attrs_before=['title'], attrs_after=['gold_labels'])
    """
    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK."
    )

    # Validate input parameters

    # # We expect the input candset to be of type pandas DataFrame.
    validate_object_type(candset, pd.DataFrame, error_prefix='Input cand.set')

    # # If the attrs_before is given, check if the attrs_before are present in
    # the input candset. (IDIOM FIX: compare against None with `is not None`,
    # not `!= None`.)
    if attrs_before is not None:
        if not ch.check_attrs_present(candset, attrs_before):
            logger.error(
                'The attributes mentioned in attrs_before is not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_before is not present '
                'in the input table')

    # # If the attrs_after is given, check if the attrs_after are present in
    # the input candset.
    if attrs_after is not None:
        if not ch.check_attrs_present(candset, attrs_after):
            logger.error(
                'The attributes mentioned in attrs_after is not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_after is not present '
                'in the input table')

    # We expect the feature table to be a valid object
    if feature_table is None:
        logger.error('Feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    # Do metadata checking
    # # Mention what metadata is required to the user
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, '
                        'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    ch.log_info(logger, 'Getting metadata from catalog', verbose)

    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(
            candset, logger, verbose)

    # # Validate metadata
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Extract features

    # # Set index on the ltable/rtable keys for fast row lookup by id.
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # Apply feature functions
    ch.log_info(logger, 'Applying feature functions', verbose)
    col_names = list(candset.columns)
    fk_ltable_idx = col_names.index(fk_ltable)
    fk_rtable_idx = col_names.index(fk_rtable)

    validate_object_type(n_chunks, int, 'Parameter n_chunks')
    validate_chunks(n_chunks)

    n_chunks = get_num_partitions(n_chunks, len(candset))

    c_splits = np.array_split(candset, n_chunks)

    # The feature table holds Python function objects; pickle it once with
    # cloudpickle so it can be shipped to the dask worker processes.
    pickled_obj = cloudpickle.dumps(feature_table)

    feat_vals_by_splits = []

    for i in range(len(c_splits)):
        partial_result = delayed(get_feature_vals_by_cand_split)(
            pickled_obj,
            fk_ltable_idx,
            fk_rtable_idx, l_df,
            r_df,
            c_splits[i], False)
        feat_vals_by_splits.append(partial_result)

    feat_vals_by_splits = delayed(wrap)(feat_vals_by_splits)
    if show_progress:
        with ProgressBar():
            feat_vals_by_splits = feat_vals_by_splits.compute(
                scheduler="processes", num_workers=get_num_cores())
    else:
        feat_vals_by_splits = feat_vals_by_splits.compute(
            scheduler="processes", num_workers=get_num_cores())

    # Flatten the per-split lists of feature-value dicts into one list.
    feat_vals = sum(feat_vals_by_splits, [])

    # Construct output table
    feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values)

    # # Rearrange the feature names in the input feature table order
    feature_names = list(feature_table['feature_name'])
    feature_vectors = feature_vectors[feature_names]

    ch.log_info(logger, 'Constructing output table', verbose)

    # # Insert attrs_before (excluding the key columns, which are inserted
    # separately below); reversed so the final order matches attrs_before.
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        attrs_before.reverse()
        for a in attrs_before:
            feature_vectors.insert(0, a, candset[a])

    # # Insert keys so that key, fk_ltable, fk_rtable come first.
    feature_vectors.insert(0, fk_rtable, candset[fk_rtable])
    feature_vectors.insert(0, fk_ltable, candset[fk_ltable])
    feature_vectors.insert(0, key, candset[key])

    # # Insert attrs_after at the end, preserving the requested order.
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        attrs_after.reverse()
        col_pos = len(feature_vectors.columns)
        for a in attrs_after:
            feature_vectors.insert(col_pos, a, candset[a])
            col_pos += 1

    # # Update the catalog
    cm.init_properties(feature_vectors)
    cm.copy_properties(candset, feature_vectors)

    # Finally, return the feature vectors
    return feature_vectors