def add_output_attributes(candset, l_output_attrs=None, r_output_attrs=None,
                          l_output_prefix='ltable_', r_output_prefix='rtable_',
                          validate=True, copy_props=True,
                          delete_from_catalog=True, verbose=False):
    """Append selected ltable/rtable attributes to a candidate set.

    The candset's catalog metadata (key, foreign keys, base tables) is used
    to look up the attribute values; the original index is preserved on the
    returned DataFrame.
    """
    # The candidate set must be a pandas DataFrame.
    if not isinstance(candset, pd.DataFrame):
        logger.error('Input object is not of type pandas data frame')
        raise AssertionError('Input object is not of type pandas data frame')

    # Pull the candset metadata from the catalog.
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)
    if validate:
        cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                          ltable, rtable, l_key, r_key,
                                          logger, verbose)

    # Remember the index so it can be restored on the result.
    saved_index = candset.index
    result = _add_output_attributes(candset, fk_ltable, fk_rtable,
                                    ltable, rtable, l_key, r_key,
                                    l_output_attrs, r_output_attrs,
                                    l_output_prefix, r_output_prefix,
                                    validate=False)
    result.set_index(saved_index, inplace=True)

    # Propagate catalog properties to the new table, optionally removing
    # the old candset's entries from the catalog.
    if copy_props:
        cm.init_properties(result)
        cm.copy_properties(candset, result)
        if delete_from_catalog:
            cm.del_all_properties(candset)
    return result
def _post_process_labelled_table(input_table, labeled_table, col_name):
    """
    Validate the label column of the labeled table (only 0s and 1s are
    allowed) and copy the catalog properties from the input table onto it.
    """
    # Values arriving from the GUI are strings; normalize them to ints.
    labeled_table[col_name] = labeled_table[col_name].astype(int)

    # Count how many entries are either 0 or 1.
    is_one = labeled_table[col_name] == 1
    is_zero = labeled_table[col_name] == 0
    valid_count = sum(is_one | is_zero)

    # Any other value is an error.
    if valid_count != len(labeled_table):
        logger.error('The label column contains values other than 0 and 1')
        raise AssertionError(
            'The label column contains values other than 0 and 1')

    # The key column is untouched by labeling, so no key-integrity re-check
    # is needed here; just mirror the catalog properties.
    cm.init_properties(labeled_table)
    cm.copy_properties(input_table, labeled_table)
    return labeled_table
def test_eval_matches_valid_2(self):
    """Evaluation where one gold label disagrees with all-ones predictions."""
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    C1 = C[['_id', 'ltable_ID', 'rtable_ID']]
    num_ones = 1
    num_zeros = len(C1) - num_ones
    # Gold: a single 0 followed by 1s; predictions: all 1s.
    gold = [0] * num_ones + [1] * num_zeros
    predicted = [1] * (num_zeros + num_ones)
    n_cols = len(C1.columns)
    C1.insert(n_cols, 'gold', gold)
    C1.insert(n_cols + 1, 'predicted', predicted)
    cm.copy_properties(C, C1)

    result = eval_matches(C1, 'predicted', 'gold')

    self.assertEqual(isinstance(result, dict), True)
    self.assertEqual(result['prec_numerator'], 14)
    self.assertEqual(result['prec_denominator'], 14)
    self.assertAlmostEqual(result['precision'], 1)
    self.assertEqual(result['recall_numerator'], 14)
    self.assertEqual(result['recall_denominator'], 15)
    self.assertEqual(result['recall'], 0.9333333333333333)
    self.assertEqual(result['f1'], 0.9655172413793104)
    self.assertEqual(result['pred_pos_num'], 14)
    self.assertEqual(result['false_pos_num'], 0.0)
    self.assertEqual(len(result['false_pos_ls']), 0)
    self.assertEqual(result['pred_neg_num'], 1)
    self.assertEqual(result['false_neg_num'], 1.0)
    self.assertEqual(len(result['false_neg_ls']), 1)
    first_miss = result['false_neg_ls'][0]
    self.assertEqual(first_miss[0], 'a1')
    self.assertEqual(first_miss[1], 'b1')
def test_eval_matches_valid_3(self):
    """Evaluating an empty candidate set yields all-zero metrics."""
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    C1 = C[['_id', 'ltable_ID', 'rtable_ID']]
    num_ones = len(C1)
    num_zeros = len(C1) - num_ones
    gold = [0] * num_ones
    predicted = [1] * (num_zeros + num_ones)
    n_cols = len(C1.columns)
    C1.insert(n_cols, 'gold', gold)
    C1.insert(n_cols + 1, 'predicted', predicted)
    # D is an empty frame with the same schema (and metadata) as C1.
    D = pd.DataFrame(columns=C1.columns)
    cm.copy_properties(C, D)

    result = eval_matches(D, 'gold', 'predicted')

    self.assertEqual(isinstance(result, dict), True)
    self.assertEqual(result['prec_numerator'], 0)
    self.assertEqual(result['prec_denominator'], 0)
    self.assertAlmostEqual(result['precision'], 0)
    self.assertEqual(result['recall_numerator'], 0)
    self.assertEqual(result['recall_denominator'], 0)
    self.assertEqual(result['recall'], 0)
    self.assertEqual(result['f1'], 0)
    self.assertEqual(result['pred_pos_num'], 0)
    self.assertEqual(result['false_pos_num'], 0.0)
    self.assertEqual(len(result['false_pos_ls']), 0)
    self.assertEqual(result['pred_neg_num'], 0)
    self.assertEqual(result['false_neg_num'], 0.0)
    self.assertEqual(len(result['false_neg_ls']), 0)
def test_eval_matches_valid_3(self):
    """Evaluation over an empty table: every metric should be zero."""
    # NOTE(review): this duplicates another test of the same name in this
    # file; Python keeps only the later definition — confirm intent.
    left = read_csv_metadata(path_a)
    right = read_csv_metadata(path_b, key='ID')
    cand = read_csv_metadata(path_c, ltable=left, rtable=right)
    trimmed = cand[['_id', 'ltable_ID', 'rtable_ID']]
    num_ones = len(trimmed)
    num_zeros = len(trimmed) - num_ones
    gold = [0] * num_ones
    predicted = [1] * (num_zeros + num_ones)
    width = len(trimmed.columns)
    trimmed.insert(width, 'gold', gold)
    trimmed.insert(width + 1, 'predicted', predicted)
    empty = pd.DataFrame(columns=trimmed.columns)
    cm.copy_properties(cand, empty)

    result = eval_matches(empty, 'gold', 'predicted')

    self.assertEqual(isinstance(result, dict), True)
    self.assertEqual(result['prec_numerator'], 0)
    self.assertEqual(result['prec_denominator'], 0)
    self.assertAlmostEqual(result['precision'], 0)
    self.assertEqual(result['recall_numerator'], 0)
    self.assertEqual(result['recall_denominator'], 0)
    self.assertEqual(result['recall'], 0)
    self.assertEqual(result['f1'], 0)
    self.assertEqual(result['pred_pos_num'], 0)
    self.assertEqual(result['false_pos_num'], 0.0)
    self.assertEqual(len(result['false_pos_ls']), 0)
    self.assertEqual(result['pred_neg_num'], 0)
    self.assertEqual(result['false_neg_num'], 0.0)
    self.assertEqual(len(result['false_neg_ls']), 0)
def rename_col(df, old_col_name, new_col_name):
    """Rename a column and keep the catalog metadata in sync.

    If the renamed column is the key (or, for candidate sets, a foreign
    key), the corresponding catalog entry is updated on the new frame.
    """
    renamed = df.rename(columns={old_col_name: new_col_name})
    if cm.is_dfinfo_present(df):
        cm.init_properties(renamed)
        cm.copy_properties(df, renamed)
        if _is_table_or_candset(df):
            if _is_table(df):
                # Plain table: only the key may need updating.
                if cm.get_key(df) == old_col_name:
                    cm.set_key(renamed, new_col_name)
            else:
                # Candidate set: the renamed column may be the key or one
                # of the foreign keys.
                key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
                    cm.get_metadata_for_candset(df, logger, False)
                if old_col_name == key:
                    cm.set_key(renamed, new_col_name)
                elif old_col_name == fk_ltable:
                    cm.set_fk_ltable(renamed, new_col_name)
                elif old_col_name == fk_rtable:
                    cm.set_fk_rtable(renamed, new_col_name)
    return renamed
def get_false_negatives_as_df(table, eval_summary, verbose=False):
    """
    Select only the false negatives from the input table and return them as
    a DataFrame based on the evaluation results.

    Args:
        table (DataFrame): The input table (pandas DataFrame) that was used
            for evaluation.
        eval_summary (dictionary): A Python dictionary containing evaluation
            results, typically from the 'eval_matches' command.

    Returns:
        A pandas DataFrame containing only the false negatives from
        the input table. The output DataFrame's properties are set the
        same as the input DataFrame's.
    """
    # The input must be a pandas DataFrame.
    if not isinstance(table, pd.DataFrame):
        logger.error('Input cand.set is not of type dataframe')
        raise AssertionError('Input cand.set is not of type dataframe')

    # Tell the user which metadata this command relies on.
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, '
                        'ltable, rtable, ltable key, rtable key', verbose)

    # Fetch the candset metadata from the catalog.
    ch.log_info(logger, 'Getting metadata from the catalog', verbose)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # Make sure the metadata is consistent with the table.
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Keep just the rows flagged as false negatives.
    false_negatives = _get_dataframe(table, eval_summary['false_neg_ls'])

    # Propagate catalog properties to the result.
    ch.log_info(logger, 'Updating catalog', verbose)
    cm.init_properties(false_negatives)
    cm.copy_properties(table, false_negatives)

    ch.log_info(logger, 'Returning the dataframe', verbose)
    return false_negatives
def test_copy_properties_update_false_2(self):
    """copy_properties with replace=False onto a fresh frame still copies."""
    src = read_csv_metadata(path_a)
    dst = pd.read_csv(path_a)
    cm.copy_properties(src, dst, replace=False)
    self.assertEqual(cm.get_all_properties(src), cm.get_all_properties(dst))
    self.assertEqual(cm.get_key(dst), cm.get_key(src))
def _predict(self, x=None, table=None, exclude_attrs=None, target_attr=None,
             append=False, return_probs=False, probs_attr=None, inplace=True,
             copy_props=True):
    """
    Delegated function from predict.
    """
    # sk-learn style invocation: x holds the projected feature vectors.
    if x is not None:
        return self._predict_sklearn(x, return_prob=return_probs)

    # Table-based invocation requires both the table and the exclude list;
    # anything else is an unsupported call signature.
    if table is None or exclude_attrs is None:
        raise SyntaxError('The arguments supplied does not match '
                          'the signatures supported !!!')

    y = self._predict_ex_attrs(table, exclude_attrs, return_prob=return_probs)
    if target_attr is None or append is not True:
        return y

    # Append the predictions (and optionally probabilities) either to the
    # input table itself or to a fresh copy of it.
    target = table if inplace else table.copy()
    if return_probs:
        target[target_attr] = y[0]
        target[probs_attr] = y[1]
    else:
        target[target_attr] = y
    if not inplace and copy_props:
        # The copy starts without catalog entries; restore them.
        cm.copy_properties(table, target)
    return target
def test_copy_properties_valid_1(self):
    """Properties copied onto a plain frame must match the source's."""
    src = read_csv_metadata(path_a)
    dst = pd.read_csv(path_a)
    cm.copy_properties(src, dst)
    self.assertEqual(cm.is_dfinfo_present(dst), True)
    self.assertEqual(cm.get_all_properties(src), cm.get_all_properties(dst))
    self.assertEqual(cm.get_key(dst), cm.get_key(src))
def filter_rows(df, condn):
    """Filter rows of df with a query condition, preserving metadata."""
    filtered = df.query(condn)
    # Carry the catalog metadata over when the source is a registered
    # table or candidate set.
    if cm.is_dfinfo_present(df) and _is_table_or_candset(df):
        cm.init_properties(filtered)
        cm.copy_properties(df, filtered)
    return filtered
def mutate_col(df, **kwargs):
    """Add or overwrite columns via DataFrame.assign, preserving metadata."""
    mutated = df.assign(**kwargs)
    # Mirror the catalog properties on the new frame when present.
    if cm.is_dfinfo_present(df):
        cm.init_properties(mutated)
        cm.copy_properties(df, mutated)
    return mutated
def _predict(self, x=None, table=None, exclude_attrs=None, target_attr=None,
             append=False, return_probs=False, probs_attr=None, inplace=True,
             copy_props=True):
    """
    Delegated function from predict.
    """
    if x is not None:
        # Mimic the scikit-learn predict interface.
        predictions = self._predict_sklearn(x, return_prob=return_probs)
    elif table is not None and exclude_attrs is not None:
        predictions = self._predict_ex_attrs(table, exclude_attrs,
                                             return_prob=return_probs)
        if target_attr is not None and append is True:
            if inplace:
                # Write the predictions straight into the input table.
                if return_probs:
                    table[target_attr] = predictions[0]
                    table[probs_attr] = predictions[1]
                else:
                    table[target_attr] = predictions
                return table
            # Otherwise work on a copy of the input table.
            out_table = table.copy()
            if return_probs:
                out_table[target_attr] = predictions[0]
                out_table[probs_attr] = predictions[1]
            else:
                out_table[target_attr] = predictions
            if copy_props:
                cm.copy_properties(table, out_table)
            return out_table
    else:
        # Unsupported combination of arguments.
        raise SyntaxError('The arguments supplied does not match '
                          'the signatures supported !!!')
    return predictions
def _test_label_table(self, table, col_name, label_values):
    """Label `table` programmatically (GUI round-trip skipped) and
    post-process the result."""
    _validate_inputs(table, col_name, verbose=False)
    lbl_table = _init_label_table(table, col_name)
    # Emulate what the GUI edit would produce: a copy of the label table
    # carrying the source table's catalog properties.
    replica = lbl_table.copy()
    cm.copy_properties(table, replica)
    replica[col_name] = label_values
    return _post_process_labelled_table(table, replica, col_name)
def test_copy_properties_valid_2(self):
    """Copied candset properties (key, tables, fks) must match the source's.

    Bug fix: the original read both property dicts from C1, comparing the
    copy against itself — a vacuous check. Compare source C against copy C1,
    mirroring the sibling table-level tests.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b)
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    C1 = pd.read_csv(path_c)
    cm.copy_properties(C, C1)
    self.assertEqual(cm.is_dfinfo_present(C1), True)
    # Compare the SOURCE's properties with the copy's (was C1 vs C1).
    p = cm.get_all_properties(C)
    p1 = cm.get_all_properties(C1)
    self.assertEqual(p, p1)
    self.assertEqual(cm.get_key(C1), cm.get_key(C))
    self.assertEqual(cm.get_ltable(C1).equals(A), True)
    self.assertEqual(cm.get_rtable(C1).equals(B), True)
    self.assertEqual(cm.get_fk_ltable(C1), cm.get_fk_ltable(C))
    self.assertEqual(cm.get_fk_rtable(C1), cm.get_fk_rtable(C))
def test_eval_matches_predicted_attr_not_in_df(self):
    """eval_matches called with a predicted-attr absent from the table."""
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    C1 = C[['_id', 'ltable_ID', 'rtable_ID']]
    num_ones = 1
    num_zeros = len(C1) - num_ones
    gold = [0] * num_ones + [1] * num_zeros
    predicted = [1] * (num_zeros + num_ones)
    n_cols = len(C1.columns)
    C1.insert(n_cols, 'gold', gold)
    C1.insert(n_cols + 1, 'predicted', predicted)
    cm.copy_properties(C, C1)
    # 'predicted1' does not exist in C1; presumably an expected-exception
    # decorator outside this view checks the raise — confirm in the class.
    result = eval_matches(C1, 'gold', 'predicted1')
def drop_cols(df, col_list):
    """
    Drop the given columns from df, preserving catalog metadata.

    Key and foreign-key columns of a registered table/candset are silently
    removed from the drop list so the catalog metadata stays valid.

    Args:
        df (DataFrame): The input pandas DataFrame.
        col_list (list or str): Column name(s) to drop.

    Returns:
        A new DataFrame without the dropped columns; catalog properties are
        copied over when the input is registered.
    """
    if not isinstance(col_list, list):
        col_list = [col_list]
    if cm.is_dfinfo_present(df):
        if _is_table_or_candset(df):
            if not _is_table(df):
                # Candidate set: never drop the key or the foreign keys.
                key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key \
                    = cm.get_metadata_for_candset(df, logger, False)
                col_list = gh.list_diff(col_list, [key, fk_ltable, fk_rtable])
            else:
                # Plain table: never drop the key.
                key = cm.get_key(df)
                col_list = gh.list_diff(col_list, [key])
            col_list = gh.list_drop_duplicates(col_list)
        new_df = df.drop(col_list, axis=1)
        cm.init_properties(new_df)
        cm.copy_properties(df, new_df)
    else:
        # Bug fix: this branch previously did `df[col_list]`, which
        # PROJECTED onto the columns that should have been removed
        # (copy-paste from project_cols). Actually drop them instead.
        new_df = df.drop(col_list, axis=1)
    return new_df
def preserve_metadata(df, new_df):
    """Copy catalog metadata from df onto new_df when it is safe to do so.

    The copy is skipped (with a warning) if new_df no longer contains the
    key / foreign-key columns the metadata refers to.
    """
    if cm.is_dfinfo_present(df):
        if _is_table_or_candset(df):
            if _is_table(df):
                required = [cm.get_key(df)]
            else:
                key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key \
                    = cm.get_metadata_for_candset(df, logger, False)
                required = [key, fk_ltable, fk_rtable]
            # Without these columns the copied metadata would dangle.
            if not ch.check_attrs_present(new_df, required):
                logger.warning('Not setting the metadata as some attrs '
                               'are not present')
                return new_df
        cm.init_properties(new_df)
        cm.copy_properties(df, new_df)
    return new_df
def project_cols(df, col_list):
    """Project df onto col_list, always keeping key/fk columns and metadata."""
    if not isinstance(col_list, list):
        col_list = [col_list]
    if cm.is_dfinfo_present(df):
        if _is_table_or_candset(df):
            if not _is_table(df):
                # Candidate set: lead with the key and both foreign keys.
                key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key \
                    = cm.get_metadata_for_candset(df, logger, False)
                mandatory = [key, fk_ltable, fk_rtable]
            else:
                # Plain table: lead with the key.
                mandatory = [cm.get_key(df)]
            col_list = gh.list_drop_duplicates(mandatory + col_list)
        new_df = df[col_list]
        cm.init_properties(new_df)
        cm.copy_properties(df, new_df)
    else:
        new_df = df[col_list]
    return new_df
def predict(self, x=None, table=None, exclude_attrs=None, target_attr=None,
            append=False, return_probs=False, probs_attr=None, inplace=True):
    """
    Predict interface for the matcher.

    There are two ways to call this method. First, an interface similar to
    scikit-learn where the feature vectors are given as a projected
    DataFrame. Second, give the DataFrame and explicitly specify the
    feature vectors (by specifying the attributes to be excluded). All
    input parameters default to None so a single function can support both
    interfaces.

    Args:
        x (DataFrame): The input pandas DataFrame containing only feature
            vectors (defaults to None).
        table (DataFrame): The input pandas DataFrame containing feature
            vectors, and possibly other attributes (defaults to None).
        exclude_attrs (list): A list of attributes to be excluded from the
            input table to get the feature vectors (defaults to None).
        target_attr (string): The attribute name where the predictions
            need to be stored in the input table (defaults to None).
        probs_attr (string): The attribute name where the prediction
            probabilities need to be stored in the input table
            (defaults to None).
        append (boolean): A flag to indicate whether the predictions need
            to be appended in the input DataFrame (defaults to False).
        return_probs (boolean): A flag to indicate whether the prediction
            probabilities need to be returned (defaults to False). If set
            to True, returns the probability that the pair is a match.
        inplace (boolean): A flag to indicate whether the append needs to
            be done inplace (defaults to True).

    Returns:
        An array of predictions or a DataFrame with predictions updated.
    """
    # sk-learn style invocation.
    if x is not None:
        y = self._predict_sklearn(x, return_prob=return_probs)
    elif table is not None and exclude_attrs is not None:
        # Table-based invocation.
        y = self._predict_ex_attrs(table, exclude_attrs,
                                   return_prob=return_probs)
        if target_attr is not None and append is True:
            # Choose the frame to write into: the table itself or a copy.
            destination = table if inplace else table.copy()
            if return_probs:
                destination[target_attr] = y[0]
                destination[probs_attr] = y[1]
            else:
                destination[target_attr] = y
            if not inplace:
                # The copy starts without catalog entries; restore them.
                cm.copy_properties(table, destination)
            return destination
    else:
        raise SyntaxError('The arguments supplied does not match '
                          'the signatures supported !!!')
    # Return the raw predictions when no append was requested.
    return y
def split_train_test(labeled_data, train_proportion=0.5, random_state=None,
                     verbose=True):
    """
    This function splits the input data into train and test.

    Specifically, this function is just a wrapper of scikit-learn's
    train_test_split function. It also takes care of copying the metadata
    from the input table to the train and test splits.

    Args:
        labeled_data (DataFrame): The input pandas DataFrame that needs to
            be split into train and test.
        train_proportion (float): A number between 0 and 1, indicating the
            proportion of tuples that should be included in the train split
            (defaults to 0.5).
        random_state (object): A number or random number object (as in
            scikit-learn).
        verbose (boolean): A flag to indicate whether debug information
            should be displayed.

    Returns:
        A Python dictionary with keys 'train' and 'test', each mapping to a
        pandas DataFrame; both output DataFrames get the same properties as
        the input DataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> # G is the labeled data or the feature vectors that should be split
        >>> train_test = em.split_train_test(G, train_proportion=0.5)
        >>> train, test = train_test['train'], train_test['test']
    """
    # Validate input parameters
    # # We expect labeled data to be of type pandas DataFrame
    if not isinstance(labeled_data, pd.DataFrame):
        logger.error('Input table is not of type DataFrame')
        raise AssertionError('Input table is not of type DataFrame')

    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, '
                        'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(labeled_data, logger, verbose)

    # # Validate metadata
    cm._validate_metadata_for_candset(labeled_data, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    num_rows = len(labeled_data)
    # We expect the train proportion to be between 0 and 1.
    assert train_proportion >= 0 and train_proportion <= 1, \
        " Train proportion is expected to be between 0 and 1"
    # We expect the number of rows in the table to be non-empty
    assert num_rows > 0, 'The input table is empty'

    # Explicitly get the train and test size in terms of tuples (based on
    # the given proportion)
    train_size = int(math.floor(num_rows * train_proportion))
    test_size = int(num_rows - train_size)

    # Use sk-learn to split the index labels.
    # Bug fix: `pd.np` and `DataFrame.ix` were removed from pandas; use the
    # index values directly and label-based `.loc` for the row selection.
    idx_values = labeled_data.index.values
    idx_train, idx_test = ms.train_test_split(idx_values,
                                              test_size=test_size,
                                              train_size=train_size,
                                              random_state=random_state)

    # Construct output tables by label-based selection.
    label_train = labeled_data.loc[idx_train]
    label_test = labeled_data.loc[idx_test]

    # Update catalog
    cm.init_properties(label_train)
    cm.copy_properties(labeled_data, label_train)
    cm.init_properties(label_test)
    cm.copy_properties(labeled_data, label_test)

    # Return output tables
    result = OrderedDict()
    result['train'] = label_train
    result['test'] = label_test
    return result
def impute_table(table, exclude_attrs=None, missing_val='NaN',
                 strategy='mean', axis=0, val_all_nans=0, verbose=True):
    """
    Impute table containing missing values.

    Args:
        table (DataFrame): DataFrame whose values should be imputed.
        exclude_attrs (List): list of attribute names to be excluded from
            imputing (defaults to None).
        missing_val (string or int): The placeholder for the missing values.
            All occurrences of `missing_val` will be imputed. For missing
            values encoded as np.nan, use the string value 'NaN'
            (defaults to 'NaN').
        strategy (string): String that specifies how to impute values. Valid
            strings: 'mean', 'median', 'most_frequent' (defaults to 'mean').
        axis (int): axis=1 along rows, axis=0 along columns (defaults to 0).
        val_all_nans (float): Value to fill in if all the values in the
            column are NaN.

    Returns:
        Imputed DataFrame.

    Raises:
        AssertionError: If `table` is not of type pandas DataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> # H is the feature vector which should be imputed. Specifically,
        >>> # impute the missing values in each column with the mean of that
        >>> # column
        >>> H = em.impute_table(H, exclude_attrs=['_id', 'ltable_id', 'rtable_id'], strategy='mean')
    """
    # Validate input parameters
    # # We expect the input table to be of type pandas DataFrame
    if not isinstance(table, pd.DataFrame):
        logger.error('Input table is not of type DataFrame')
        raise AssertionError('Input table is not of type DataFrame')

    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, '
                        'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # # Validate metadata
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    fv_columns = table.columns
    # Idiom fix: compare against None with `is`, not `==`.
    if exclude_attrs is None:
        feature_names = fv_columns
    else:
        # Check if the exclude attributes are present in the input table
        if not ch.check_attrs_present(table, exclude_attrs):
            logger.error('The attributes mentioned in exclude_attrs '
                         'is not present '
                         'in the input table')
            raise AssertionError('The attributes mentioned in exclude_attrs '
                                 'is not present '
                                 'in the input table')
        # We expect exclude attributes to be of type list. If not, convert
        # it into a list.
        if not isinstance(exclude_attrs, list):
            exclude_attrs = [exclude_attrs]
        # Drop the duplicates from the exclude attributes
        exclude_attrs = gh.list_drop_duplicates(exclude_attrs)
        cols = [c not in exclude_attrs for c in fv_columns]
        feature_names = fv_columns[cols]

    table_copy = table.copy()
    projected_table = table_copy[feature_names]
    projected_table_values = projected_table.values

    # NOTE(review): sklearn's Imputer was removed in scikit-learn 0.22 in
    # favor of SimpleImputer (which has no `axis`); migrate when the
    # project's sklearn floor allows it.
    imp = Imputer(missing_values=missing_val, strategy=strategy, axis=axis)
    imp.fit(projected_table_values)
    # Columns that are entirely NaN get val_all_nans as their fill value.
    # Bug fix: `pd.np` was removed from pandas; pd.isnull handles ndarrays.
    imp.statistics_[pd.isnull(imp.statistics_)] = val_all_nans
    table_copy[feature_names] = imp.transform(projected_table_values)

    # Update catalog
    cm.init_properties(table_copy)
    cm.copy_properties(table, table_copy)
    return table_copy
def dask_extract_feature_vecs(candset, attrs_before=None, feature_table=None,
                              attrs_after=None, verbose=False,
                              show_progress=True, n_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK

    This function extracts feature vectors from a DataFrame (typically a
    labeled candidate set), using the feature table and the ltable/rtable
    recorded in the candset's metadata, parallelized with dask.

    Args:
        candset (DataFrame): The input candidate set for which the feature
            vectors should be extracted.
        attrs_before (list): Attributes from the input candset to be added
            before the feature vectors (defaults to None).
        feature_table (DataFrame): A DataFrame containing a list of
            features that should be used to compute the feature vectors
            (defaults to None).
        attrs_after (list): Attributes from the input candset to be added
            after the feature vectors (defaults to None).
        verbose (boolean): A flag to indicate whether debug information
            should be displayed (defaults to False).
        show_progress (boolean): A flag to indicate whether progress should
            be displayed (defaults to True).
        n_chunks (int): The number of partitions to split the candidate
            set. If set to -1, the number of partitions will be set to the
            number of cores in the machine.

    Returns:
        A pandas DataFrame containing feature vectors, with the candset's
        key, fk ltable and fk rtable columns copied in front, preceded/
        followed by `attrs_before`/`attrs_after`; ltable and rtable
        metadata point to the same tables as the input candset.

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `attrs_before` has attributes that are not
            present in the input candset.
        AssertionError: If `attrs_after` has attributes that are not
            present in the input candset.
        AssertionError: If `feature_table` is set to None.
        AssertionError: If `n_chunks` is not of type int.

    Examples:
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_extract_features import dask_extract_feature_vecs
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_f = em.get_features_for_matching(A, B)
        >>> H = dask_extract_feature_vecs(G, features=match_f, attrs_before=['title'], attrs_after=['gold_labels'])
    """
    logger.warning("WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. "
                   "USE AT YOUR OWN RISK.")

    # Validate input parameters
    # # We expect the input candset to be of type pandas DataFrame.
    validate_object_type(candset, pd.DataFrame, error_prefix='Input cand.set')

    # Idiom fix throughout: compare against None with `is (not)`, not `!=`.
    # # If attrs_before is given, check that they exist in the candset.
    if attrs_before is not None:
        if not ch.check_attrs_present(candset, attrs_before):
            logger.error('The attributes mentioned in attrs_before is not present '
                         'in the input table')
            raise AssertionError('The attributes mentioned in attrs_before is not present '
                                 'in the input table')

    # # If attrs_after is given, check that they exist in the candset.
    if attrs_after is not None:
        if not ch.check_attrs_present(candset, attrs_after):
            logger.error('The attributes mentioned in attrs_after is not present '
                         'in the input table')
            raise AssertionError('The attributes mentioned in attrs_after is not present '
                                 'in the input table')

    # We expect the feature table to be a valid object
    if feature_table is None:
        logger.error('Feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    # Do metadata checking
    # # Mention what metadata is required to the user
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, '
                        'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    ch.log_info(logger, 'Getting metadata from catalog', verbose)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # # Validate metadata
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Extract features
    # # Set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # Apply feature functions
    ch.log_info(logger, 'Applying feature functions', verbose)
    col_names = list(candset.columns)
    fk_ltable_idx = col_names.index(fk_ltable)
    fk_rtable_idx = col_names.index(fk_rtable)

    validate_object_type(n_chunks, int, 'Parameter n_chunks')
    validate_chunks(n_chunks)
    n_chunks = get_num_partitions(n_chunks, len(candset))

    c_splits = np.array_split(candset, n_chunks)

    # The feature table is pickled once and shipped to every worker.
    pickled_obj = cloudpickle.dumps(feature_table)

    feat_vals_by_splits = []
    for c_split in c_splits:
        partial_result = delayed(get_feature_vals_by_cand_split)(
            pickled_obj, fk_ltable_idx, fk_rtable_idx, l_df, r_df,
            c_split, False)
        feat_vals_by_splits.append(partial_result)

    feat_vals_by_splits = delayed(wrap)(feat_vals_by_splits)
    if show_progress:
        with ProgressBar():
            feat_vals_by_splits = feat_vals_by_splits.compute(
                scheduler="processes", num_workers=get_num_cores())
    else:
        feat_vals_by_splits = feat_vals_by_splits.compute(
            scheduler="processes", num_workers=get_num_cores())

    # Flatten the per-split lists of feature dicts.
    feat_vals = sum(feat_vals_by_splits, [])

    # Construct output table
    feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values)
    # # Rearrange the feature names in the input feature table order
    feature_names = list(feature_table['feature_name'])
    feature_vectors = feature_vectors[feature_names]

    ch.log_info(logger, 'Constructing output table', verbose)

    # # Insert attrs_before (reversed so the final order matches the input)
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        attrs_before.reverse()
        for a in attrs_before:
            feature_vectors.insert(0, a, candset[a])

    # # Insert keys
    feature_vectors.insert(0, fk_rtable, candset[fk_rtable])
    feature_vectors.insert(0, fk_ltable, candset[fk_ltable])
    feature_vectors.insert(0, key, candset[key])

    # # Insert attrs_after
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        attrs_after.reverse()
        col_pos = len(feature_vectors.columns)
        for a in attrs_after:
            feature_vectors.insert(col_pos, a, candset[a])
            col_pos += 1

    # # Update the catalog
    cm.init_properties(feature_vectors)
    cm.copy_properties(candset, feature_vectors)

    # Finally, return the feature vectors
    return feature_vectors
def test_copy_properties_src_df_notin_catalog(self):
    """Copying from a frame that was never registered in the catalog."""
    src = pd.read_csv(path_a)
    dst = pd.read_csv(path_a)
    # src is not in the catalog; presumably an expected-exception decorator
    # outside this view checks the raise — confirm in the test class.
    cm.copy_properties(src, dst)
def extract_feature_vecs(candset, attrs_before=None, feature_table=None,
                         attrs_after=None, verbose=False,
                         show_progress=True):
    """
    This function extracts feature vectors from a DataFrame (typically a
    labeled candidate set).

    Specifically, this function uses feature table, ltable and rtable
    (that is present in the `candset`'s metadata) to extract feature
    vectors.

    Args:
        candset (DataFrame): The input candidate set for which the features
            vectors should be extracted.
        attrs_before (list): The list of attributes from the input candset,
            that should be added before the feature vectors (defaults to
            None).
        feature_table (DataFrame): A DataFrame containing a list of
            features that should be used to compute the feature vectors (
            defaults to None).
        attrs_after (list): The list of attributes from the input candset
            that should be added after the feature vectors (defaults to
            None).
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
        show_progress (boolean): A flag to indicate whether the progress of
            extracting feature vectors must be displayed (defaults to True).

    Returns:
        A pandas DataFrame containing feature vectors.

        The DataFrame will have metadata ltable and rtable, pointing
        to the same ltable and rtable as the input candset.

        Also, the output DataFrame will have three columns: key, foreign
        key ltable, foreign key rtable copied from input candset to the
        output DataFrame. These three columns precede the columns mentioned
        in `attrs_before`.

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `attrs_before` has attributes that are not
            present in the input candset.
        AssertionError: If `attrs_after` has attribtues that are not present
            in the input candset.
        AssertionError: If `feature_table` is set to None.
    """
    # Validate input parameters.
    # We expect the input candset to be of type pandas DataFrame.
    if not isinstance(candset, pd.DataFrame):
        logger.error('Input cand.set is not of type dataframe')
        raise AssertionError('Input cand.set is not of type dataframe')

    # If attrs_before is given, check that those attributes are present in
    # the input candset. (Fixed: compare with `is not None`, not `!= None`.)
    if attrs_before is not None:
        if not ch.check_attrs_present(candset, attrs_before):
            logger.error(
                'The attributes mentioned in attrs_before is not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_before is not present '
                'in the input table')

    # If attrs_after is given, check that those attributes are present in
    # the input candset.
    if attrs_after is not None:
        if not ch.check_attrs_present(candset, attrs_after):
            logger.error(
                'The attributes mentioned in attrs_after is not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_after is not present '
                'in the input table')

    # We expect the feature table to be a valid object.
    if feature_table is None:
        logger.error('Feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    # Do metadata checking.
    # Mention what metadata is required to the user.
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, '
                        'ltable, rtable, ltable key, rtable key', verbose)

    # Get metadata.
    ch.log_info(logger, 'Getting metadata from catalog', verbose)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # Validate metadata.
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Extract features.
    # Set the key as index for O(1) tuple lookup by foreign-key value.
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(candset))

    # Apply feature functions.
    feat_vals = []
    ch.log_info(logger, 'Applying feature functions', verbose)
    col_names = list(candset.columns)
    fk_ltable_idx = col_names.index(fk_ltable)
    fk_rtable_idx = col_names.index(fk_rtable)

    # Memoize looked-up tuples: candset typically references the same
    # ltable/rtable rows many times.
    l_dict = {}
    r_dict = {}

    for row in candset.itertuples(index=False):
        if show_progress:
            prog_bar.update()

        fk_ltable_val = row[fk_ltable_idx]
        fk_rtable_val = row[fk_rtable_idx]

        # BUG FIX: DataFrame.ix was deprecated in pandas 0.20 and removed
        # in pandas 1.0; .loc is the label-based replacement (the frames
        # are indexed by key above, so lookups are by label).
        if fk_ltable_val not in l_dict:
            l_dict[fk_ltable_val] = l_df.loc[fk_ltable_val]
        l_tuple = l_dict[fk_ltable_val]

        if fk_rtable_val not in r_dict:
            r_dict[fk_rtable_val] = r_df.loc[fk_rtable_val]
        r_tuple = r_dict[fk_rtable_val]

        f = apply_feat_fns(l_tuple, r_tuple, feature_table)
        feat_vals.append(f)

    # Construct output table.
    feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values)

    # Rearrange the feature names in the input feature table order.
    feature_names = list(feature_table['feature_name'])
    feature_vectors = feature_vectors[feature_names]

    ch.log_info(logger, 'Constructing output table', verbose)

    # Insert attrs_before (reversed so that repeated insert-at-0 preserves
    # the caller's ordering).
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        attrs_before.reverse()
        for a in attrs_before:
            feature_vectors.insert(0, a, candset[a])

    # Insert keys so the final order is: key, fk_ltable, fk_rtable.
    feature_vectors.insert(0, fk_rtable, candset[fk_rtable])
    feature_vectors.insert(0, fk_ltable, candset[fk_ltable])
    feature_vectors.insert(0, key, candset[key])

    # Insert attrs_after at the end of the table.
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        attrs_after.reverse()
        col_pos = len(feature_vectors.columns)
        for a in attrs_after:
            feature_vectors.insert(col_pos, a, candset[a])
            col_pos += 1

    # Update the catalog: the output inherits the candset's properties.
    cm.init_properties(feature_vectors)
    cm.copy_properties(candset, feature_vectors)

    # Finally, return the feature vectors.
    return feature_vectors
def get_false_negatives_as_df(table, eval_summary, verbose=False):
    """
    Select only the false negatives from the input table and return them as
    a DataFrame, based on the evaluation results.

    Args:
        table (DataFrame): The input table (pandas DataFrame) that was used
            for evaluation.
        eval_summary (dictionary): A Python dictionary containing evaluation
            results, typically from the 'eval_matches' command.

    Returns:
        A pandas DataFrame containing only the false negatives from the
        input table. The output DataFrame's properties are set to be the
        same as the input DataFrame's.
    """
    # The evaluated table must be a pandas DataFrame.
    validate_object_type(table, pd.DataFrame, error_prefix='Input cand.set')

    # Metadata checking: tell the user which metadata this command needs.
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, '
                        'ltable, rtable, ltable key, rtable key', verbose)

    # Fetch the candset metadata from the catalog ...
    ch.log_info(logger, 'Getting metadata from the catalog', verbose)
    metadata = cm.get_metadata_for_candset(table, logger, verbose)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = metadata

    # ... and make sure it is consistent.
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Keep only the rows flagged as false negatives by the evaluation.
    false_negatives = _get_dataframe(table, eval_summary['false_neg_ls'])

    # Propagate the input table's properties to the result.
    ch.log_info(logger, 'Updating catalog', verbose)
    cm.init_properties(false_negatives)
    cm.copy_properties(table, false_negatives)

    ch.log_info(logger, 'Returning the dataframe', verbose)
    return false_negatives
def test_copy_properties_invalid_src_df(self):
    # Passing None as the *source* of copy_properties is invalid and is
    # expected to be rejected.
    target = read_csv_metadata(path_a)
    cm.copy_properties(None, target)
def test_copy_properties_invalid_tar_df(self):
    # Passing None as the *target* of copy_properties is invalid and is
    # expected to be rejected.
    source = read_csv_metadata(path_a)
    cm.copy_properties(source, None)
def predict(self, x=None, table=None, exclude_attrs=None, target_attr=None,
            append=False, return_probs=False, probs_attr=None, inplace=True,
            show_progress=False, n_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN
    RISK.

    Predict interface for the matcher.

    Specifically, there are two ways the user can call the predict method.
    First, interface similar to scikit-learn where the feature vectors are
    given as a projected DataFrame. Second, give the DataFrame and
    explicitly specify the feature vectors (by specifying the attributes
    to be excluded). All input parameters default to None so that both
    interfaces can be served by a single function.

    Currently, the Dask implementation supports only the cases when the
    table is not None and the flags inplace, append are False.

    Args:
        x (DataFrame): The input pandas DataFrame containing only feature
            vectors (defaults to None).
        table (DataFrame): The input pandas DataFrame containing feature
            vectors, and may be other attributes (defaults to None).
        exclude_attrs (list): A list of attributes to be excluded from the
            input table to get the feature vectors (defaults to None).
        target_attr (string): The attribute name where the predictions
            need to be stored in the input table (defaults to None).
        probs_attr (string): The attribute name where the prediction
            probabilities need to be stored in the input table (defaults
            to None).
        append (boolean): A flag to indicate whether the predictions need
            to be appended in the input DataFrame (defaults to False).
        return_probs (boolean): A flag to indicate whether the prediction
            probabilities need to be returned (defaults to False). If set
            to True, returns the probability if the pair was a match.
        inplace (boolean): A flag to indicate whether the append needs to
            be done inplace (defaults to True).
        show_progress (boolean): A flag to indicate whether the progress
            of extracting feature vectors must be displayed (defaults to
            True).
        n_chunks (int): The number of partitions to split the candidate
            set. If it is set to -1, the number of partitions will be set
            to the number of cores in the machine.

    Returns:
        An array of predictions or a DataFrame with predictions updated.
    """
    # BUG FIX: this used `pd.np.array_split`; the `pd.np` accessor was
    # deprecated in pandas 0.25 and removed in pandas 1.0. Import numpy
    # locally so the fix does not rely on module-level imports, and use
    # `np.array_split` (consistent with the sibling predict implementation).
    import numpy as np

    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.")

    if x is not None:
        # scikit-learn style call: x already holds the feature vectors.
        return self._predict(x, table, exclude_attrs, target_attr, append,
                             return_probs, probs_attr, inplace)
    else:
        n_chunks = get_num_partitions(n_chunks, len(table))
        if n_chunks == 1 or inplace == True or append == False:
            # When inplace=True the predictions (and probs) are added in
            # place; supporting Dask there would require changing _predict
            # (specifically _predict_sk_learn). Similarly, when
            # append=False the return value of _predict differs by case
            # (e.g. return_probs True/False), which makes recording the
            # per-chunk results tricky. So, to keep things simple, Dask is
            # used only when inplace=False and append=True.
            result = self._predict(table=table, exclude_attrs=exclude_attrs,
                                   target_attr=target_attr, append=append,
                                   return_probs=return_probs,
                                   probs_attr=probs_attr, inplace=inplace,
                                   copy_props=True)
        else:
            # Dask path: split the table, predict each chunk lazily, then
            # execute the DAG with the process scheduler.
            predicted_results = []
            splitted_tables = np.array_split(table, n_chunks)
            for i in range(len(splitted_tables)):
                partial_result = delayed(self._predict)(
                    table=splitted_tables[i],
                    exclude_attrs=exclude_attrs,
                    target_attr=target_attr,
                    append=append,
                    return_probs=return_probs,
                    probs_attr=probs_attr,
                    inplace=inplace,
                    copy_props=False)
                predicted_results.append(partial_result)
            predicted_results = delayed(wrap)(predicted_results)
            if show_progress:
                with ProgressBar():
                    predicted_results = predicted_results.compute(
                        scheduler="processes", num_workers=get_num_cores())
            else:
                predicted_results = predicted_results.compute(
                    scheduler="processes", num_workers=get_num_cores())
            result = pd.concat(predicted_results)
            # Chunk predictions were made with copy_props=False, so copy
            # the catalog properties onto the stitched result here.
            cm.copy_properties(table, result)
        return result
def dask_down_sample(ltable, rtable, size, y_param, show_progress=True,
                     verbose=False, seed=None, rem_stop_words=True,
                     rem_puncs=True, n_ltable_chunks=1,
                     n_sample_rtable_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN
    RISK.

    This command down samples two tables A and B into smaller tables A' and
    B' respectively. Specifically, first it randomly selects `size` tuples
    from the table B to be table B'. Next, it builds an inverted index I
    (token, tuple_id) on table A. For each tuple x ∈ B', the algorithm
    finds a set P of k/2 tuples from I that match x, and a set Q of k/2
    tuples randomly selected from A - P. The idea is for A' and B' to share
    some matches yet be as representative of A and B as possible.

    Args:
        ltable (DataFrame): The left input table, i.e., table A.
        rtable (DataFrame): The right input table, i.e., table B.
        size (int): The size that table B should be down sampled to.
        y_param (int): The parameter to control the down sample size of
            table A. Specifically, the down sampled size of table A should
            be close to size * y_param.
        show_progress (boolean): A flag to indicate whether a progress bar
            should be displayed (defaults to True).
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
        seed (int): The seed for the pseudo random number generator to
            select the tuples from A and B (defaults to None).
        rem_stop_words (boolean): A flag to indicate whether a default set
            of stop words must be removed.
        rem_puncs (boolean): A flag to indicate whether the punctuations
            must be removed from the strings.
        n_ltable_chunks (int): The number of partitions for ltable
            (defaults to 1). If it is set to -1, the number of partitions
            will be set to the number of cores in the machine.
        n_sample_rtable_chunks (int): The number of partitions for the
            sampled rtable (defaults to 1).

    Returns:
        Down sampled tables A and B as pandas DataFrames.

    Raises:
        AssertionError: If any of the input tables (`table_a`, `table_b`)
            are empty or not a DataFrame.
        AssertionError: If `size` or `y_param` is empty or 0 or not a
            valid integer value.
        AssertionError: If `seed` is not a valid integer value.
        AssertionError: If `verbose` is not of type bool.
        AssertionError: If `show_progress` is not of type bool.
        AssertionError: If `n_ltable_chunks` is not of type int.
        AssertionError: If `n_sample_rtable_chunks` is not of type int.
    """
    # BUG FIX: this used `pd.np.array_split` (twice); the `pd.np` accessor
    # was deprecated in pandas 0.25 and removed in pandas 1.0. Import numpy
    # locally so the fix does not rely on module-level imports.
    import numpy as np

    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN "
        "RISK.")

    # Validation checks.
    if not isinstance(ltable, pd.DataFrame):
        logger.error('Input table A (ltable) is not of type pandas DataFrame')
        raise AssertionError(
            'Input table A (ltable) is not of type pandas DataFrame')

    if not isinstance(rtable, pd.DataFrame):
        logger.error('Input table B (rtable) is not of type pandas DataFrame')
        raise AssertionError(
            'Input table B (rtable) is not of type pandas DataFrame')

    if len(ltable) == 0 or len(rtable) == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')

    if size == 0 or y_param == 0:
        logger.error(
            'size or y cannot be zero (3rd and 4th parameter of downsample)')
        raise AssertionError(
            'size or y_param cannot be zero (3rd and 4th parameter of downsample)')

    if seed is not None and not isinstance(seed, int):
        logger.error('Seed is not of type integer')
        raise AssertionError('Seed is not of type integer')

    if len(rtable) < size:
        logger.warning(
            'Size of table B is less than b_size parameter - using entire table B')

    validate_object_type(verbose, bool, 'Parameter verbose')
    validate_object_type(show_progress, bool, 'Parameter show_progress')
    validate_object_type(rem_stop_words, bool, 'Parameter rem_stop_words')
    validate_object_type(rem_puncs, bool, 'Parameter rem_puncs')
    validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks')
    validate_object_type(n_sample_rtable_chunks, int,
                         'Parameter n_sample_rtable_chunks')

    # Randomly sample the right table down to `size` tuples.
    rtable_sampled = sample_right_table(rtable, size, seed)

    # Tokenization only considers string columns of ltable.
    ltbl_str_cols = _get_str_cols_list(ltable)
    proj_ltable = ltable[ltable.columns[ltbl_str_cols]]

    if n_ltable_chunks == -1:
        n_ltable_chunks = get_num_cores()

    ltable_chunks = np.array_split(proj_ltable, n_ltable_chunks)

    # Use Dask to preprocess and tokenize strings, chunk by chunk.
    preprocessed_tokenized_tbl = []
    # start_row_id is internally used by process_tokenize_concat_strings to
    # map each tuple to its row id in the ltable.
    start_row_id = 0
    for i in range(len(ltable_chunks)):
        result = delayed(process_tokenize_concat_strings)(ltable_chunks[i],
                                                          start_row_id,
                                                          rem_puncs,
                                                          rem_stop_words)
        preprocessed_tokenized_tbl.append(result)
        # Update start_row_id for the next chunk.
        start_row_id += len(ltable_chunks[i])

    preprocessed_tokenized_tbl = delayed(wrap)(preprocessed_tokenized_tbl)

    # Now execute the DAG.
    if show_progress:
        with ProgressBar():
            logger.info('Preprocessing/tokenizing ltable')
            preprocessed_tokenized_tbl_vals = preprocessed_tokenized_tbl.compute(
                scheduler="processes", num_workers=get_num_cores())
    else:
        preprocessed_tokenized_tbl_vals = preprocessed_tokenized_tbl.compute(
            scheduler="processes", num_workers=get_num_cores())

    ltable_processed_dict = {}
    for i in range(len(preprocessed_tokenized_tbl_vals)):
        ltable_processed_dict.update(preprocessed_tokenized_tbl_vals[i])

    # Build an inverted index (token -> tuple ids) over the processed ltable.
    inverted_index = build_inverted_index(ltable_processed_dict)

    # Preprocess/tokenize the sampled rtable and probe the inverted index.
    rtbl_str_cols = _get_str_cols_list(rtable_sampled)
    proj_rtable_sampled = rtable_sampled[rtable_sampled.columns[rtbl_str_cols]]

    if n_sample_rtable_chunks == -1:
        n_sample_rtable_chunks = get_num_cores()

    rtable_chunks = np.array_split(proj_rtable_sampled,
                                   n_sample_rtable_chunks)

    # Create the probe DAG.
    probe_result = []
    for i in range(len(rtable_chunks)):
        result = delayed(probe)(rtable_chunks[i], y_param, len(proj_ltable),
                                inverted_index, rem_puncs,
                                rem_stop_words, seed)
        probe_result.append(result)

    probe_result = delayed(wrap)(probe_result)

    # Execute the probe DAG.
    if show_progress:
        with ProgressBar():
            logger.info('Probing using rtable')
            probe_result = probe_result.compute(
                scheduler="processes", num_workers=multiprocessing.cpu_count())
    else:
        probe_result = probe_result.compute(
            scheduler="processes", num_workers=multiprocessing.cpu_count())

    # Merge the per-chunk id sets and materialize the sampled ltable.
    probe_result = map(list, probe_result)
    l_tbl_indices = set(sum(probe_result, []))
    l_tbl_indices = list(l_tbl_indices)
    ltable_sampled = ltable.iloc[l_tbl_indices]

    # Update the catalog: sampled tables inherit their parents' properties.
    if cm.is_dfinfo_present(ltable):
        cm.copy_properties(ltable, ltable_sampled)
    if cm.is_dfinfo_present(rtable):
        cm.copy_properties(rtable, rtable_sampled)

    return ltable_sampled, rtable_sampled
def predict(self, x=None, table=None, exclude_attrs=None, target_attr=None,
            append=False, return_probs=False, probs_attr=None, inplace=True,
            show_progress=False, n_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN
    RISK.

    Predict interface for the matcher.

    Two call styles are supported. In the scikit-learn style, the feature
    vectors are handed in directly as the projected DataFrame `x`. In the
    table style, the full DataFrame is given together with the attributes
    to exclude in order to obtain the feature vectors. Every parameter
    defaults to None so a single method can serve both styles.

    Currently, the Dask implementation supports only the cases when the
    table is not None and the flags inplace, append are False.

    Args:
        x (DataFrame): Pandas DataFrame holding only feature vectors
            (defaults to None).
        table (DataFrame): Pandas DataFrame holding feature vectors and
            possibly other attributes (defaults to None).
        exclude_attrs (list): Attributes to drop from `table` to obtain
            the feature vectors (defaults to None).
        target_attr (string): Column name under which predictions are
            stored in the input table (defaults to None).
        probs_attr (string): Column name under which prediction
            probabilities are stored in the input table (defaults to None).
        append (boolean): Whether predictions are appended to the input
            DataFrame (defaults to False).
        return_probs (boolean): Whether match probabilities are returned
            (defaults to False).
        inplace (boolean): Whether the append happens in place (defaults
            to True).
        show_progress (boolean): Whether progress is displayed (defaults
            to True).
        n_chunks (int): Number of partitions to split the candidate set
            into; -1 means one partition per machine core.

    Returns:
        An array of predictions or a DataFrame with predictions updated.
    """
    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.")

    # scikit-learn style: feature vectors were passed in directly.
    if x is not None:
        return self._predict(x, table, exclude_attrs, target_attr, append,
                             return_probs, probs_attr, inplace)

    n_chunks = get_num_partitions(n_chunks, len(table))

    if n_chunks == 1 or inplace == True or append == False:
        # Dask is only used when the work can be split cleanly. With
        # inplace=True the predictions (and probs) are written in place,
        # which would require changing _predict (specifically
        # _predict_sk_learn). With append=False the return value of
        # _predict varies by case (e.g. return_probs True/False), making
        # per-chunk bookkeeping awkward. So, to keep things simple, Dask
        # is used only when inplace=False and append=True.
        return self._predict(table=table, exclude_attrs=exclude_attrs,
                             target_attr=target_attr, append=append,
                             return_probs=return_probs,
                             probs_attr=probs_attr, inplace=inplace,
                             copy_props=True)

    # Dask path: predict each chunk lazily, then run the DAG.
    table_chunks = np.array_split(table, n_chunks)
    chunk_results = []
    for chunk in table_chunks:
        lazy_prediction = delayed(self._predict)(table=chunk,
                                                 exclude_attrs=exclude_attrs,
                                                 target_attr=target_attr,
                                                 append=append,
                                                 return_probs=return_probs,
                                                 probs_attr=probs_attr,
                                                 inplace=inplace,
                                                 copy_props=False)
        chunk_results.append(lazy_prediction)
    chunk_results = delayed(wrap)(chunk_results)

    if show_progress:
        with ProgressBar():
            chunk_results = chunk_results.compute(
                scheduler="processes", num_workers=get_num_cores())
    else:
        chunk_results = chunk_results.compute(
            scheduler="processes", num_workers=get_num_cores())

    # Stitch the chunk predictions back together and restore the catalog
    # properties (the chunks were predicted with copy_props=False).
    stitched = pd.concat(chunk_results)
    cm.copy_properties(table, stitched)
    return stitched
def test_copy_properties_update_false_1(self):
    # Both frames are read through read_csv_metadata, so the target already
    # has catalog properties; with replace=False the copy is expected to be
    # skipped and signalled via a False status.
    first_copy = read_csv_metadata(path_a)
    second_copy = read_csv_metadata(path_a)
    outcome = cm.copy_properties(first_copy, second_copy, replace=False)
    self.assertEqual(outcome, False)
def down_sample(table_a, table_b, size, y_param, show_progress=True,
                verbose=False, seed=None):
    """
    This function down samples two tables A and B into smaller tables A' and
    B' respectively. Specifically, first it randomly selects `size` tuples
    from the table B to be table B'. Next, it builds an inverted index I
    (token, tuple_id) on table A. For each tuple x ∈ B', the algorithm
    finds a set P of k/2 tuples from I that match x, and a set Q of k/2
    tuples randomly selected from A - P. The idea is for A' and B' to share
    some matches yet be as representative of A and B as possible.

    Args:
        table_a,table_b (DataFrame): The input tables A and B.
        size (int): The size that table B should be down sampled to.
        y_param (int): The parameter to control the down sample size of
            table A. Specifically, the down sampled size of table A should
            be close to size * y_param.
        show_progress (boolean): A flag to indicate whether a progress bar
            should be displayed (defaults to True).
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
        seed (int): The seed for the pseudo random number generator to
            select the tuples from A and B (defaults to None).

    Returns:
        Down sampled tables A and B as pandas DataFrames.

    Raises:
        AssertionError: If any of the input tables (`table_a`, `table_b`)
            are empty or not a DataFrame.
        AssertionError: If `size` or `y_param` is empty or 0 or not a
            valid integer value.
        AssertionError: If `seed` is not a valid integer value.

    Examples:
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> sample_A, sample_B = em.down_sample(A, B, 500, 1)

        # Example with seed = 0. This means the same sample data set will
        # be returned each time this function is run.
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> sample_A, sample_B = em.down_sample(A, B, 500, 1, seed=0)
    """
    # Validate input parameters.
    if not isinstance(table_a, pd.DataFrame):
        logger.error('Input table A is not of type pandas DataFrame')
        raise AssertionError(
            'Input table A is not of type pandas DataFrame')

    if not isinstance(table_b, pd.DataFrame):
        logger.error('Input table B is not of type pandas DataFrame')
        raise AssertionError(
            'Input table B is not of type pandas DataFrame')

    if len(table_a) == 0 or len(table_b) == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')

    if size == 0 or y_param == 0:
        logger.error(
            'size or y cannot be zero (3rd and 4th parameter of downsample)')
        raise AssertionError(
            'size or y_param cannot be zero (3rd and 4th parameter of downsample)')

    if seed is not None and not isinstance(seed, int):
        logger.error('Seed is not of type integer')
        raise AssertionError('Seed is not of type integer')

    if len(table_b) < size:
        logger.warning(
            'Size of table B is less than b_size parameter - using entire table B')

    # Mention the required metadata to the user (key validation itself is
    # not performed here).
    log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

    # Inverted index built on table A will consist of all tuples in such
    # P's and Q's - central idea is to have good coverage in the down
    # sampled A' and B'.
    s_inv_index = _inv_index(table_a)

    # Randomly select `size` tuples from table B to be B'. Seeding the
    # RandomState makes the sample reproducible; RandomState(None) is the
    # unseeded default, so a single constructor call suffices.
    b_sample_size = min(math.floor(size), len(table_b))
    rand = RandomState(seed)
    b_tbl_indices = list(rand.choice(len(table_b), int(b_sample_size),
                                     replace=False))

    # Probe the inverted index to find all tuples in A that share tokens
    # with tuples in B'.
    # BUG FIX: this used table_b.ix[...], which was deprecated in pandas
    # 0.20 and removed in pandas 1.0. The indices come from rand.choice
    # over positions, so the positional accessor .iloc is the correct
    # replacement.
    s_tbl_indices = _probe_index(table_b.iloc[b_tbl_indices], y_param,
                                 len(table_a), s_inv_index, show_progress,
                                 seed=seed)
    s_tbl_indices = list(s_tbl_indices)
    l_sampled = table_a.iloc[s_tbl_indices]
    r_sampled = table_b.iloc[b_tbl_indices]

    # Update the catalog: sampled tables inherit their parents' properties.
    if cm.is_dfinfo_present(table_a):
        cm.copy_properties(table_a, l_sampled)
    if cm.is_dfinfo_present(table_b):
        cm.copy_properties(table_b, r_sampled)

    return l_sampled, r_sampled
def down_sample(table_a, table_b, size, y_param, show_progress=True,
                verbose=False, seed=None, rem_stop_words=True, rem_puncs=True,
                n_jobs=1):
    """
    This function down samples two tables A and B into smaller tables A' and
    B' respectively.

    Specifically, first it randomly selects `size` tuples from the table B
    to be table B'. Next, it builds an inverted index I (token, tuple_id) on
    table A. For each tuple x in B', the algorithm finds a set P of k/2
    tuples from I that match x, and a set Q of k/2 tuples randomly selected
    from A - P. The idea is for A' and B' to share some matches yet be as
    representative of A and B as possible.

    Args:
        table_a,table_b (DataFrame): The input tables A and B.
        size (int): The size that table B should be down sampled to.
        y_param (int): The parameter to control the down sample size of table
            A. Specifically, the down sampled size of table A should be close
            to size * y_param.
        show_progress (boolean): A flag to indicate whether a progress bar
            should be displayed (defaults to True).
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
        seed (int): The seed for the pseudo random number generator to select
            the tuples from A and B (defaults to None).
        rem_stop_words (boolean): A flag to indicate whether a default set of
            stop words must be removed.
        rem_puncs (boolean): A flag to indicate whether the punctuations must
            be removed from the strings.
        n_jobs (int): The number of parallel jobs to be used for computation
            (defaults to 1). If -1 all CPUs are used. If 0 or 1, no parallel
            computation is used at all, which is useful for debugging. For
            n_jobs below -1, (n_cpus + 1 + n_jobs) are used (where n_cpus is
            the total number of CPUs in the machine). Thus, for n_jobs = -2,
            all CPUs but one are used. If (n_cpus + 1 + n_jobs) is less than
            1, then no parallel computation is used (i.e., equivalent to the
            default).

    Returns:
        Down sampled tables A and B as pandas DataFrames.

    Raises:
        AssertionError: If any of the input tables (`table_a`, `table_b`) are
            empty or not a DataFrame.
        AssertionError: If `size` or `y_param` is empty or 0 or not a
            valid integer value.
        AssertionError: If `seed` is not a valid integer value.
        AssertionError: If `verbose` is not of type bool.
        AssertionError: If `show_progress` is not of type bool.
        AssertionError: If `n_jobs` is not of type int.

    Examples:
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> sample_A, sample_B = em.down_sample(A, B, 500, 1, n_jobs=-1)

        # Example with seed = 0. This means the same sample data set will be
        # returned each time this function is run.
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> sample_A, sample_B = em.down_sample(A, B, 500, 1, seed=0, n_jobs=-1)
    """
    # Validate the input tables: both must be non-empty pandas DataFrames.
    if not isinstance(table_a, pd.DataFrame):
        logger.error('Input table A is not of type pandas DataFrame')
        raise AssertionError(
            'Input table A is not of type pandas DataFrame')

    if not isinstance(table_b, pd.DataFrame):
        logger.error('Input table B is not of type pandas DataFrame')
        raise AssertionError(
            'Input table B is not of type pandas DataFrame')

    if len(table_a) == 0 or len(table_b) == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')

    if size == 0 or y_param == 0:
        logger.error(
            'size or y cannot be zero (3rd and 4th parameter of downsample)')
        raise AssertionError(
            'size or y_param cannot be zero (3rd and 4th parameter of downsample)')

    if seed is not None and not isinstance(seed, int):
        logger.error('Seed is not of type integer')
        raise AssertionError('Seed is not of type integer')

    # Sampling with replace=False caps the sample at len(table_b); warn the
    # user that the whole of B will be used.
    if len(table_b) < size:
        logger.warning(
            'Size of table B is less than b_size parameter - using entire table B')

    validate_object_type(verbose, bool, 'Parameter verbose')
    validate_object_type(show_progress, bool, 'Parameter show_progress')
    validate_object_type(rem_stop_words, bool, 'Parameter rem_stop_words')
    validate_object_type(rem_puncs, bool, 'Parameter rem_puncs')
    validate_object_type(n_jobs, int, 'Parameter n_jobs')

    # Get and validate required metadata.
    log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

    # Inverted index built on table A will consist of all tuples in such P's
    # and Q's - central idea is to have good coverage in the down sampled A'
    # and B'.
    s_inv_index = _inv_index(table_a, rem_stop_words, rem_puncs)

    # Randomly select `size` tuples from table B to be B'. If a seed value
    # has been given, use a RandomState with that seed so the sample is
    # reproducible.
    b_sample_size = min(math.floor(size), len(table_b))
    rand = RandomState(seed) if seed is not None else RandomState()
    b_tbl_indices = list(
        rand.choice(len(table_b), int(b_sample_size), replace=False))

    n_jobs = get_num_procs(n_jobs, len(table_b))

    # BUG FIX: b_tbl_indices are *positional* indices drawn from
    # range(len(table_b)), so they must be applied with iloc, not loc.
    # Using loc raised KeyError (or selected wrong rows) whenever table_b's
    # index is not the default RangeIndex, and was inconsistent with the
    # iloc-based selection of r_sampled below.
    sample_table_b = table_b.iloc[b_tbl_indices]

    if n_jobs <= 1:
        # Probe the inverted index to find all tuples in A that share tokens
        # with tuples in B'.
        s_tbl_indices = _probe_index_split(sample_table_b, y_param,
                                           len(table_a), s_inv_index,
                                           show_progress, seed,
                                           rem_stop_words, rem_puncs)
    else:
        # Split B' into n_jobs chunks and probe the inverted index in
        # parallel; only the last chunk shows the progress bar.
        sample_table_splits = np.array_split(sample_table_b, n_jobs)
        results = Parallel(n_jobs=n_jobs)(
            delayed(_probe_index_split)(sample_table_splits[job_index],
                                        y_param, len(table_a), s_inv_index,
                                        (show_progress and
                                         (job_index == n_jobs - 1)),
                                        seed, rem_stop_words, rem_puncs)
            for job_index in range(n_jobs)
        )
        # Union the per-chunk index sets to deduplicate matches found by
        # multiple workers.
        results = map(list, results)
        s_tbl_indices = set(sum(results, []))

    s_tbl_indices = list(s_tbl_indices)
    l_sampled = table_a.iloc[list(s_tbl_indices)]
    r_sampled = table_b.iloc[list(b_tbl_indices)]

    # Update the catalog: copy the input tables' properties to the samples.
    if cm.is_dfinfo_present(table_a):
        cm.copy_properties(table_a, l_sampled)
    if cm.is_dfinfo_present(table_b):
        cm.copy_properties(table_b, r_sampled)

    return l_sampled, r_sampled
def extract_feature_vecs(candset, attrs_before=None, feature_table=None,
                         attrs_after=None, verbose=False,
                         show_progress=True, n_jobs=1):
    """
    This function extracts feature vectors from a DataFrame (typically a
    labeled candidate set).

    Specifically, this function uses feature
    table, ltable and rtable (that is present in the `candset`'s
    metadata) to extract feature vectors.

    Args:
        candset (DataFrame): The input candidate set for which the features
            vectors should be extracted.
        attrs_before (list): The list of attributes from the input candset,
            that should be added before the feature vectors (defaults to None).
        feature_table (DataFrame): A DataFrame containing a list of
            features that should be used to compute the feature vectors (
            defaults to None).
        attrs_after (list): The list of attributes from the input candset
            that should be added after the feature vectors (defaults to None).
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
        show_progress (boolean): A flag to indicate whether the progress of
            extracting feature vectors must be displayed (defaults to True).

    Returns:
        A pandas DataFrame containing feature vectors.

        The DataFrame will have metadata ltable and rtable, pointing
        to the same ltable and rtable as the input candset.

        Also, the output
        DataFrame will have three columns: key, foreign key ltable, foreign
        key rtable copied from input candset to the output DataFrame. These
        three columns precede the columns mentioned in `attrs_before`.

    Raises:
        AssertionError: If `candset` is not of type pandas
            DataFrame.
        AssertionError: If `attrs_before` has attributes that
            are not present in the input candset.
        AssertionError: If `attrs_after` has attributes that
            are not present in the input candset.
        AssertionError: If `feature_table` is set to None.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_f = em.get_features_for_matching(A, B)
        >>> # G is the labeled dataframe which should be converted into feature vectors
        >>> H = em.extract_feature_vecs(G, features=match_f, attrs_before=['title'], attrs_after=['gold_labels'])
    """
    # Validate input parameters

    # # We expect the input candset to be of type pandas DataFrame.
    validate_object_type(candset, pd.DataFrame, error_prefix='Input cand.set')

    # # If the attrs_before is given, check if the attrs_before are present
    # in the input candset.  (Use `is not None` rather than `!= None`: the
    # latter can invoke arbitrary __ne__ implementations.)
    if attrs_before is not None:
        if not ch.check_attrs_present(candset, attrs_before):
            logger.error(
                'The attributes mentioned in attrs_before is not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_before is not present '
                'in the input table')

    # # If the attrs_after is given, check if the attrs_after are present in
    # the input candset.
    if attrs_after is not None:
        if not ch.check_attrs_present(candset, attrs_after):
            logger.error(
                'The attributes mentioned in attrs_after is not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_after is not present '
                'in the input table')

    # We expect the feature table to be a valid object.
    if feature_table is None:
        logger.error('Feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    # Do metadata checking
    # # Mention what metadata is required to the user
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, '
                        'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    ch.log_info(logger, 'Getting metadata from catalog', verbose)

    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(
            candset, logger, verbose)

    # # Validate metadata
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Extract features

    # # Set index on the key columns for fast id-based lookups; drop=False
    # keeps the key column available as a regular column too.
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # Apply feature functions
    ch.log_info(logger, 'Applying feature functions', verbose)
    col_names = list(candset.columns)
    fk_ltable_idx = col_names.index(fk_ltable)
    fk_rtable_idx = col_names.index(fk_rtable)

    n_procs = get_num_procs(n_jobs, len(candset))

    # Split the candset across processes; use np directly (the pd.np alias
    # was removed in pandas 1.0).
    c_splits = np.array_split(candset, n_procs)

    # Pickle the feature table once so it can be shipped to worker processes.
    pickled_obj = cloudpickle.dumps(feature_table)

    # Only the last split shows the progress bar.
    feat_vals_by_splits = Parallel(n_jobs=n_procs)(
        delayed(get_feature_vals_by_cand_split)(pickled_obj,
                                                fk_ltable_idx,
                                                fk_rtable_idx,
                                                l_df, r_df,
                                                c_splits[i],
                                                show_progress and i == len(
                                                    c_splits) - 1)
        for i in range(len(c_splits)))

    feat_vals = sum(feat_vals_by_splits, [])

    # Construct output table
    feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values)
    # # Rearrange the feature names in the input feature table order
    feature_names = list(feature_table['feature_name'])
    feature_vectors = feature_vectors[feature_names]

    ch.log_info(logger, 'Constructing output table', verbose)

    # # Insert attrs_before (reversed so insert-at-0 preserves their order).
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        attrs_before.reverse()
        for a in attrs_before:
            feature_vectors.insert(0, a, candset[a])

    # # Insert keys
    feature_vectors.insert(0, fk_rtable, candset[fk_rtable])
    feature_vectors.insert(0, fk_ltable, candset[fk_ltable])
    feature_vectors.insert(0, key, candset[key])

    # # Insert attrs_after
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        attrs_after.reverse()
        col_pos = len(feature_vectors.columns)
        for a in attrs_after:
            feature_vectors.insert(col_pos, a, candset[a])
            col_pos += 1

    # # Update the catalog
    cm.init_properties(feature_vectors)
    cm.copy_properties(candset, feature_vectors)

    # Finally, return the feature vectors
    return feature_vectors
def sample_table(table, sample_size, replace=False, verbose=False):
    """
    Samples a candidate set of tuple pairs (for labeling purposes).

    This function samples a DataFrame, typically used for labeling
    purposes. This function expects the input DataFrame containing the
    metadata of a candidate set (such as key, fk_ltable, fk_rtable, ltable,
    rtable). Specifically, this function creates a copy of the input
    DataFrame, samples the data using uniform random sampling (uses 'random'
    function from numpy to sample) and returns the sampled DataFrame.
    Further, also copies the properties from the input DataFrame to the output
    DataFrame.

    Args:
        table (DataFrame): The input DataFrame to be sampled.
            Specifically,
            a DataFrame containing the metadata of a candidate set (such as
            key, fk_ltable, fk_rtable, ltable, rtable) in the catalog.
        sample_size (int): The number of samples to be picked from the input
            DataFrame.
        replace (boolean): A flag to indicate whether sampling should be
            done with replacement or not (defaults to False).
        verbose (boolean): A flag to indicate whether more detailed
            information about the execution steps should be printed out
            (defaults to False).

    Returns:
        A new DataFrame with 'sample_size' number of rows.

        Further,
        this function sets the output DataFrame's properties same as input
        DataFrame.

    Raises:
        AssertionError: If `table` is not of type pandas DataFrame.
        AssertionError: If the size of `table` is 0.
        AssertionError: If the `sample_size` is greater than the input
            DataFrame size.

    Examples:
        >>> import py_entitymatching as em
        >>> S = em.sample_table(C, sample_size=450) # C is the candidate set to be sampled from.


    Note:
        As mentioned in the above description, the output DataFrame is
        updated (in the catalog) with the properties from the input
        DataFrame. A subtle point to note here is, when the replace flag is
        set to True, then the output  DataFrame can contain duplicate keys.
        In that case, this function  will not set the key and it is up to
        the user to fix it after the function returns.
    """
    # Validate input parameters.

    # # The input DataFrame is expected to be of type pandas DataFrame.
    validate_object_type(table, pd.DataFrame)

    # # There should be at least one row to sample from.
    if len(table) == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')

    # # The sample size should be less than or equal to the number of rows in
    # the input DataFrame.
    if len(table) < sample_size:
        logger.error('Sample size is larger than the input table size')
        raise AssertionError('Sample size is larger than the input table size')

    # Now, validate the metadata for the input DataFrame as we have to copy
    # these properties to the output DataFrame.

    # # First, display what metadata is required for this function.
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, ltable, rtable, ltable key, rtable key',
                verbose)

    # # Second, get the metadata.
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # # Third, validate the metadata.
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Get the sample set for the output table.  Use np directly: the pd.np
    # alias was removed in pandas 1.0.
    sample_indices = np.random.choice(len(table), sample_size,
                                      replace=replace)
    # Sort the indices ordered by index value
    sample_indices = sorted(sample_indices)
    sampled_table = table.iloc[list(sample_indices)]

    # Copy the properties
    cm.init_properties(sampled_table)

    # # If replace is set to True, the sample may contain duplicate keys, so
    # we must re-check the validity of the key before setting it.
    if replace:
        properties = cm.get_all_properties(table)
        for property_name, property_value in properties.items():
            if property_name == 'key':
                # Check for the validity of key before setting it
                cm.set_key(sampled_table, property_value)
            else:
                # Copy the other properties as is
                cm.set_property(sampled_table, property_name, property_value)
    else:
        cm.copy_properties(table, sampled_table)

    # Return the sampled table
    return sampled_table