def _is_table_or_candset(df):
    table_props = ['key']
    candset_props = ['key', 'fk_ltable', 'fk_rtable', 'ltable', 'rtable']
    properties = cm.get_all_properties(df)
    keys = list(properties)
    if len(gh.list_diff(keys, table_props)) == 0:
        return True
    elif len(gh.list_diff(keys, candset_props)) == 0:
        return True
    else:
        return False
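For orientation, here is a minimal hedged sketch of what this predicate distinguishes. It assumes the module aliases used throughout this code (`cm` for the catalog manager, `gh` for the generic helper) and uses `em.set_key`, which registers only the 'key' property for a base table; the sample DataFrame is made up.

    import pandas as pd
    import py_entitymatching as em

    # A base table: after set_key its catalog entry holds only the 'key'
    # property, so the table_props branch above matches.
    A = pd.DataFrame({'id': [1, 2], 'name': ['apple', 'apricot']})
    em.set_key(A, 'id')
    # _is_table_or_candset(A) would return True here. A candidate set
    # produced by a blocker additionally carries fk_ltable, fk_rtable,
    # ltable and rtable in its catalog entry, and would match the
    # candset_props branch instead.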
def test_select_matcher_valid_2(self):
    A = read_csv_metadata(path_a, key='id')
    B = read_csv_metadata(path_b, key='id')
    # C = read_csv_metadata(path_c, ltable=A, rtable=B, fk_ltable='ltable.id',
    #                       fk_rtable='rtable.id', key='_id')
    # labels = [0] * 7
    # labels.extend([1] * 8)
    # C['labels'] = labels
    # feature_table = get_features_for_matching(A, B)
    # feature_vectors = extract_feature_vecs(C, feature_table=feature_table, attrs_after='gold')
    # feature_vectors.fillna(0, inplace=True)
    feature_vectors = read_csv_metadata(path_f, ltable=A, rtable=B)

    dtmatcher = DTMatcher()
    nbmatcher = NBMatcher()
    rfmatcher = RFMatcher()
    svmmatcher = SVMMatcher()
    linregmatcher = LinRegMatcher()
    logregmatcher = LogRegMatcher()
    matchers = [dtmatcher, nbmatcher, rfmatcher, svmmatcher, linregmatcher,
                logregmatcher]

    col_list = list(feature_vectors.columns)
    l = list_diff(col_list, [cm.get_key(feature_vectors),
                             cm.get_fk_ltable(feature_vectors),
                             cm.get_fk_rtable(feature_vectors),
                             'gold'])
    X = feature_vectors[l]
    Y = feature_vectors['gold']

    result = select_matcher(matchers, x=X, y=Y)
    header = ['Name', 'Matcher', 'Num folds']
    result_df = result['drill_down_cv_stats']['precision']
    self.assertEqual(set(header) == set(list(result_df.columns[[0, 1, 2]])),
                     True)
    self.assertEqual('Mean score',
                     result_df.columns[len(result_df.columns) - 1])
    d = result_df.set_index('Name')
    p_max = d.loc[result['selected_matcher'].name, 'Mean score']
    a_max = pd.np.max(d['Mean score'])
    self.assertEqual(p_max, a_max)
def test_select_matcher_target_attr_not_series(self): A = read_csv_metadata(path_a, key='id') B = read_csv_metadata(path_b, key='id') # C = read_csv_metadata(path_c, ltable=A, rtable=B, fk_ltable='ltable.id', # fk_rtable='rtable.id', key='_id') # labels = [0] * 7 # labels.extend([1] * 8) # C['labels'] = labels # feature_table = get_features_for_matching(A, B) # feature_vectors = extract_feature_vecs(C, feature_table=feature_table, attrs_after='gold') # feature_vectors.fillna(0, inplace=True) feature_vectors = read_csv_metadata(path_f, ltable=A, rtable=B) dtmatcher = DTMatcher() nbmatcher = NBMatcher() rfmatcher = RFMatcher() svmmatcher = SVMMatcher() linregmatcher = LinRegMatcher() logregmatcher = LogRegMatcher() matchers = [ dtmatcher, nbmatcher, rfmatcher, svmmatcher, linregmatcher, logregmatcher ] col_list = list(feature_vectors.columns) l = list_diff(col_list, [ cm.get_fk_ltable(feature_vectors), cm.get_fk_rtable(feature_vectors), 'gold' ]) X = feature_vectors[l] Y = feature_vectors[['gold']] result = select_matcher(matchers, x=X, y=Y)
def _predict_ex_attrs(self, table, exclude_attrs, return_prob=False): """ Variant of predict method, where data is derived based on exclude attributes. """ # Validate input parameters # # We expect input table to be a pandas DataFrame. if not isinstance(table, pd.DataFrame): logger.error('Input table is not of type DataFrame') raise AssertionError('Input table is not of type DataFrame') # # We expect the exclude attributes to be a list, if not convert it # into a list. if not isinstance(exclude_attrs, list): exclude_attrs = [exclude_attrs] # Check if the input table contains the attributes to be excluded. If # not raise an error. if not ch.check_attrs_present(table, exclude_attrs): logger.error( 'The attributes mentioned in exclude_attrs is not present ' \ 'in the input table') raise AssertionError( 'The attributes mentioned in exclude_attrs is not present ' \ 'in the input table') # Get the attributes to project. attributes_to_project = gh.list_diff(list(table.columns), exclude_attrs) # Get feature vectors and the target attribute x = table[attributes_to_project] # Do the predictions and return the probabilities (if required) res = self._predict_sklearn(x, check_rem=False, return_prob=return_prob) return res
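In practice this variant is reached through the matcher's public `predict` call when a table and `exclude_attrs` are given instead of an `x` matrix. A hedged usage sketch follows; `H` and `H_test` are assumed labeled feature-vector tables, and the id/label column names are assumptions about their layout.

    dt = em.DTMatcher(name='DecisionTree')
    dt.fit(table=H,
           exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold'],
           target_attr='gold')
    # The excluded id and label columns are projected away before the
    # underlying scikit-learn predict call.
    predictions = dt.predict(table=H_test,
                             exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold'],
                             append=True,
                             target_attr='predicted',
                             inplace=False)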
def test_select_matcher_target_attr_not_present(self): A = read_csv_metadata(path_a, key='id') B = read_csv_metadata(path_b, key='id') # C = read_csv_metadata(path_c, ltable=A, rtable=B, fk_ltable='ltable.id', # fk_rtable='rtable.id', key='_id') # labels = [0] * 7 # labels.extend([1] * 8) # C['labels'] = labels # feature_table = get_features_for_matching(A, B) # feature_vectors = extract_feature_vecs(C, feature_table=feature_table, attrs_after='gold') # feature_vectors.fillna(0, inplace=True) feature_vectors = read_csv_metadata(path_f, ltable=A, rtable=B) dtmatcher = DTMatcher() nbmatcher = NBMatcher() rfmatcher = RFMatcher() svmmatcher = SVMMatcher() linregmatcher = LinRegMatcher() logregmatcher = LogRegMatcher() matchers = [dtmatcher, nbmatcher, rfmatcher, svmmatcher, linregmatcher, logregmatcher] col_list = list(feature_vectors.columns) l = list_diff(col_list, [cm.get_fk_ltable(feature_vectors), cm.get_fk_rtable(feature_vectors) ]) feature_vectors = feature_vectors[l] result = select_matcher(matchers, x=None, y=None, table=feature_vectors, exclude_attrs='_id', target_attr='labels1', k=2)
def test_select_matcher_valid_2(self):
    A = read_csv_metadata(path_a, key='id')
    B = read_csv_metadata(path_b, key='id')
    # C = read_csv_metadata(path_c, ltable=A, rtable=B, fk_ltable='ltable.id',
    #                       fk_rtable='rtable.id', key='_id')
    # labels = [0] * 7
    # labels.extend([1] * 8)
    # C['labels'] = labels
    # feature_table = get_features_for_matching(A, B)
    # feature_vectors = extract_feature_vecs(C, feature_table=feature_table, attrs_after='gold')
    # feature_vectors.fillna(0, inplace=True)
    feature_vectors = read_csv_metadata(path_f, ltable=A, rtable=B)

    dtmatcher = DTMatcher()
    nbmatcher = NBMatcher()
    rfmatcher = RFMatcher()
    svmmatcher = SVMMatcher()
    linregmatcher = LinRegMatcher()
    logregmatcher = LogRegMatcher()
    matchers = [dtmatcher, nbmatcher, rfmatcher, svmmatcher, linregmatcher,
                logregmatcher]

    col_list = list(feature_vectors.columns)
    l = list_diff(col_list, [cm.get_key(feature_vectors),
                             cm.get_fk_ltable(feature_vectors),
                             cm.get_fk_rtable(feature_vectors),
                             'gold'])
    X = feature_vectors[l]
    Y = feature_vectors['gold']

    result = select_matcher(matchers, x=X, y=Y)
    header = ['Name', 'Matcher', 'Num folds']
    result_df = result['drill_down_cv_stats']['precision']
    self.assertEqual(set(header) == set(list(result_df.columns[[0, 1, 2]])),
                     True)
    self.assertEqual('Mean score',
                     result_df.columns[len(result_df.columns) - 1])
    d = result_df.set_index('Name')
    # Use .loc here; the deprecated .ix indexer has been removed from pandas.
    p_max = d.loc[result['selected_matcher'].name, 'Mean score']
    a_max = pd.np.max(d['Mean score'])
    self.assertEqual(p_max, a_max)
def _is_table(df):
    table_props = ['key']
    properties = cm.get_all_properties(df)
    keys = list(properties)
    if len(gh.list_diff(keys, table_props)) == 0:
        return True
    else:
        return False
def drop_cols(df, col_list): if not isinstance(col_list, list): col_list = [col_list] if cm.is_dfinfo_present(df): if _is_table_or_candset(df): if not _is_table(df): key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key\ = cm.get_metadata_for_candset(df, logger, False) col_list = gh.list_diff(col_list, [key, fk_ltable, fk_rtable]) col_list = gh.list_drop_duplicates(col_list) else: key = cm.get_key(df) col_list = gh.list_diff(col_list, [key]) col_list = gh.list_drop_duplicates(col_list) new_df = df.drop(col_list, axis=1) cm.init_properties(new_df) cm.copy_properties(df, new_df) else: new_df = df[col_list] return new_df
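A hedged illustration of the metadata guard in `drop_cols`: when the input carries candidate-set metadata, the key and foreign-key columns are removed from `col_list` before the drop, so they survive even if the caller names them. `C` and its column names below are assumptions.

    # Assumed: C is a candidate set with key '_id', foreign keys
    # 'ltable_id'/'rtable_id', and a blocker output column 'ltable_name'.
    C2 = drop_cols(C, ['ltable_name', '_id'])
    # 'ltable_name' is dropped; '_id' is filtered out of col_list by
    # gh.list_diff, and the catalog metadata is copied onto C2 via
    # cm.copy_properties.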
def _fit_ex_attrs(self, table, exclude_attrs, target_attr): """ This function supports the fit method, where the DataFrame can be given as input along with what attributes must be excluded and the target attribute. """ # Validate the input parameters. # # We expect the input table to be of type pandas DataFrame. if not isinstance(table, pd.DataFrame): logger.error('Input table is not of type DataFrame') raise AssertionError('Input table is not of type DataFrame') # Convert the exclude attributes into list (if the input is not of list) if not isinstance(exclude_attrs, list): exclude_attrs = [exclude_attrs] # Check if the exclude attributes are present in the input table. If # not, raise an error. if not ch.check_attrs_present(table, exclude_attrs): logger.error( 'The attributes mentioned in exclude_attrs is not present ' \ 'in the input table') raise AssertionError( 'The attributes mentioned in exclude_attrs is not present ' \ 'in the input table') # Check if the target attribute is present in the input table. If # not, raise an error. if not ch.check_attrs_present(table, target_attr): logger.error('The target_attr is not present in the input table') raise AssertionError( 'The target_attr is not present in the input table') # We now remove duplicate attributes from the exclude_attrs exclude_attrs = gh.list_drop_duplicates(exclude_attrs) # We explicitly append target attribute to exclude attributes if target_attr not in exclude_attrs: exclude_attrs.append(target_attr) # Now, we get the attributes to project attributes_to_project = gh.list_diff(list(table.columns), exclude_attrs) # Get the predictors and the target attribute from the input table # based on the exclude attrs and the target attribute. x = table[attributes_to_project] y = table[target_attr] self._fit_sklearn(x, y, check_rem=False)
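This is the path taken by the public `fit` call when a table is supplied rather than separate x/y inputs; it mirrors the predict sketch above. A brief hedged example (the column names are assumptions):

    rf = em.RFMatcher(name='RandomForest')
    rf.fit(table=train,
           exclude_attrs=['_id', 'ltable_id', 'rtable_id'],
           target_attr='gold')
    # 'gold' is appended to exclude_attrs internally, the remaining columns
    # become x, and the 'gold' column becomes y for _fit_sklearn.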
def _get_xy_data_ex(table, exclude_attrs, target_attr):
    # Validate the input parameters
    # # We expect the input table to be of type pandas DataFrame
    if not isinstance(table, pd.DataFrame):
        logger.error('Input table is not of type DataFrame')
        raise AssertionError('Input table is not of type DataFrame')
    # We expect exclude attributes to be of type list. If not, convert it
    # into a list.
    if not isinstance(exclude_attrs, list):
        exclude_attrs = [exclude_attrs]
    # Check if the exclude attributes are present in the input table
    if not check_attrs_present(table, exclude_attrs):
        logger.error('The attributes mentioned in exclude_attrs '
                     'is not present '
                     'in the input table')
        raise AssertionError('The attributes mentioned in exclude_attrs '
                             'is not present '
                             'in the input table')
    # Check if the target attribute is present in the input table
    if not check_attrs_present(table, target_attr):
        logger.error('The target_attr is not present in the input table')
        raise AssertionError(
            'The target_attr is not present in the input table')
    # Drop the duplicates from the exclude attributes
    exclude_attrs = list_drop_duplicates(exclude_attrs)
    # Explicitly add the target attribute to the exclude attributes (if it is
    # not already present)
    if target_attr not in exclude_attrs:
        exclude_attrs.append(target_attr)
    # Project the list of attributes that should be used for scikit-learn's
    # functions.
    attrs_to_project = list_diff(list(table.columns), exclude_attrs)
    # Get the values for x
    x = table[attrs_to_project].values
    # Get the values for y
    y = table[target_attr].values
    y = y.ravel()  # to mute warnings from svm and cross validation
    # Return x and y
    return x, y
def test_ml_matcher_valid_2(self): A = read_csv_metadata(fpath_a, key='id') B = read_csv_metadata(fpath_b, key='id') feature_vectors = read_csv_metadata(fpath_f, ltable=A, rtable=B) train_test = mu.split_train_test(feature_vectors) train, test = train_test['train'], train_test['test'] dt = DTMatcher(name='DecisionTree') col_list = list(feature_vectors.columns) l = list_diff(col_list, [cm.get_key(feature_vectors), cm.get_fk_ltable(feature_vectors), cm.get_fk_rtable(feature_vectors), 'gold']) X = train[l] Y = train['gold'] dt.fit(x=X, y=Y) predictions = dt.predict(test[l]) self.assertEqual(len(predictions), len(test))
def test_ml_matcher_valid_with_id_in_y(self): A = read_csv_metadata(fpath_a, key='id') B = read_csv_metadata(fpath_b, key='id') feature_vectors = read_csv_metadata(fpath_f, ltable=A, rtable=B) train_test = mu.split_train_test(feature_vectors) train, test = train_test['train'], train_test['test'] dt = DTMatcher(name='DecisionTree') col_list = list(feature_vectors.columns) l = list_diff(col_list, [ cm.get_fk_ltable(feature_vectors), cm.get_fk_rtable(feature_vectors), 'gold' ]) X = train[l] Y = train[['_id', 'gold']] dt.fit(x=X, y=Y) predictions = dt.predict(test[l]) self.assertEqual(len(predictions), len(test))
def _get_xy_data_ex(table, exclude_attrs, target_attr):
    # Validate the input parameters
    # # We expect the input table to be of type pandas DataFrame
    validate_object_type(table, pd.DataFrame)
    # We expect exclude attributes to be of type list. If not, convert it
    # into a list.
    if not isinstance(exclude_attrs, list):
        exclude_attrs = [exclude_attrs]
    # Check if the exclude attributes are present in the input table
    if not check_attrs_present(table, exclude_attrs):
        logger.error('The attributes mentioned in exclude_attrs '
                     'is not present '
                     'in the input table')
        raise AssertionError(
            'The attributes mentioned in exclude_attrs '
            'is not present '
            'in the input table')
    # Check if the target attribute is present in the input table
    if not check_attrs_present(table, target_attr):
        logger.error('The target_attr is not present in the input table')
        raise AssertionError(
            'The target_attr is not present in the input table')
    # Drop the duplicates from the exclude attributes
    exclude_attrs = list_drop_duplicates(exclude_attrs)
    # Explicitly add the target attribute to the exclude attributes (if it is
    # not already present)
    if target_attr not in exclude_attrs:
        exclude_attrs.append(target_attr)
    # Project the list of attributes that should be used for scikit-learn's
    # functions.
    attrs_to_project = list_diff(list(table.columns), exclude_attrs)
    # Get the values for x
    x = table[attrs_to_project].values
    # Get the values for y
    y = table[target_attr].values
    y = y.ravel()  # to mute warnings from svm and cross validation
    # Return x and y
    return x, y
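A brief hedged sketch of the shapes this helper hands to scikit-learn; the table layout is an assumption.

    # Assumed: H has columns ['_id', 'ltable_id', 'rtable_id', 'f1', 'f2', 'gold'].
    x, y = _get_xy_data_ex(H,
                           exclude_attrs=['_id', 'ltable_id', 'rtable_id'],
                           target_attr='gold')
    # x is a 2-D numpy array holding the 'f1' and 'f2' values (the target is
    # excluded automatically); y is the flattened 1-D array of 'gold' labels.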
def extract_feature_vecs(candset, attrs_before=None, feature_table=None, attrs_after=None, verbose=False, show_progress=True, n_jobs=1, FeatureExtractor=ParallelFeatureExtractor): """ This function extracts feature vectors from a DataFrame (typically a labeled candidate set). Specifically, this function uses feature table, ltable and rtable (that is present in the `candset`'s metadata) to extract feature vectors. Args: candset (DataFrame): The input candidate set for which the features vectors should be extracted. attrs_before (list): The list of attributes from the input candset, that should be added before the feature vectors (defaults to None). feature_table (DataFrame): A DataFrame containing a list of features that should be used to compute the feature vectors ( defaults to None). attrs_after (list): The list of attributes from the input candset that should be added after the feature vectors (defaults to None). verbose (boolean): A flag to indicate whether the debug information should be displayed (defaults to False). show_progress (boolean): A flag to indicate whether the progress of extracting feature vectors must be displayed (defaults to True). Returns: A pandas DataFrame containing feature vectors. The DataFrame will have metadata ltable and rtable, pointing to the same ltable and rtable as the input candset. Also, the output DataFrame will have three columns: key, foreign key ltable, foreign key rtable copied from input candset to the output DataFrame. These three columns precede the columns mentioned in `attrs_before`. Raises: AssertionError: If `candset` is not of type pandas DataFrame. AssertionError: If `attrs_before` has attributes that are not present in the input candset. AssertionError: If `attrs_after` has attribtues that are not present in the input candset. AssertionError: If `feature_table` is set to None. Examples: >>> import py_entitymatching as em >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID') >>> match_f = em.get_features_for_matching(A, B) >>> # G is the labeled dataframe which should be converted into feature vectors >>> H = em.extract_feature_vecs(G, features=match_f, attrs_before=['title'], attrs_after=['gold_labels']) """ # (Matt) Stage 1: Input validation # Validate input parameters # # We expect the input candset to be of type pandas DataFrame. 
validate_object_type(candset, pd.DataFrame, error_prefix='Input cand.set') # # We expect the FeatureExtractor class to be of type BaseFeatureExtractor validate_subclass(FeatureExtractor, BaseFeatureExtractor, error_prefix='Input FeatureExtractor') # (Matt) The two blocks below are making sure that attributes that are to be appended # to this function's output do in fact exist in the input DataFrame # # If the attrs_before is given, Check if the attrs_before are present in # the input candset if attrs_before != None: if not ch.check_attrs_present(candset, attrs_before): logger.error( 'The attributes mentioned in attrs_before is not present ' 'in the input table') raise AssertionError( 'The attributes mentioned in attrs_before is not present ' 'in the input table') # # If the attrs_after is given, Check if the attrs_after are present in # the input candset if attrs_after != None: if not ch.check_attrs_present(candset, attrs_after): logger.error( 'The attributes mentioned in attrs_after is not present ' 'in the input table') raise AssertionError( 'The attributes mentioned in attrs_after is not present ' 'in the input table') # (Matt) Why not make sure that this is a DataFrame instead of just nonempty? # We expect the feature table to be a valid object if feature_table is None: logger.error('Feature table cannot be null') raise AssertionError('The feature table cannot be null') # Do metadata checking # # Mention what metadata is required to the user ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, ' 'fk rtable, ' 'ltable, rtable, ltable key, rtable key', verbose) # (Matt) ch ~ catalog helper # # Get metadata ch.log_info(logger, 'Getting metadata from catalog', verbose) # (Matt) cm ~ catalog manager key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \ cm.get_metadata_for_candset( candset, logger, verbose) # # Validate metadata ch.log_info(logger, 'Validating metadata', verbose) cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, logger, verbose) # Extract features # id_list = [(row[fk_ltable], row[fk_rtable]) for i, row in # candset.iterrows()] # id_list = [tuple(tup) for tup in candset[[fk_ltable, fk_rtable]].values] # (Matt) ParallelFeatureExtractor implementation starts here # # Apply feature functions feature_extractor = FeatureExtractor( feature_table, n_jobs=n_jobs, verbose=verbose, show_progress=show_progress ) feat_vals = feature_extractor.extract_from(candset) # (Matt) ParallelFeatureExtractor implementation ends here; the rest is formatting # Construct output table feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values) # # Rearrange the feature names in the input feature table order feature_names = list(feature_table['feature_name']) feature_vectors = feature_vectors[feature_names] ch.log_info(logger, 'Constructing output table', verbose) # print(feature_vectors) # # Insert attrs_before if attrs_before: if not isinstance(attrs_before, list): attrs_before = [attrs_before] attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable]) attrs_before.reverse() for a in attrs_before: feature_vectors.insert(0, a, candset[a]) # # Insert keys feature_vectors.insert(0, fk_rtable, candset[fk_rtable]) feature_vectors.insert(0, fk_ltable, candset[fk_ltable]) feature_vectors.insert(0, key, candset[key]) # # insert attrs after if attrs_after: if not isinstance(attrs_after, list): attrs_after = [attrs_after] attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable]) attrs_after.reverse() col_pos = 
len(feature_vectors.columns) for a in attrs_after: feature_vectors.insert(col_pos, a, candset[a]) col_pos += 1 # Reset the index # feature_vectors.reset_index(inplace=True, drop=True) # # Update the catalog cm.init_properties(feature_vectors) cm.copy_properties(candset, feature_vectors) # Finally, return the feature vectors return feature_vectors
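A hedged end-to-end sketch for the ParallelFeatureExtractor-based `extract_feature_vecs` above. Note that the keyword is `feature_table=` per the signature; the `features=` keyword in the docstring example appears to be a typo. `G` is an assumed labeled candidate set whose catalog points at A and B, and the meaning of `n_jobs=-1` (use all cores) is an assumption carried over from the joblib convention.

    import py_entitymatching as em

    A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
    B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
    match_f = em.get_features_for_matching(A, B)
    H = em.extract_feature_vecs(G,
                                feature_table=match_f,
                                attrs_after=['gold'],
                                n_jobs=-1,           # parallel extraction
                                show_progress=False)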
def extract_feature_vecs(candset, attrs_before=None, feature_table=None, attrs_after=None, verbose=False, show_progress=True, n_jobs=1): """ This function extracts feature vectors from a DataFrame (typically a labeled candidate set). Specifically, this function uses feature table, ltable and rtable (that is present in the `candset`'s metadata) to extract feature vectors. Args: candset (DataFrame): The input candidate set for which the features vectors should be extracted. attrs_before (list): The list of attributes from the input candset, that should be added before the feature vectors (defaults to None). feature_table (DataFrame): A DataFrame containing a list of features that should be used to compute the feature vectors ( defaults to None). attrs_after (list): The list of attributes from the input candset that should be added after the feature vectors (defaults to None). verbose (boolean): A flag to indicate whether the debug information should be displayed (defaults to False). show_progress (boolean): A flag to indicate whether the progress of extracting feature vectors must be displayed (defaults to True). Returns: A pandas DataFrame containing feature vectors. The DataFrame will have metadata ltable and rtable, pointing to the same ltable and rtable as the input candset. Also, the output DataFrame will have three columns: key, foreign key ltable, foreign key rtable copied from input candset to the output DataFrame. These three columns precede the columns mentioned in `attrs_before`. Raises: AssertionError: If `candset` is not of type pandas DataFrame. AssertionError: If `attrs_before` has attributes that are not present in the input candset. AssertionError: If `attrs_after` has attribtues that are not present in the input candset. AssertionError: If `feature_table` is set to None. Examples: >>> import py_entitymatching as em >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID') >>> match_f = em.get_features_for_matching(A, B) >>> # G is the labeled dataframe which should be converted into feature vectors >>> H = em.extract_feature_vecs(G, features=match_f, attrs_before=['title'], attrs_after=['gold_labels']) """ # Validate input parameters # # We expect the input candset to be of type pandas DataFrame. 
validate_object_type(candset, pd.DataFrame, error_prefix='Input cand.set') # # If the attrs_before is given, Check if the attrs_before are present in # the input candset if attrs_before != None: if not ch.check_attrs_present(candset, attrs_before): logger.error( 'The attributes mentioned in attrs_before is not present ' 'in the input table') raise AssertionError( 'The attributes mentioned in attrs_before is not present ' 'in the input table') # # If the attrs_after is given, Check if the attrs_after are present in # the input candset if attrs_after != None: if not ch.check_attrs_present(candset, attrs_after): logger.error( 'The attributes mentioned in attrs_after is not present ' 'in the input table') raise AssertionError( 'The attributes mentioned in attrs_after is not present ' 'in the input table') # We expect the feature table to be a valid object if feature_table is None: logger.error('Feature table cannot be null') raise AssertionError('The feature table cannot be null') # Do metadata checking # # Mention what metadata is required to the user ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, ' 'fk rtable, ' 'ltable, rtable, ltable key, rtable key', verbose) # # Get metadata ch.log_info(logger, 'Getting metadata from catalog', verbose) key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \ cm.get_metadata_for_candset( candset, logger, verbose) # # Validate metadata ch.log_info(logger, 'Validating metadata', verbose) cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, logger, verbose) # Extract features # id_list = [(row[fk_ltable], row[fk_rtable]) for i, row in # candset.iterrows()] # id_list = [tuple(tup) for tup in candset[[fk_ltable, fk_rtable]].values] # # Set index for convenience l_df = ltable.set_index(l_key, drop=False) r_df = rtable.set_index(r_key, drop=False) # # Apply feature functions ch.log_info(logger, 'Applying feature functions', verbose) col_names = list(candset.columns) fk_ltable_idx = col_names.index(fk_ltable) fk_rtable_idx = col_names.index(fk_rtable) n_procs = get_num_procs(n_jobs, len(candset)) c_splits = pd.np.array_split(candset, n_procs) pickled_obj = cloudpickle.dumps(feature_table) feat_vals_by_splits = Parallel(n_jobs=n_procs)(delayed(get_feature_vals_by_cand_split)(pickled_obj, fk_ltable_idx, fk_rtable_idx, l_df, r_df, c_splits[i], show_progress and i == len( c_splits) - 1) for i in range(len(c_splits))) feat_vals = sum(feat_vals_by_splits, []) # Construct output table feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values) # # Rearrange the feature names in the input feature table order feature_names = list(feature_table['feature_name']) feature_vectors = feature_vectors[feature_names] ch.log_info(logger, 'Constructing output table', verbose) # print(feature_vectors) # # Insert attrs_before if attrs_before: if not isinstance(attrs_before, list): attrs_before = [attrs_before] attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable]) attrs_before.reverse() for a in attrs_before: feature_vectors.insert(0, a, candset[a]) # # Insert keys feature_vectors.insert(0, fk_rtable, candset[fk_rtable]) feature_vectors.insert(0, fk_ltable, candset[fk_ltable]) feature_vectors.insert(0, key, candset[key]) # # insert attrs after if attrs_after: if not isinstance(attrs_after, list): attrs_after = [attrs_after] attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable]) attrs_after.reverse() col_pos = len(feature_vectors.columns) for a in attrs_after: 
feature_vectors.insert(col_pos, a, candset[a]) col_pos += 1 # Reset the index # feature_vectors.reset_index(inplace=True, drop=True) # # Update the catalog cm.init_properties(feature_vectors) cm.copy_properties(candset, feature_vectors) # Finally, return the feature vectors return feature_vectors
def extract_feature_vecs(candset, attrs_before=None, feature_table=None, attrs_after=None, verbose=False, show_progress=True): """ This function extracts feature vectors from a DataFrame (typically a labeled candidate set). Specifically, this function uses feature table, ltable and rtable (that is present in the `candset`'s metadata) to extract feature vectors. Args: candset (DataFrame): The input candidate set for which the features vectors should be extracted. attrs_before (list): The list of attributes from the input candset, that should be added before the feature vectors (defaults to None). feature_table (DataFrame): A DataFrame containing a list of features that should be used to compute the feature vectors ( defaults to None). attrs_after (list): The list of attributes from the input candset that should be added after the feature vectors (defaults to None). verbose (boolean): A flag to indicate whether the debug information should be displayed (defaults to False). show_progress (boolean): A flag to indicate whether the progress of extracting feature vectors must be displayed (defaults to True). Returns: A pandas DataFrame containing feature vectors. The DataFrame will have metadata ltable and rtable, pointing to the same ltable and rtable as the input candset. Also, the output DataFrame will have three columns: key, foreign key ltable, foreign key rtable copied from input candset to the output DataFrame. These three columns precede the columns mentioned in `attrs_before`. Raises: AssertionError: If `candset` is not of type pandas DataFrame. AssertionError: If `attrs_before` has attributes that are not present in the input candset. AssertionError: If `attrs_after` has attribtues that are not present in the input candset. AssertionError: If `feature_table` is set to None. """ # Validate input parameters # # We expect the input candset to be of type pandas DataFrame. 
if not isinstance(candset, pd.DataFrame): logger.error('Input cand.set is not of type dataframe') raise AssertionError('Input cand.set is not of type dataframe') # # If the attrs_before is given, Check if the attrs_before are present in # the input candset if attrs_before != None: if not ch.check_attrs_present(candset, attrs_before): logger.error( 'The attributes mentioned in attrs_before is not present ' 'in the input table') raise AssertionError( 'The attributes mentioned in attrs_before is not present ' 'in the input table') # # If the attrs_after is given, Check if the attrs_after are present in # the input candset if attrs_after != None: if not ch.check_attrs_present(candset, attrs_after): logger.error( 'The attributes mentioned in attrs_after is not present ' 'in the input table') raise AssertionError( 'The attributes mentioned in attrs_after is not present ' 'in the input table') # We expect the feature table to be a valid object if feature_table is None: logger.error('Feature table cannot be null') raise AssertionError('The feature table cannot be null') # Do metadata checking # # Mention what metadata is required to the user ch.log_info( logger, 'Required metadata: cand.set key, fk ltable, ' 'fk rtable, ' 'ltable, rtable, ltable key, rtable key', verbose) # # Get metadata ch.log_info(logger, 'Getting metadata from catalog', verbose) key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \ cm.get_metadata_for_candset( candset, logger, verbose) # # Validate metadata ch.log_info(logger, 'Validating metadata', verbose) cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, logger, verbose) # Extract features # id_list = [(row[fk_ltable], row[fk_rtable]) for i, row in # candset.iterrows()] # id_list = [tuple(tup) for tup in candset[[fk_ltable, fk_rtable]].values] # # Set index for convenience l_df = ltable.set_index(l_key, drop=False) r_df = rtable.set_index(r_key, drop=False) if show_progress: prog_bar = pyprind.ProgBar(len(candset)) # # Apply feature functions feat_vals = [] ch.log_info(logger, 'Applying feature functions', verbose) col_names = list(candset.columns) fk_ltable_idx = col_names.index(fk_ltable) fk_rtable_idx = col_names.index(fk_rtable) l_dict = {} r_dict = {} for row in candset.itertuples(index=False): if show_progress: prog_bar.update() fk_ltable_val = row[fk_ltable_idx] fk_rtable_val = row[fk_rtable_idx] if fk_ltable_val not in l_dict: l_dict[fk_ltable_val] = l_df.ix[fk_ltable_val] l_tuple = l_dict[fk_ltable_val] if fk_rtable_val not in r_dict: r_dict[fk_rtable_val] = r_df.ix[fk_rtable_val] r_tuple = r_dict[fk_rtable_val] f = apply_feat_fns(l_tuple, r_tuple, feature_table) feat_vals.append(f) # Construct output table feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values) # # Rearrange the feature names in the input feature table order feature_names = list(feature_table['feature_name']) feature_vectors = feature_vectors[feature_names] ch.log_info(logger, 'Constructing output table', verbose) # print(feature_vectors) # # Insert attrs_before if attrs_before: if not isinstance(attrs_before, list): attrs_before = [attrs_before] attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable]) attrs_before.reverse() for a in attrs_before: feature_vectors.insert(0, a, candset[a]) # # Insert keys feature_vectors.insert(0, fk_rtable, candset[fk_rtable]) feature_vectors.insert(0, fk_ltable, candset[fk_ltable]) feature_vectors.insert(0, key, candset[key]) # # insert attrs after if attrs_after: if not 
isinstance(attrs_after, list): attrs_after = [attrs_after] attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable]) attrs_after.reverse() col_pos = len(feature_vectors.columns) for a in attrs_after: feature_vectors.insert(col_pos, a, candset[a]) col_pos += 1 # Reset the index # feature_vectors.reset_index(inplace=True, drop=True) # # Update the catalog cm.init_properties(feature_vectors) cm.copy_properties(candset, feature_vectors) # Finally, return the feature vectors return feature_vectors
def combine_blocker_outputs_via_union(blocker_output_list, l_prefix='ltable_', r_prefix='rtable_', verbose=False): """ Combines multiple blocker outputs by doing a union of their tuple pair ids (foreign key ltable, foreign key rtable). Specifically, this function takes in a list of DataFrames (candidate sets, typically the output from blockers) and returns a consolidated DataFrame. The output DataFrame contains the union of tuple pair ids (foreign key ltable, foreign key rtable) and other attributes from the input list of DataFrames. This function makes some assumptions about the input DataFrames. First, each DataFrame is expected to contain the following metadata in the catalog: key, fk_ltable, fk_rtable, ltable, and rtable. Second, all the DataFrames must be a result of blocking from the same underlying tables. Concretely the ltable and rtable properties must refer to the same DataFrame across all the input tables. Third, all the input DataFrames must have the same fk_ltable and fk_rtable properties. Finally, in each input DataFrame, for the attributes included from the ltable or rtable, the attribute names must be prefixed with the given l_prefix and r_prefix in the function. The input DataFrames may contain different attribute lists and it demands the question of how to combine them. Currently py_entitymatching takes an union of attribute names that has prefix l_prefix or r_prefix across input tables. After taking the union, for each tuple id pair included in output, the attribute values (for union-ed attribute names) are probed from ltable/rtable and included in the output. A subtle point to note here is, if an input DataFrame has a column added by user (say label for some reason), then that column will not be present in the output. The reason is, the same column may not be present in other candidate sets so it is not clear about how to combine them. One possibility is to include label in output for all tuple id pairs, but set as NaN for the values not present. Currently py_entitymatching does not include such columns and addressing it will be part of future work. Args: blocker_output_list (list of DataFrames): The list of DataFrames that should be combined. l_prefix (string): The prefix given to the attributes from the ltable. r_prefix (string): The prefix given to the attributes from the rtable. verbose (boolean): A flag to indicate whether more detailed information about the execution steps should be printed out (default value is False). Returns: A new DataFrame with the combined tuple pairs and other attributes from all the blocker lists. Raises: AssertionError: If `l_prefix` is not of type string. AssertionError: If `r_prefix` is not of type string. AssertionError: If the length of the input DataFrame list is 0. AssertionError: If `blocker_output_list` is not a list of DataFrames. AssertionError: If the ltables are different across the input list of DataFrames. AssertionError: If the rtables are different across the input list of DataFrames. AssertionError: If the `fk_ltable` values are different across the input list of DataFrames. AssertionError: If the `fk_rtable` values are different across the input list of DataFrames. 
""" # validate input parameters # The l_prefix is expected to be of type string if not isinstance(l_prefix, six.string_types): logger.error('l_prefix is not of type string') raise AssertionError('l_prefix is not of type string') # The r_prefix is expected to be of type string if not isinstance(r_prefix, six.string_types): logger.error('r_prefix is not of type string') raise AssertionError('r_prefix is not of type string') # We cannot combine empty DataFrame list if not len(blocker_output_list) > 0: logger.error('There no DataFrames to combine') raise AssertionError('There are no DataFrames to combine') # Validate the assumptions about the input tables. # # 1) All the input object must be DataFrames # # 2) All the input DataFrames must have the metadata as that of a # candidate set # # 3) All the input DataFrames must have the same fk_ltable and fk_rtable _validate_lr_tables(blocker_output_list) # # Get the ltable and rtable. We take it from the first DataFrame as all # the DataFrames contain the same ltables and rtables ltable = cm.get_ltable(blocker_output_list[0]) rtable = cm.get_rtable(blocker_output_list[0]) # # Get the fk_ltable and fk_rtable. We take it from the first DataFrame as # all the DataFrames contain the same ltables and rtables fk_ltable = cm.get_fk_ltable(blocker_output_list[0]) fk_rtable = cm.get_fk_rtable(blocker_output_list[0]) # Retrieve the keys for the ltable and rtables. l_key = cm.get_key(ltable) r_key = cm.get_key(rtable) # Check if the fk_ltable is starting with the given prefix, if not its # not an error. Just raise a warning. if fk_ltable.startswith(l_prefix) is False: logger.warning( 'Foreign key for ltable is not starting with the given prefix (' '%s)', l_prefix) # Check if the fk_rtable is starting with the given prefix, if not its # not an error. Just raise a warning. if fk_rtable.startswith(r_prefix) is False: logger.warning( 'Foreign key for rtable is not starting with the given prefix (' '%s)', r_prefix) # Initialize lists # # keep track of projected tuple pair ids tuple_pair_ids = [] # # keep track of output attributes from the left table l_output_attrs = [] # # keep track of output attributes from the right table r_output_attrs = [] # for each DataFrame in the given list, project out tuple pair ids, get the # attributes from the ltable and rtable for data_frame in blocker_output_list: # Project out the tuple pair ids. A tuple pair id is a fk_ltable, # fk_rtable pair projected_tuple_pair_ids = data_frame[[fk_ltable, fk_rtable]] # Update the list that tracks tuple pair ids tuple_pair_ids.append(projected_tuple_pair_ids) # Get the columns, which should be segregated into the attributes # from the ltable and table col_set = (gh.list_diff(list(data_frame.columns), [fk_ltable, fk_rtable, cm.get_key(data_frame)])) # Segregate the columns as attributes from the ltable and rtable l_attrs, r_attrs = _lr_cols(col_set, l_prefix, r_prefix) # Update the l_output_attrs, r_output_attrs l_output_attrs.extend(l_attrs) # the reason we use extend because l_attrs a list r_output_attrs.extend(r_attrs) ch.log_info( logger, 'Concatenating the tuple pair ids across given ' 'blockers ...', verbose) # concatenate the tuple pair ids from the list of input DataFrames concatenated_tuple_pair_ids = pd.concat(tuple_pair_ids) ch.log_info(logger, 'Concatenating the tuple pair ids ... DONE', verbose) ch.log_info(logger, 'Deduplicating the tuple pair ids ...', verbose) # Deduplicate the DataFrame. Now the returned DataFrame will contain # unique tuple pair ids. 
# noinspection PyUnresolvedReferences deduplicated_tuple_pair_ids = concatenated_tuple_pair_ids.drop_duplicates() ch.log_info(logger, 'Deduplicating the tuple pair ids ... DONE', verbose) # Construct output table # # Get unique list of attributes across different tables l_output_attrs = gh.list_drop_duplicates(l_output_attrs) r_output_attrs = gh.list_drop_duplicates(r_output_attrs) # Reset the index that might have lingered from concatenation. deduplicated_tuple_pair_ids.reset_index(inplace=True, drop=True) # Add the output attribtues from the ltable and rtable. # NOTE: This approach may be inefficient as it probes the ltable, rtable # to get the attribute values. A better way would be to fill the # attribute values from the input list of DataFrames. This attribute values # could be harvested (at the expense of some space) while we iterate the # input blocker output list for the first time. # noinspection PyProtectedMember consolidated_data_frame = gh._add_output_attributes( deduplicated_tuple_pair_ids, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, l_output_attrs, r_output_attrs, l_prefix, r_prefix, validate=False) # Sort the DataFrame ordered by fk_ltable and fk_rtable. # The function "sort" will be depreciated in the newer versions of # pandas DataFrame, and it will replaced by 'sort_values' function. So we # will first try to use sort_values if this fails we will use sort. try: consolidated_data_frame.sort_values([fk_ltable, fk_rtable], inplace=True) except AttributeError: consolidated_data_frame.sort([fk_ltable, fk_rtable], inplace=True) # update the catalog for the consolidated DataFrame # First get a column name for the key key = ch.get_name_for_key(consolidated_data_frame.columns) # Second, add the column name as the key consolidated_data_frame = ch.add_key_column(consolidated_data_frame, key) # Third, reset the index to remove any out of order index values from # the sort. consolidated_data_frame.reset_index(inplace=True, drop=True) # Finally, set the properties for the consolidated DataFrame in the catalog cm.set_candset_properties(consolidated_data_frame, key, fk_ltable, fk_rtable, ltable, rtable) # Return the consolidated DataFrame return consolidated_data_frame
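The variant below documents a usage example in its docstring; for this version, the typical call (adapted from that documented workflow, with A and B as the original tables) looks like:

    import py_entitymatching as em

    ab = em.AttrEquivalenceBlocker()
    C = ab.block_tables(A, B, 'zipcode', 'zipcode')
    ob = em.OverlapBlocker()
    D = ob.block_tables(A, B, 'address', 'address', overlap_size=1)
    # Union the tuple pair ids of the two candidate sets; the result gets a
    # fresh key column and the same fk_ltable/fk_rtable/ltable/rtable
    # metadata as the inputs.
    E = em.combine_blocker_outputs_via_union([C, D])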
def combine_blocker_outputs_via_union( blocker_output_list, l_prefix='ltable_', r_prefix='rtable_', verbose=False): """ Combines multiple blocker outputs by doing a union of their tuple pair ids (foreign key ltable, foreign key rtable). Specifically, this function takes in a list of DataFrames (candidate sets, typically the output from blockers) and returns a consolidated DataFrame. The output DataFrame contains the union of tuple pair ids (foreign key ltable, foreign key rtable) and other attributes from the input list of DataFrames. This function makes some assumptions about the input DataFrames. First, each DataFrame is expected to contain the following metadata in the catalog: key, fk_ltable, fk_rtable, ltable, and rtable. Second, all the DataFrames must be a result of blocking from the same underlying tables. Concretely the ltable and rtable properties must refer to the same DataFrame across all the input tables. Third, all the input DataFrames must have the same fk_ltable and fk_rtable properties. Finally, in each input DataFrame, for the attributes included from the ltable or rtable, the attribute names must be prefixed with the given l_prefix and r_prefix in the function. The input DataFrames may contain different attribute lists and it demands the question of how to combine them. Currently py_entitymatching takes an union of attribute names that has prefix l_prefix or r_prefix across input tables. After taking the union, for each tuple id pair included in output, the attribute values (for union-ed attribute names) are probed from ltable/rtable and included in the output. A subtle point to note here is, if an input DataFrame has a column added by user (say label for some reason), then that column will not be present in the output. The reason is, the same column may not be present in other candidate sets so it is not clear about how to combine them. One possibility is to include label in output for all tuple id pairs, but set as NaN for the values not present. Currently py_entitymatching does not include such columns and addressing it will be part of future work. Args: blocker_output_list (list of DataFrames): The list of DataFrames that should be combined. l_prefix (string): The prefix given to the attributes from the ltable. r_prefix (string): The prefix given to the attributes from the rtable. verbose (boolean): A flag to indicate whether more detailed information about the execution steps should be printed out (default value is False). Returns: A new DataFrame with the combined tuple pairs and other attributes from all the blocker lists. Raises: AssertionError: If `l_prefix` is not of type string. AssertionError: If `r_prefix` is not of type string. AssertionError: If the length of the input DataFrame list is 0. AssertionError: If `blocker_output_list` is not a list of DataFrames. AssertionError: If the ltables are different across the input list of DataFrames. AssertionError: If the rtables are different across the input list of DataFrames. AssertionError: If the `fk_ltable` values are different across the input list of DataFrames. AssertionError: If the `fk_rtable` values are different across the input list of DataFrames. 
Examples: >>> import py_entitymatching as em >>> ab = em.AttrEquivalenceBlocker() >>> C = ab.block_tables(A, B, 'zipcode', 'zipcode') >>> ob = em.OverlapBlocker() >>> D = ob.block_candset(C, 'address', 'address') >>> block_f = em.get_features_for_blocking(A, B) >>> rb = em.RuleBasedBlocker() >>> rule = ['address_address_lev(ltuple, rtuple) > 6'] >>> rb.add_rule(rule, block_f) >>> E = rb.block_tables(A, B) >>> F = em.combine_blocker_outputs_via_union([C, E]) """ # validate input parameters # The l_prefix is expected to be of type string py_entitymatching.utils.validation_helper.validate_object_type(l_prefix, six.string_types, 'l_prefix') # The r_prefix is expected to be of type string py_entitymatching.utils.validation_helper.validate_object_type(r_prefix, six.string_types, 'r_prefix') # We cannot combine empty DataFrame list if not len(blocker_output_list) > 0: logger.error('There no DataFrames to combine') raise AssertionError('There are no DataFrames to combine') # Validate the assumptions about the input tables. # # 1) All the input object must be DataFrames # # 2) All the input DataFrames must have the metadata as that of a # candidate set # # 3) All the input DataFrames must have the same fk_ltable and fk_rtable _validate_lr_tables(blocker_output_list) # # Get the ltable and rtable. We take it from the first DataFrame as all # the DataFrames contain the same ltables and rtables ltable = cm.get_ltable(blocker_output_list[0]) rtable = cm.get_rtable(blocker_output_list[0]) # # Get the fk_ltable and fk_rtable. We take it from the first DataFrame as # all the DataFrames contain the same ltables and rtables fk_ltable = cm.get_fk_ltable(blocker_output_list[0]) fk_rtable = cm.get_fk_rtable(blocker_output_list[0]) # Retrieve the keys for the ltable and rtables. l_key = cm.get_key(ltable) r_key = cm.get_key(rtable) # Check if the fk_ltable is starting with the given prefix, if not its # not an error. Just raise a warning. if fk_ltable.startswith(l_prefix) is False: logger.warning( 'Foreign key for ltable is not starting with the given prefix (' '%s)', l_prefix) # Check if the fk_rtable is starting with the given prefix, if not its # not an error. Just raise a warning. if fk_rtable.startswith(r_prefix) is False: logger.warning( 'Foreign key for rtable is not starting with the given prefix (' '%s)', r_prefix) # Initialize lists # # keep track of projected tuple pair ids tuple_pair_ids = [] # # keep track of output attributes from the left table l_output_attrs = [] # # keep track of output attributes from the right table r_output_attrs = [] # for each DataFrame in the given list, project out tuple pair ids, get the # attributes from the ltable and rtable for data_frame in blocker_output_list: # Project out the tuple pair ids. 
A tuple pair id is a fk_ltable, # fk_rtable pair projected_tuple_pair_ids = data_frame[[fk_ltable, fk_rtable]] # Update the list that tracks tuple pair ids tuple_pair_ids.append(projected_tuple_pair_ids) # Get the columns, which should be segregated into the attributes # from the ltable and table col_set = ( gh.list_diff(list(data_frame.columns), [fk_ltable, fk_rtable, cm.get_key(data_frame)])) # Segregate the columns as attributes from the ltable and rtable l_attrs, r_attrs = _lr_cols(col_set, l_prefix, r_prefix) # Update the l_output_attrs, r_output_attrs l_output_attrs.extend(l_attrs) # the reason we use extend because l_attrs a list r_output_attrs.extend(r_attrs) ch.log_info(logger, 'Concatenating the tuple pair ids across given ' 'blockers ...', verbose) # concatenate the tuple pair ids from the list of input DataFrames concatenated_tuple_pair_ids = pd.concat(tuple_pair_ids) ch.log_info(logger, 'Concatenating the tuple pair ids ... DONE', verbose) ch.log_info(logger, 'Deduplicating the tuple pair ids ...', verbose) # Deduplicate the DataFrame. Now the returned DataFrame will contain # unique tuple pair ids. # noinspection PyUnresolvedReferences deduplicated_tuple_pair_ids = concatenated_tuple_pair_ids.drop_duplicates() ch.log_info(logger, 'Deduplicating the tuple pair ids ... DONE', verbose) # Construct output table # # Get unique list of attributes across different tables l_output_attrs = gh.list_drop_duplicates(l_output_attrs) r_output_attrs = gh.list_drop_duplicates(r_output_attrs) # Reset the index that might have lingered from concatenation. deduplicated_tuple_pair_ids.reset_index(inplace=True, drop=True) # Add the output attribtues from the ltable and rtable. # NOTE: This approach may be inefficient as it probes the ltable, rtable # to get the attribute values. A better way would be to fill the # attribute values from the input list of DataFrames. This attribute values # could be harvested (at the expense of some space) while we iterate the # input blocker output list for the first time. # noinspection PyProtectedMember consolidated_data_frame = gh._add_output_attributes( deduplicated_tuple_pair_ids, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, l_output_attrs, r_output_attrs, l_prefix, r_prefix, validate=False) # Sort the DataFrame ordered by fk_ltable and fk_rtable. # The function "sort" will be depreciated in the newer versions of # pandas DataFrame, and it will replaced by 'sort_values' function. So we # will first try to use sort_values if this fails we will use sort. try: consolidated_data_frame.sort_values([fk_ltable, fk_rtable], inplace=True) except AttributeError: consolidated_data_frame.sort([fk_ltable, fk_rtable], inplace=True) # update the catalog for the consolidated DataFrame # First get a column name for the key key = ch.get_name_for_key(consolidated_data_frame.columns) # Second, add the column name as the key consolidated_data_frame = ch.add_key_column(consolidated_data_frame, key) # Third, reset the index to remove any out of order index values from # the sort. consolidated_data_frame.reset_index(inplace=True, drop=True) # Finally, set the properties for the consolidated DataFrame in the catalog cm.set_candset_properties(consolidated_data_frame, key, fk_ltable, fk_rtable, ltable, rtable) # Return the consolidated DataFrame return consolidated_data_frame
def dask_extract_feature_vecs(candset, attrs_before=None, feature_table=None, attrs_after=None, verbose=False, show_progress=True, n_chunks=1): """ WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK This function extracts feature vectors from a DataFrame (typically a labeled candidate set). Specifically, this function uses feature table, ltable and rtable (that is present in the `candset`'s metadata) to extract feature vectors. Args: candset (DataFrame): The input candidate set for which the features vectors should be extracted. attrs_before (list): The list of attributes from the input candset, that should be added before the feature vectors (defaults to None). feature_table (DataFrame): A DataFrame containing a list of features that should be used to compute the feature vectors ( defaults to None). attrs_after (list): The list of attributes from the input candset that should be added after the feature vectors (defaults to None). verbose (boolean): A flag to indicate whether the debug information should be displayed (defaults to False). show_progress (boolean): A flag to indicate whether the progress of extracting feature vectors must be displayed (defaults to True). n_chunks (int): The number of partitions to split the candidate set. If it is set to -1, the number of partitions will be set to the number of cores in the machine. Returns: A pandas DataFrame containing feature vectors. The DataFrame will have metadata ltable and rtable, pointing to the same ltable and rtable as the input candset. Also, the output DataFrame will have three columns: key, foreign key ltable, foreign key rtable copied from input candset to the output DataFrame. These three columns precede the columns mentioned in `attrs_before`. Raises: AssertionError: If `candset` is not of type pandas DataFrame. AssertionError: If `attrs_before` has attributes that are not present in the input candset. AssertionError: If `attrs_after` has attribtues that are not present in the input candset. AssertionError: If `feature_table` is set to None. AssertionError: If `n_chunks` is not of type int. Examples: >>> import py_entitymatching as em >>> from py_entitymatching.dask.dask_extract_features import dask_extract_feature_vecs >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID') >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID') >>> match_f = em.get_features_for_matching(A, B) >>> # G is the labeled dataframe which should be converted into feature vectors >>> H = dask_extract_feature_vecs(G, features=match_f, attrs_before=['title'], attrs_after=['gold_labels']) """ logger.warning( "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK." ) # Validate input parameters # # We expect the input candset to be of type pandas DataFrame. 
validate_object_type(candset, pd.DataFrame, error_prefix='Input cand.set') # # If the attrs_before is given, Check if the attrs_before are present in # the input candset if attrs_before != None: if not ch.check_attrs_present(candset, attrs_before): logger.error( 'The attributes mentioned in attrs_before is not present ' 'in the input table') raise AssertionError( 'The attributes mentioned in attrs_before is not present ' 'in the input table') # # If the attrs_after is given, Check if the attrs_after are present in # the input candset if attrs_after != None: if not ch.check_attrs_present(candset, attrs_after): logger.error( 'The attributes mentioned in attrs_after is not present ' 'in the input table') raise AssertionError( 'The attributes mentioned in attrs_after is not present ' 'in the input table') # We expect the feature table to be a valid object if feature_table is None: logger.error('Feature table cannot be null') raise AssertionError('The feature table cannot be null') # Do metadata checking # # Mention what metadata is required to the user ch.log_info( logger, 'Required metadata: cand.set key, fk ltable, ' 'fk rtable, ' 'ltable, rtable, ltable key, rtable key', verbose) # # Get metadata ch.log_info(logger, 'Getting metadata from catalog', verbose) key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \ cm.get_metadata_for_candset( candset, logger, verbose) # # Validate metadata ch.log_info(logger, 'Validating metadata', verbose) cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key, logger, verbose) # Extract features # id_list = [(row[fk_ltable], row[fk_rtable]) for i, row in # candset.iterrows()] # id_list = [tuple(tup) for tup in candset[[fk_ltable, fk_rtable]].values] # # Set index for convenience l_df = ltable.set_index(l_key, drop=False) r_df = rtable.set_index(r_key, drop=False) # # Apply feature functions ch.log_info(logger, 'Applying feature functions', verbose) col_names = list(candset.columns) fk_ltable_idx = col_names.index(fk_ltable) fk_rtable_idx = col_names.index(fk_rtable) validate_object_type(n_chunks, int, 'Parameter n_chunks') validate_chunks(n_chunks) n_chunks = get_num_partitions(n_chunks, len(candset)) c_splits = np.array_split(candset, n_chunks) pickled_obj = cloudpickle.dumps(feature_table) feat_vals_by_splits = [] for i in range(len(c_splits)): partial_result = delayed(get_feature_vals_by_cand_split)( pickled_obj, fk_ltable_idx, fk_rtable_idx, l_df, r_df, c_splits[i], False) feat_vals_by_splits.append(partial_result) feat_vals_by_splits = delayed(wrap)(feat_vals_by_splits) if show_progress: with ProgressBar(): feat_vals_by_splits = feat_vals_by_splits.compute( scheduler="processes", num_workers=get_num_cores()) else: feat_vals_by_splits = feat_vals_by_splits.compute( scheduler="processes", num_workers=get_num_cores()) feat_vals = sum(feat_vals_by_splits, []) # Construct output table feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values) # # Rearrange the feature names in the input feature table order feature_names = list(feature_table['feature_name']) feature_vectors = feature_vectors[feature_names] ch.log_info(logger, 'Constructing output table', verbose) # print(feature_vectors) # # Insert attrs_before if attrs_before: if not isinstance(attrs_before, list): attrs_before = [attrs_before] attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable]) attrs_before.reverse() for a in attrs_before: feature_vectors.insert(0, a, candset[a]) # # Insert keys feature_vectors.insert(0, 
fk_rtable, candset[fk_rtable]) feature_vectors.insert(0, fk_ltable, candset[fk_ltable]) feature_vectors.insert(0, key, candset[key]) # # insert attrs after if attrs_after: if not isinstance(attrs_after, list): attrs_after = [attrs_after] attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable]) attrs_after.reverse() col_pos = len(feature_vectors.columns) for a in attrs_after: feature_vectors.insert(col_pos, a, candset[a]) col_pos += 1 # Reset the index # feature_vectors.reset_index(inplace=True, drop=True) # # Update the catalog cm.init_properties(feature_vectors) cm.copy_properties(candset, feature_vectors) # Finally, return the feature vectors return feature_vectors
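A hedged sketch of the experimental Dask variant in use; it mirrors the non-Dask call, with `n_chunks` controlling how the candidate set is partitioned (-1 means one partition per core, per the docstring). `G` is an assumed labeled candidate set over A and B.

    import py_entitymatching as em
    from py_entitymatching.dask.dask_extract_features import dask_extract_feature_vecs

    A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
    B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
    match_f = em.get_features_for_matching(A, B)
    H = dask_extract_feature_vecs(G,
                                  feature_table=match_f,
                                  attrs_after=['gold'],
                                  n_chunks=-1,
                                  show_progress=False)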