def add_output_attributes(candset, l_output_attrs=None, r_output_attrs=None,
                          l_output_prefix='ltable_', r_output_prefix='rtable_',
                          validate=True, copy_props=True,
                          delete_from_catalog=True, verbose=False):
    """
    Build a new DataFrame from `candset` by delegating to
    `_add_output_attributes`, keeping the index of the input candset.

    Args:
        candset (DataFrame): The input candidate set. Its metadata is read
            from the catalog.
        l_output_attrs (list): Forwarded to `_add_output_attributes`
            (defaults to None).
        r_output_attrs (list): Forwarded to `_add_output_attributes`
            (defaults to None).
        l_output_prefix (string): Forwarded to `_add_output_attributes`
            (defaults to 'ltable_').
        r_output_prefix (string): Forwarded to `_add_output_attributes`
            (defaults to 'rtable_').
        validate (boolean): Whether the candset metadata should be validated
            before it is used (defaults to True).
        copy_props (boolean): Whether the catalog properties of `candset`
            should be copied to the returned DataFrame (defaults to True).
        delete_from_catalog (boolean): Whether the properties of `candset`
            should be removed from the catalog once they were copied. Only
            consulted when `copy_props` is True (defaults to True).
        verbose (boolean): Forwarded to the catalog helpers
            (defaults to False).

    Returns:
        The DataFrame produced by `_add_output_attributes`, re-indexed with
        the index of `candset`.

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
    """
    if not isinstance(candset, pd.DataFrame):
        type_error_msg = 'Input object is not of type pandas data frame'
        logger.error(type_error_msg)
        raise AssertionError(type_error_msg)

    # Fetch the candset metadata and (optionally) validate it.
    metadata = cm.get_metadata_for_candset(candset, logger, verbose)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = metadata
    if validate:
        cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                          ltable, rtable, l_key, r_key,
                                          logger, verbose)

    # Remember the input index so it can be restored on the result.
    original_index = candset.index
    result = _add_output_attributes(candset, fk_ltable, fk_rtable,
                                    ltable, rtable, l_key, r_key,
                                    l_output_attrs, r_output_attrs,
                                    l_output_prefix, r_output_prefix,
                                    validate=False)
    result.set_index(original_index, inplace=True)

    if not copy_props:
        return result

    # Register the result in the catalog with the properties of the input.
    cm.init_properties(result)
    cm.copy_properties(candset, result)
    if delete_from_catalog:
        cm.del_all_properties(candset)
    return result
def _post_process_labelled_table(input_table, labeled_table, col_name):
    """
    This function post processes the labeled table and updates the catalog.

    Specifically, this function casts the label column to int, validates
    that it contains only 0s and 1s, and finally copies the properties from
    the input table to the labeled table.

    Args:
        input_table (DataFrame): The table whose catalog properties are
            copied.
        labeled_table (DataFrame): The table holding the labels. Its label
            column is cast to int in place.
        col_name (string): The name of the label column in `labeled_table`.

    Returns:
        The labeled table (the same object that was passed in).

    Raises:
        AssertionError: If the label column contains values other than
            0 and 1.
    """
    # Cast the label values to int as initially they will be strings when it
    # comes from the GUI
    labeled_table[col_name] = labeled_table[col_name].astype(int)

    # Check, in one vectorised pass, that every label is either 0 or 1.
    # An empty table trivially satisfies the check.
    only_binary_labels = labeled_table[col_name].isin([0, 1]).all()

    # If the column contains values other than 0 and 1, raise an error
    if not only_binary_labels:
        logger.error('The label column contains values other than 0 and 1')
        raise AssertionError(
            'The label column contains values other than 0 and 1')

    # Copy the properties from the input table to label table.
    # Note: Here we dont have to check for the integrity of 'key' because the
    # key column is not tampered from the input table.
    cm.init_properties(labeled_table)
    cm.copy_properties(input_table, labeled_table)

    # Return the label table
    return labeled_table
def rename_col(df, old_col_name, new_col_name):
    """
    Rename a column and carry the catalog metadata over to the result.

    Args:
        df (DataFrame): The input DataFrame.
        old_col_name (string): The current name of the column.
        new_col_name (string): The new name of the column.

    Returns:
        A new DataFrame with the column renamed. If `df` has an entry in the
        catalog, its properties are copied to the new DataFrame, and a key or
        foreign-key property that pointed at `old_col_name` is updated to
        `new_col_name`.
    """
    renamed = df.rename(columns={old_col_name: new_col_name})

    # Nothing to carry over when the input is unknown to the catalog.
    if not cm.is_dfinfo_present(df):
        return renamed

    cm.init_properties(renamed)
    cm.copy_properties(df, renamed)

    if not _is_table_or_candset(df):
        return renamed

    if _is_table(df):
        # Plain table: only the key can refer to the renamed column.
        if cm.get_key(df) == old_col_name:
            cm.set_key(renamed, new_col_name)
        return renamed

    # Candidate set: the key or one of the foreign keys may refer to it.
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(df, logger, False)
    if key == old_col_name:
        cm.set_key(renamed, new_col_name)
    elif fk_ltable == old_col_name:
        cm.set_fk_ltable(renamed, new_col_name)
    elif fk_rtable == old_col_name:
        cm.set_fk_rtable(renamed, new_col_name)
    return renamed
def test_set_properties_valid_1(self):
    """Properties read from table A can be set on table B and read back."""
    table_a = read_csv_metadata(path_a)
    expected_props = cm.get_all_properties(table_a)

    table_b = pd.read_csv(path_b)
    cm.init_properties(table_b)
    cm.set_properties(table_b, expected_props)

    props_match = cm.get_all_properties(table_b) == expected_props
    self.assertEqual(props_match, True)
def get_false_negatives_as_df(table, eval_summary, verbose=False):
    """
    Select only the false negatives from the input table and return them as
    a DataFrame, based on the evaluation results.

    Args:
        table (DataFrame): The input table (pandas DataFrame) that was used
            for evaluation.
        eval_summary (dictionary): A Python dictionary containing evaluation
            results, typically from 'eval_matches' command. Its
            'false_neg_ls' entry is used.
        verbose (boolean): A flag to indicate whether the progress messages
            should be logged (defaults to False).

    Returns:
        A pandas DataFrame containing only the false negatives from the
        input table. Further, this function sets the output DataFrame's
        properties same as input DataFrame.

    Raises:
        AssertionError: If `table` is not of type pandas DataFrame.
    """
    # The input must be a pandas DataFrame.
    if not isinstance(table, pd.DataFrame):
        type_error_msg = 'Input cand.set is not of type dataframe'
        logger.error(type_error_msg)
        raise AssertionError(type_error_msg)

    # Tell the user which metadata is needed.
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, '
                        'ltable, rtable, ltable key, rtable key', verbose)

    # Read the metadata from the catalog ...
    ch.log_info(logger, 'Getting metadata from the catalog', verbose)
    metadata = cm.get_metadata_for_candset(table, logger, verbose)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = metadata

    # ... and make sure it is consistent.
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    false_negatives = eval_summary['false_neg_ls']
    result = _get_dataframe(table, false_negatives)

    # Register the result in the catalog with the properties of the input.
    ch.log_info(logger, 'Updating catalog', verbose)
    cm.init_properties(result)
    cm.copy_properties(table, result)

    # Hand the result back to the caller.
    ch.log_info(logger, 'Returning the dataframe', verbose)
    return result
def filter_rows(df, condn):
    """
    Filter the rows of `df` with a `DataFrame.query` expression.

    Args:
        df (DataFrame): The input DataFrame.
        condn (string): The expression handed to `df.query`.

    Returns:
        The filtered DataFrame. If `df` has an entry in the catalog and is a
        table or a candidate set, its properties are copied to the result.
    """
    filtered = df.query(condn)

    # Carry the metadata over only for catalogued tables / candidate sets.
    if cm.is_dfinfo_present(df) and _is_table_or_candset(df):
        cm.init_properties(filtered)
        cm.copy_properties(df, filtered)
    return filtered
def mutate_col(df, **kwargs):
    """
    Add or replace columns via `DataFrame.assign`.

    Args:
        df (DataFrame): The input DataFrame.
        **kwargs: Column definitions, forwarded unchanged to `df.assign`.

    Returns:
        The DataFrame returned by `df.assign`. If `df` has an entry in the
        catalog, its properties are copied to the result as they are.
    """
    mutated = df.assign(**kwargs)

    # Without a catalog entry there is no metadata to carry over.
    if not cm.is_dfinfo_present(df):
        return mutated

    cm.init_properties(mutated)
    cm.copy_properties(df, mutated)
    return mutated
def drop_cols(df, col_list):
    """
    Drop columns from a DataFrame while protecting the catalog metadata.

    Args:
        df (DataFrame): The input DataFrame.
        col_list (list or single column name): The column(s) to drop. A
            single value is wrapped into a list.

    Returns:
        A new DataFrame without the requested columns. If `df` is a
        catalogued table, its key column is never dropped; if it is a
        catalogued candidate set, its key and both foreign-key columns are
        never dropped. When `df` has an entry in the catalog, its properties
        are copied to the result.
    """
    if not isinstance(col_list, list):
        col_list = [col_list]

    has_metadata = cm.is_dfinfo_present(df)
    if has_metadata and _is_table_or_candset(df):
        # Columns that the metadata refers to must survive the drop.
        if _is_table(df):
            protected = [cm.get_key(df)]
        else:
            key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key \
                = cm.get_metadata_for_candset(df, logger, False)
            protected = [key, fk_ltable, fk_rtable]
        col_list = gh.list_diff(col_list, protected)
        col_list = gh.list_drop_duplicates(col_list)

    # Drop the columns in every case. Previously a DataFrame without a
    # catalog entry was projected onto `col_list` instead, i.e. exactly the
    # columns that should have been removed were the ones returned.
    new_df = df.drop(col_list, axis=1)

    if has_metadata:
        cm.init_properties(new_df)
        cm.copy_properties(df, new_df)
    return new_df
def preserve_metadata(df, new_df):
    """
    Copy the catalog properties of `df` to `new_df` when that is possible.

    Args:
        df (DataFrame): The DataFrame whose properties should be copied.
        new_df (DataFrame): The DataFrame that should receive them.

    Returns:
        `new_df`. The properties are not copied when `df` has no entry in
        the catalog, or when `df` is a table / candidate set and `new_df`
        lacks the key (and, for a candidate set, foreign-key) columns; in
        the latter case a warning is logged.
    """
    if not cm.is_dfinfo_present(df):
        return new_df

    if _is_table_or_candset(df):
        # Work out which columns the metadata relies on.
        if _is_table(df):
            required_attrs = [cm.get_key(df)]
        else:
            key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key \
                = cm.get_metadata_for_candset(df, logger, False)
            required_attrs = [key, fk_ltable, fk_rtable]

        if not ch.check_attrs_present(new_df, required_attrs):
            logger.warning('Not setting the metadata as some attrs '
                           'are not present')
            return new_df

    cm.init_properties(new_df)
    cm.copy_properties(df, new_df)
    return new_df
def project_cols(df, col_list):
    """
    Project a DataFrame onto a list of columns while keeping the columns
    that the catalog metadata refers to.

    Args:
        df (DataFrame): The input DataFrame.
        col_list (list or single column name): The column(s) to keep. A
            single value is wrapped into a list.

    Returns:
        The projected DataFrame. If `df` is a catalogued table, its key
        column is placed first; if it is a catalogued candidate set, its key
        and both foreign-key columns are placed first. When `df` has an entry
        in the catalog, its properties are copied to the result.
    """
    if not isinstance(col_list, list):
        col_list = [col_list]

    # No catalog entry: a plain projection is all there is to do.
    if not cm.is_dfinfo_present(df):
        return df[col_list]

    if _is_table_or_candset(df):
        if _is_table(df):
            mandatory = [cm.get_key(df)]
        else:
            key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key \
                = cm.get_metadata_for_candset(df, logger, False)
            mandatory = [key, fk_ltable, fk_rtable]
        # Metadata columns lead, followed by the requested ones.
        col_list = gh.list_drop_duplicates(mandatory + col_list)

    projected = df[col_list]
    cm.init_properties(projected)
    cm.copy_properties(df, projected)
    return projected
def extract_feature_vecs(candset, attrs_before=None, feature_table=None,
                         attrs_after=None, verbose=False,
                         show_progress=True, n_jobs=1):
    """
    This function extracts feature vectors from a DataFrame (typically a
    labeled candidate set).

    Specifically, this function uses feature table, ltable and rtable (that
    is present in the `candset`'s metadata) to extract feature vectors.

    Args:
        candset (DataFrame): The input candidate set for which the features
            vectors should be extracted.
        attrs_before (list): The list of attributes from the input candset,
            that should be added before the feature vectors (defaults to
            None).
        feature_table (DataFrame): A DataFrame containing a list of
            features that should be used to compute the feature vectors (
            defaults to None).
        attrs_after (list): The list of attributes from the input candset
            that should be added after the feature vectors (defaults to
            None).
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
        show_progress (boolean): A flag to indicate whether the progress of
            extracting feature vectors must be displayed (defaults to True).
        n_jobs (int): The requested degree of parallelism. It is passed,
            together with the number of rows in `candset`, to
            `get_num_procs` to decide how many splits (and parallel jobs)
            are used (defaults to 1).

    Returns:
        A pandas DataFrame containing feature vectors.

        The DataFrame will have metadata ltable and rtable, pointing
        to the same ltable and rtable as the input candset.

        Also, the output
        DataFrame will have three columns: key, foreign key ltable, foreign
        key rtable copied from input candset to the output DataFrame. These
        three columns precede the columns mentioned in `attrs_before`.

    Raises:
        AssertionError: If `candset` is not of type pandas
            DataFrame.
        AssertionError: If `attrs_before` has attributes that
            are not present in the input candset.
        AssertionError: If `attrs_after` has attributes that
            are not present in the input candset.
        AssertionError: If `feature_table` is set to None.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_f = em.get_features_for_matching(A, B)
        >>> # G is the labeled dataframe which should be converted into feature vectors
        >>> H = em.extract_feature_vecs(G, feature_table=match_f, attrs_before=['title'], attrs_after=['gold_labels'])
    """
    # Imported locally: the `pd.np` alias is not available in current pandas.
    import numpy as np

    # Validate input parameters

    # # We expect the input candset to be of type pandas DataFrame.
    validate_object_type(candset, pd.DataFrame, error_prefix='Input cand.set')

    # # If the attrs_before is given, check if the attrs_before are present
    # in the input candset. `is not None` keeps the test a plain identity
    # check even if an array-like is passed.
    if attrs_before is not None:
        if not ch.check_attrs_present(candset, attrs_before):
            logger.error(
                'The attributes mentioned in attrs_before is not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_before is not present '
                'in the input table')

    # # If the attrs_after is given, check if the attrs_after are present in
    # the input candset
    if attrs_after is not None:
        if not ch.check_attrs_present(candset, attrs_after):
            logger.error(
                'The attributes mentioned in attrs_after is not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_after is not present '
                'in the input table')

    # We expect the feature table to be a valid object
    if feature_table is None:
        logger.error('Feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    # Do metadata checking
    # # Mention what metadata is required to the user
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, '
                        'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    ch.log_info(logger, 'Getting metadata from catalog', verbose)

    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(
            candset, logger, verbose)

    # # Validate metadata
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Extract features

    # # Set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # Apply feature functions
    ch.log_info(logger, 'Applying feature functions', verbose)
    col_names = list(candset.columns)
    fk_ltable_idx = col_names.index(fk_ltable)
    fk_rtable_idx = col_names.index(fk_rtable)

    n_procs = get_num_procs(n_jobs, len(candset))

    c_splits = np.array_split(candset, n_procs)

    # The feature table is serialised once and shipped to every job.
    pickled_obj = cloudpickle.dumps(feature_table)

    # Only the last split is asked to report progress.
    last_split = len(c_splits) - 1
    feat_vals_by_splits = Parallel(n_jobs=n_procs)(
        delayed(get_feature_vals_by_cand_split)(
            pickled_obj, fk_ltable_idx, fk_rtable_idx, l_df, r_df,
            c_split, show_progress and i == last_split)
        for i, c_split in enumerate(c_splits))

    # Flatten the per-split results into one list.
    feat_vals = sum(feat_vals_by_splits, [])

    # Construct output table
    feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values)
    # # Rearrange the feature names in the input feature table order
    feature_names = list(feature_table['feature_name'])
    feature_vectors = feature_vectors[feature_names]

    ch.log_info(logger, 'Constructing output table', verbose)

    # # Insert attrs_before
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        # Reversed because each attribute is inserted at position 0.
        attrs_before.reverse()
        for a in attrs_before:
            feature_vectors.insert(0, a, candset[a])

    # # Insert keys
    feature_vectors.insert(0, fk_rtable, candset[fk_rtable])
    feature_vectors.insert(0, fk_ltable, candset[fk_ltable])
    feature_vectors.insert(0, key, candset[key])

    # # insert attrs after
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        # NOTE(review): the list is reversed and then appended left to
        # right, so these columns end up in the reverse of the given order
        # -- confirm whether this is intended before changing it.
        attrs_after.reverse()
        col_pos = len(feature_vectors.columns)
        for a in attrs_after:
            feature_vectors.insert(col_pos, a, candset[a])
            col_pos += 1

    # # Update the catalog
    cm.init_properties(feature_vectors)
    cm.copy_properties(candset, feature_vectors)

    # Finally, return the feature vectors
    return feature_vectors
def extract_feature_vecs(candset, attrs_before=None, feature_table=None,
                         attrs_after=None, verbose=False,
                         show_progress=True):
    """
    This function extracts feature vectors from a DataFrame (typically a
    labeled candidate set).

    Specifically, this function uses feature table, ltable and rtable (that
    is present in the `candset`'s metadata) to extract feature vectors.

    Args:
        candset (DataFrame): The input candidate set for which the features
            vectors should be extracted.
        attrs_before (list): The list of attributes from the input candset,
            that should be added before the feature vectors (defaults to
            None).
        feature_table (DataFrame): A DataFrame containing a list of
            features that should be used to compute the feature vectors (
            defaults to None).
        attrs_after (list): The list of attributes from the input candset
            that should be added after the feature vectors (defaults to
            None).
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
        show_progress (boolean): A flag to indicate whether the progress of
            extracting feature vectors must be displayed (defaults to True).

    Returns:
        A pandas DataFrame containing feature vectors.

        The DataFrame will have metadata ltable and rtable, pointing
        to the same ltable and rtable as the input candset.

        Also, the output
        DataFrame will have three columns: key, foreign key ltable, foreign
        key rtable copied from input candset to the output DataFrame. These
        three columns precede the columns mentioned in `attrs_before`.

    Raises:
        AssertionError: If `candset` is not of type pandas
            DataFrame.
        AssertionError: If `attrs_before` has attributes that
            are not present in the input candset.
        AssertionError: If `attrs_after` has attributes that
            are not present in the input candset.
        AssertionError: If `feature_table` is set to None.
    """
    # Validate input parameters

    # # We expect the input candset to be of type pandas DataFrame.
    if not isinstance(candset, pd.DataFrame):
        logger.error('Input cand.set is not of type dataframe')
        raise AssertionError('Input cand.set is not of type dataframe')

    # # If the attrs_before is given, check if the attrs_before are present
    # in the input candset. `is not None` keeps the test a plain identity
    # check even if an array-like is passed.
    if attrs_before is not None:
        if not ch.check_attrs_present(candset, attrs_before):
            logger.error(
                'The attributes mentioned in attrs_before is not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_before is not present '
                'in the input table')

    # # If the attrs_after is given, check if the attrs_after are present in
    # the input candset
    if attrs_after is not None:
        if not ch.check_attrs_present(candset, attrs_after):
            logger.error(
                'The attributes mentioned in attrs_after is not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_after is not present '
                'in the input table')

    # We expect the feature table to be a valid object
    if feature_table is None:
        logger.error('Feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    # Do metadata checking
    # # Mention what metadata is required to the user
    ch.log_info(
        logger, 'Required metadata: cand.set key, fk ltable, '
                'fk rtable, '
                'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    ch.log_info(logger, 'Getting metadata from catalog', verbose)

    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(
            candset, logger, verbose)

    # # Validate metadata
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Extract features

    # # Set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    if show_progress:
        prog_bar = pyprind.ProgBar(len(candset))

    # # Apply feature functions
    feat_vals = []
    ch.log_info(logger, 'Applying feature functions', verbose)
    col_names = list(candset.columns)
    fk_ltable_idx = col_names.index(fk_ltable)
    fk_rtable_idx = col_names.index(fk_rtable)

    # Cache the looked-up tuples so every key is fetched at most once.
    l_dict = {}
    r_dict = {}

    for row in candset.itertuples(index=False):

        if show_progress:
            prog_bar.update()

        fk_ltable_val = row[fk_ltable_idx]
        fk_rtable_val = row[fk_rtable_idx]

        # Both frames are indexed by their key column, so a label lookup
        # with `.loc` replaces the removed `.ix` indexer.
        if fk_ltable_val not in l_dict:
            l_dict[fk_ltable_val] = l_df.loc[fk_ltable_val]
        l_tuple = l_dict[fk_ltable_val]

        if fk_rtable_val not in r_dict:
            r_dict[fk_rtable_val] = r_df.loc[fk_rtable_val]
        r_tuple = r_dict[fk_rtable_val]

        f = apply_feat_fns(l_tuple, r_tuple, feature_table)
        feat_vals.append(f)

    # Construct output table
    feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values)
    # # Rearrange the feature names in the input feature table order
    feature_names = list(feature_table['feature_name'])
    feature_vectors = feature_vectors[feature_names]

    ch.log_info(logger, 'Constructing output table', verbose)

    # # Insert attrs_before
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        # Reversed because each attribute is inserted at position 0.
        attrs_before.reverse()
        for a in attrs_before:
            feature_vectors.insert(0, a, candset[a])

    # # Insert keys
    feature_vectors.insert(0, fk_rtable, candset[fk_rtable])
    feature_vectors.insert(0, fk_ltable, candset[fk_ltable])
    feature_vectors.insert(0, key, candset[key])

    # # insert attrs after
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        # NOTE(review): the list is reversed and then appended left to
        # right, so these columns end up in the reverse of the given order
        # -- confirm whether this is intended before changing it.
        attrs_after.reverse()
        col_pos = len(feature_vectors.columns)
        for a in attrs_after:
            feature_vectors.insert(col_pos, a, candset[a])
            col_pos += 1

    # # Update the catalog
    cm.init_properties(feature_vectors)
    cm.copy_properties(candset, feature_vectors)

    # Finally, return the feature vectors
    return feature_vectors
def split_train_test(labeled_data, train_proportion=0.5,
                     random_state=None, verbose=True):
    """
    This function splits the input data into train and test.

    Specifically, this function is just a wrapper of scikit-learn's
    train_test_split function.

    This function also takes care of copying the metadata from the input
    table to train and test splits.

    Args:
        labeled_data (DataFrame): The input pandas DataFrame that needs to be
            split into train and test.
        train_proportion (float): A number between 0 and 1, indicating the
            proportion of tuples that should be included in the train split (
            defaults to 0.5).
        random_state (object): A number or random number object (as in
            scikit-learn).
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed.

    Returns:
        A Python dictionary containing two keys - train and test.

        The value for the key 'train' is a pandas DataFrame containing tuples
        allocated from the input table based on train_proportion.

        Similarly, the value for the key 'test' is a pandas DataFrame
        containing tuples for evaluation.

        This function sets the output DataFrames (train, test) properties
        same as the input DataFrame.

    Raises:
        AssertionError: If `labeled_data` is not of type pandas DataFrame.
        AssertionError: If `train_proportion` is not between 0 and 1.
        AssertionError: If `labeled_data` is empty.

    Examples:
        >>> import py_entitymatching as em
        >>> # G is the labeled data or the feature vectors that should be split
        >>> train_test = em.split_train_test(G, train_proportion=0.5)
        >>> train, test = train_test['train'], train_test['test']
    """
    # Imported locally: the `pd.np` alias is not available in current pandas.
    import numpy as np

    # Validate input parameters
    # # We expected labeled data to be of type pandas DataFrame
    if not isinstance(labeled_data, pd.DataFrame):
        logger.error('Input table is not of type DataFrame')
        raise AssertionError('Input table is not of type DataFrame')

    ch.log_info(
        logger, 'Required metadata: cand.set key, fk ltable, '
                'fk rtable, '
                'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(
            labeled_data, logger, verbose)

    # # Validate metadata
    cm._validate_metadata_for_candset(labeled_data, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    num_rows = len(labeled_data)

    # The checks below raise explicitly (instead of using `assert`) so that
    # they still run when Python is started with -O.

    # We expect the train proportion to be between 0 and 1.
    if not 0 <= train_proportion <= 1:
        raise AssertionError(
            " Train proportion is expected to be between 0 and 1")

    # We expect the number of rows in the table to be non-empty
    if not num_rows > 0:
        raise AssertionError('The input table is empty')

    # Explicitly get the train and test size in terms of tuples (based on the
    # given proportion)
    train_size = int(math.floor(num_rows * train_proportion))
    test_size = int(num_rows - train_size)

    # Use sk-learn to split the data
    idx_values = np.array(labeled_data.index.values)
    idx_train, idx_test = ms.train_test_split(idx_values, test_size=test_size,
                                              train_size=train_size,
                                              random_state=random_state)

    # Construct output tables. The split holds index labels of the input, so
    # a label lookup with `.loc` replaces the removed `.ix` indexer.
    label_train = labeled_data.loc[idx_train]
    label_test = labeled_data.loc[idx_test]

    # Update catalog
    cm.init_properties(label_train)
    cm.copy_properties(labeled_data, label_train)

    cm.init_properties(label_test)
    cm.copy_properties(labeled_data, label_test)

    # Return output tables
    result = OrderedDict()
    result['train'] = label_train
    result['test'] = label_test

    # Finally, return the dictionary.
    return result
def impute_table(table, exclude_attrs=None, missing_val='NaN',
                 strategy='mean', axis=0, val_all_nans=0, verbose=True):
    """
    Impute table containing missing values.

    Args:
        table (DataFrame): DataFrame which values should be imputed.
        exclude_attrs (List) : list of attribute names to be excluded from
            imputing (defaults to None).
        missing_val (string or int): The placeholder for the missing values.
            All occurrences of `missing_val` will be imputed.
            For missing values encoded as np.nan, use the string value 'NaN'
            (defaults to 'NaN').
        strategy (string): String that specifies on how to impute values.
            Valid strings: 'mean', 'median', 'most_frequent' (defaults to
            'mean').
        axis (int): axis=1 along rows, and axis=0 along columns (defaults
            to 0).
        val_all_nans (float): Value to fill in if all the values in the
            column are NaN.
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to True).

    Returns:
        Imputed DataFrame. It is a copy of `table` with the same catalog
        properties; `table` itself is not modified.

    Raises:
        AssertionError: If `table` is not of type pandas DataFrame.
        AssertionError: If `exclude_attrs` has attributes that are not
            present in the input table.

    Examples:
        >>> import py_entitymatching as em
        >>> # H is the feature vector which should be imputed. Specifically, impute the missing values
        >>> # in each column, with the mean of that column
        >>> H = em.impute_table(H, exclude_attrs=['_id', 'ltable_id', 'rtable_id'], strategy='mean')
    """
    # Imported locally: the `pd.np` alias is not available in current pandas.
    import numpy as np

    # Validate input paramaters
    # # We expect the input table to be of type pandas DataFrame
    if not isinstance(table, pd.DataFrame):
        logger.error('Input table is not of type DataFrame')
        raise AssertionError('Input table is not of type DataFrame')

    ch.log_info(
        logger, 'Required metadata: cand.set key, fk ltable, '
                'fk rtable, '
                'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(
            table, logger, verbose)

    # # Validate metadata
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    fv_columns = table.columns

    # `is None` keeps the test a plain identity check even if an array-like
    # is passed as exclude_attrs.
    if exclude_attrs is None:
        feature_names = fv_columns
    else:
        # Check if the exclude attributes are present in the input table
        if not ch.check_attrs_present(table, exclude_attrs):
            logger.error('The attributes mentioned in exclude_attrs '
                         'is not present '
                         'in the input table')
            raise AssertionError('The attributes mentioned in exclude_attrs '
                                 'is not present '
                                 'in the input table')

        # We expect exclude attributes to be of type list. If not convert it
        # into a list.
        if not isinstance(exclude_attrs, list):
            exclude_attrs = [exclude_attrs]

        # Drop the duplicates from the exclude attributes
        exclude_attrs = gh.list_drop_duplicates(exclude_attrs)

        # Boolean mask selecting every column that is not excluded.
        cols = [c not in exclude_attrs for c in fv_columns]
        feature_names = fv_columns[cols]

    # Work on a copy so that the caller's table stays untouched.
    table_copy = table.copy()
    projected_table = table_copy[feature_names]

    projected_table_values = projected_table.values

    # NOTE(review): `Imputer(..., axis=...)` looks like the legacy
    # scikit-learn imputer API -- confirm against the pinned version.
    imp = Imputer(missing_values=missing_val, strategy=strategy, axis=axis)
    imp.fit(projected_table_values)
    # A statistic that is NaN after fitting is replaced by `val_all_nans`.
    imp.statistics_[np.isnan(imp.statistics_)] = val_all_nans
    projected_table_values = imp.transform(projected_table_values)
    table_copy[feature_names] = projected_table_values

    # Update catalog
    cm.init_properties(table_copy)
    cm.copy_properties(table, table_copy)

    return table_copy
def dask_extract_feature_vecs(candset, attrs_before=None, feature_table=None,
                              attrs_after=None, verbose=False,
                              show_progress=True, n_chunks=1):
    """
    WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK

    This function extracts feature vectors from a DataFrame (typically a
    labeled candidate set).

    Specifically, this function uses feature table, ltable and rtable (that
    is present in the `candset`'s metadata) to extract feature vectors.

    Args:
        candset (DataFrame): The input candidate set for which the features
            vectors should be extracted.
        attrs_before (list): The list of attributes from the input candset,
            that should be added before the feature vectors (defaults to
            None).
        feature_table (DataFrame): A DataFrame containing a list of
            features that should be used to compute the feature vectors (
            defaults to None).
        attrs_after (list): The list of attributes from the input candset
            that should be added after the feature vectors (defaults to
            None).
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
        show_progress (boolean): A flag to indicate whether the progress of
            extracting feature vectors must be displayed (defaults to True).
        n_chunks (int): The number of partitions to split the candidate set.
            If it is set to -1, the number of partitions will be set to the
            number of cores in the machine.

    Returns:
        A pandas DataFrame containing feature vectors.

        The DataFrame will have metadata ltable and rtable, pointing
        to the same ltable and rtable as the input candset.

        Also, the output
        DataFrame will have three columns: key, foreign key ltable, foreign
        key rtable copied from input candset to the output DataFrame. These
        three columns precede the columns mentioned in `attrs_before`.

    Raises:
        AssertionError: If `candset` is not of type pandas
            DataFrame.
        AssertionError: If `attrs_before` has attributes that
            are not present in the input candset.
        AssertionError: If `attrs_after` has attributes that
            are not present in the input candset.
        AssertionError: If `feature_table` is set to None.
        AssertionError: If `n_chunks` is not of type
            int.

    Examples:
        >>> import py_entitymatching as em
        >>> from py_entitymatching.dask.dask_extract_features import dask_extract_feature_vecs
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_f = em.get_features_for_matching(A, B)
        >>> # G is the labeled dataframe which should be converted into feature vectors
        >>> H = dask_extract_feature_vecs(G, feature_table=match_f, attrs_before=['title'], attrs_after=['gold_labels'])
    """
    logger.warning(
        "WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.")

    # Validate input parameters

    # # We expect the input candset to be of type pandas DataFrame.
    validate_object_type(candset, pd.DataFrame, error_prefix='Input cand.set')

    # # If the attrs_before is given, check if the attrs_before are present
    # in the input candset. `is not None` keeps the test a plain identity
    # check even if an array-like is passed.
    if attrs_before is not None:
        if not ch.check_attrs_present(candset, attrs_before):
            logger.error(
                'The attributes mentioned in attrs_before is not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_before is not present '
                'in the input table')

    # # If the attrs_after is given, check if the attrs_after are present in
    # the input candset
    if attrs_after is not None:
        if not ch.check_attrs_present(candset, attrs_after):
            logger.error(
                'The attributes mentioned in attrs_after is not present '
                'in the input table')
            raise AssertionError(
                'The attributes mentioned in attrs_after is not present '
                'in the input table')

    # We expect the feature table to be a valid object
    if feature_table is None:
        logger.error('Feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    # Do metadata checking
    # # Mention what metadata is required to the user
    ch.log_info(
        logger, 'Required metadata: cand.set key, fk ltable, '
                'fk rtable, '
                'ltable, rtable, ltable key, rtable key', verbose)

    # # Get metadata
    ch.log_info(logger, 'Getting metadata from catalog', verbose)

    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(
            candset, logger, verbose)

    # # Validate metadata
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Extract features

    # # Set index for convenience
    l_df = ltable.set_index(l_key, drop=False)
    r_df = rtable.set_index(r_key, drop=False)

    # # Apply feature functions
    ch.log_info(logger, 'Applying feature functions', verbose)
    col_names = list(candset.columns)
    fk_ltable_idx = col_names.index(fk_ltable)
    fk_rtable_idx = col_names.index(fk_rtable)

    validate_object_type(n_chunks, int, 'Parameter n_chunks')
    validate_chunks(n_chunks)

    n_chunks = get_num_partitions(n_chunks, len(candset))

    c_splits = np.array_split(candset, n_chunks)

    # The feature table is serialised once and shared by every task.
    pickled_obj = cloudpickle.dumps(feature_table)

    # Build one delayed task per split; nothing is computed yet.
    feat_vals_by_splits = []

    for c_split in c_splits:
        partial_result = delayed(get_feature_vals_by_cand_split)(
            pickled_obj, fk_ltable_idx, fk_rtable_idx, l_df, r_df,
            c_split, False)
        feat_vals_by_splits.append(partial_result)

    feat_vals_by_splits = delayed(wrap)(feat_vals_by_splits)

    # Run the task graph, optionally under a progress bar.
    if show_progress:
        with ProgressBar():
            feat_vals_by_splits = feat_vals_by_splits.compute(
                scheduler="processes", num_workers=get_num_cores())
    else:
        feat_vals_by_splits = feat_vals_by_splits.compute(
            scheduler="processes", num_workers=get_num_cores())

    # Flatten the per-split results into one list.
    feat_vals = sum(feat_vals_by_splits, [])

    # Construct output table
    feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values)
    # # Rearrange the feature names in the input feature table order
    feature_names = list(feature_table['feature_name'])
    feature_vectors = feature_vectors[feature_names]

    ch.log_info(logger, 'Constructing output table', verbose)

    # # Insert attrs_before
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        # Reversed because each attribute is inserted at position 0.
        attrs_before.reverse()
        for a in attrs_before:
            feature_vectors.insert(0, a, candset[a])

    # # Insert keys
    feature_vectors.insert(0, fk_rtable, candset[fk_rtable])
    feature_vectors.insert(0, fk_ltable, candset[fk_ltable])
    feature_vectors.insert(0, key, candset[key])

    # # insert attrs after
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        # NOTE(review): the list is reversed and then appended left to
        # right, so these columns end up in the reverse of the given order
        # -- confirm whether this is intended before changing it.
        attrs_after.reverse()
        col_pos = len(feature_vectors.columns)
        for a in attrs_after:
            feature_vectors.insert(col_pos, a, candset[a])
            col_pos += 1

    # # Update the catalog
    cm.init_properties(feature_vectors)
    cm.copy_properties(candset, feature_vectors)

    # Finally, return the feature vectors
    return feature_vectors
def test_init_properties_invalid_df(self):
    """Call ``cm.init_properties`` with ``None`` instead of a DataFrame.

    NOTE(review): the body contains no assertion of its own; presumably the
    call is expected to raise and that expectation is declared outside this
    method (e.g. by a decorator) -- confirm.
    """
    invalid_df = None
    cm.init_properties(invalid_df)
def test_init_properties_valid(self):
    """After ``cm.init_properties``, the catalog must report the table.

    Reads the CSV at ``path_a`` into a DataFrame, initialises its properties
    and asserts that ``cm.is_dfinfo_present`` returns exactly ``True``.
    """
    table_a = pd.read_csv(path_a)
    cm.init_properties(table_a)
    dfinfo_present = cm.is_dfinfo_present(table_a)
    self.assertEqual(dfinfo_present, True)
def extract_feature_vecs(candset, attrs_before=None, feature_table=None,
                         attrs_after=None, verbose=False,
                         show_progress=True, n_jobs=1,
                         FeatureExtractor=ParallelFeatureExtractor):
    """
    Extract feature vectors from a DataFrame (typically a labeled candidate
    set).

    Specifically, this function uses the feature table together with the
    ltable and rtable recorded in the `candset`'s metadata to extract the
    feature vectors.

    Args:
        candset (DataFrame): The input candidate set for which the feature
            vectors should be extracted.
        attrs_before (list): The list of attributes from the input candset
            that should be added before the feature vectors (defaults to
            None). A single non-list value is treated as a one-element list.
        feature_table (DataFrame): A DataFrame containing a list of features
            that should be used to compute the feature vectors (defaults to
            None, which is rejected). Its 'feature_name' column determines
            the order of the feature columns in the output.
        attrs_after (list): The list of attributes from the input candset
            that should be added after the feature vectors (defaults to
            None). A single non-list value is treated as a one-element list.
        verbose (boolean): A flag to indicate whether the debug information
            should be displayed (defaults to False).
        show_progress (boolean): A flag to indicate whether the progress of
            extracting feature vectors must be displayed (defaults to True).
        n_jobs (int): Forwarded unchanged to the `FeatureExtractor`
            constructor as `n_jobs` (defaults to 1).
        FeatureExtractor (class): The extractor class to use; it must be a
            subclass of `BaseFeatureExtractor`. It is instantiated as
            ``FeatureExtractor(feature_table, n_jobs=..., verbose=...,
            show_progress=...)`` and its ``extract_from(candset)`` result
            supplies the feature values (defaults to
            ParallelFeatureExtractor).

    Returns:
        A pandas DataFrame containing feature vectors.

        The DataFrame will have metadata ltable and rtable, pointing
        to the same ltable and rtable as the input candset.

        Also, the output DataFrame will have three columns: key, foreign key
        ltable, foreign key rtable copied from input candset to the output
        DataFrame. These three columns precede the columns mentioned in
        `attrs_before`.

    Raises:
        AssertionError: If `candset` is not of type pandas DataFrame.
        AssertionError: If `attrs_before` has attributes that
            are not present in the input candset.
        AssertionError: If `attrs_after` has attributes that
            are not present in the input candset.
        AssertionError: If `feature_table` is set to None.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_f = em.get_features_for_matching(A, B)
        >>> # G is the labeled dataframe which should be converted into feature vectors
        >>> H = em.extract_feature_vecs(G, feature_table=match_f, attrs_before=['title'], attrs_after=['gold_labels'])
    """
    # Stage 1: validate the input parameters.
    # The candidate set must be a pandas DataFrame.
    validate_object_type(candset, pd.DataFrame, error_prefix='Input cand.set')

    # The extractor must derive from BaseFeatureExtractor.
    validate_subclass(FeatureExtractor, BaseFeatureExtractor,
                      error_prefix='Input FeatureExtractor')

    # Attributes that are to be carried over to the output must exist in the
    # input candset. attrs_before is checked first, then attrs_after.
    for param_name, attrs in (('attrs_before', attrs_before),
                              ('attrs_after', attrs_after)):
        if attrs is not None and not ch.check_attrs_present(candset, attrs):
            message = ('The attributes mentioned in %s is not present '
                       'in the input table' % param_name)
            logger.error(message)
            raise AssertionError(message)

    # Only None is rejected here; the type of feature_table is not checked.
    if feature_table is None:
        logger.error('Feature table cannot be null')
        raise AssertionError('The feature table cannot be null')

    # Stage 2: metadata checking.
    # Tell the user what metadata is required.
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, '
                        'ltable, rtable, ltable key, rtable key', verbose)

    # Fetch the metadata from the catalog.
    ch.log_info(logger, 'Getting metadata from catalog', verbose)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(candset, logger, verbose)

    # Validate the metadata.
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(candset, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Stage 3: apply the feature functions through the extractor.
    feature_extractor = FeatureExtractor(
        feature_table,
        n_jobs=n_jobs,
        verbose=verbose,
        show_progress=show_progress
    )
    feat_vals = feature_extractor.extract_from(candset)

    # Stage 4: build and format the output table.
    feature_vectors = pd.DataFrame(feat_vals, index=candset.index.values)

    # Arrange the feature columns in the order given by the feature table.
    feature_names = list(feature_table['feature_name'])
    feature_vectors = feature_vectors[feature_names]

    ch.log_info(logger, 'Constructing output table', verbose)

    # Insert attrs_before. Each attribute is inserted at position 0, so the
    # list is walked in reverse to keep the caller's order in the output.
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        attrs_before = gh.list_diff(attrs_before, [key, fk_ltable, fk_rtable])
        attrs_before.reverse()
        for a in attrs_before:
            feature_vectors.insert(0, a, candset[a])

    # Insert the keys in front of everything else: key, fk_ltable, fk_rtable.
    feature_vectors.insert(0, fk_rtable, candset[fk_rtable])
    feature_vectors.insert(0, fk_ltable, candset[fk_ltable])
    feature_vectors.insert(0, key, candset[key])

    # Insert attrs_after at the end of the table.
    # NOTE(review): the list is reversed and then appended left to right, so
    # the attrs_after columns come out in the reverse of the order supplied.
    # Kept unchanged so the output column order stays backward-compatible.
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        attrs_after = gh.list_diff(attrs_after, [key, fk_ltable, fk_rtable])
        attrs_after.reverse()
        col_pos = len(feature_vectors.columns)
        for a in attrs_after:
            feature_vectors.insert(col_pos, a, candset[a])
            col_pos += 1

    # Update the catalog: the output gets the candset's properties.
    cm.init_properties(feature_vectors)
    cm.copy_properties(candset, feature_vectors)

    # Finally, return the feature vectors.
    return feature_vectors
def read_csv_metadata(file_path, **kwargs):
    """
    Reads a CSV (comma-separated values) file into a pandas DataFrame
    and updates the catalog with the metadata. The CSV files typically
    contain data for the input tables or a candidate set.

    Specifically, this function first reads the CSV file from the given file
    path into a pandas DataFrame, by using pandas' in-built 'read_csv'
    method. Then, it updates the catalog with the metadata. There are three
    ways to update the metadata: (1) using a metadata file, (2) using the
    key-value parameters supplied in the function, and (3) using both
    metadata file and key-value parameters.

    To update the metadata in the catalog using the metadata file,
    the function will look for a file in the same directory with the same
    file name but with a specific extension. This extension can be
    optionally given by the user through the `metadata_extn` keyword
    (defaults to '.metadata'; a leading '.' is added if missing). If the
    metadata file is present, the function will read it and update the
    catalog appropriately. If the metadata file is not present, the function
    will issue a warning that the metadata file is not present.

    The metadata information can also be given as parameters to the
    function (see description of arguments for more details). If given,
    the function will update the catalog with the given information.

    Further, the metadata can partly reside in the metadata file and partly
    as supplied parameters. The function will take a union of the two and
    update the catalog appropriately. If the same metadata is given in both
    the metadata file and the function, then the metadata in the function
    takes precedence over the metadata given in the file.

    Args:
        file_path (string): The CSV file path.
        kwargs (dictionary): A Python dictionary containing key-value
            arguments. There are a few key-value pairs that are specific to
            read_csv_metadata and all the other key-value pairs are passed
            to pandas read_csv method.

    Returns:
        A pandas DataFrame read from the input CSV file.

    Raises:
        AssertionError: If `file_path` is not of type string.
        AssertionError: If a file does not exist in the given `file_path`.

    Examples:
        *Example 1:* Read from CSV file and set metadata

        >>> A = em.read_csv_metadata('path_to_csv_file', key='id')
        >>> em.get_key(A)
        # 'id'

        *Example 2:* Read from CSV file (with metadata file in the same
        directory)

        Let the metadata file contain the following contents:

        #key = id

        >>> A = em.read_csv_metadata('path_to_csv_file')
        >>> em.get_key(A)
        # 'id'

    See Also:
        :meth:`~py_entitymatching.to_csv_metadata`
    """
    # Validate the input parameters.
    # The file path is expected to be of type string.
    if not isinstance(file_path, six.string_types):
        logger.error('Input file path is not of type string')
        raise AssertionError('Input file path is not of type string')

    # The given path must exist.
    if not os.path.exists(file_path):
        logger.error('File does not exist at path %s' % file_path)
        raise AssertionError('File does not exist at path %s' % file_path)

    # Check if the user has specified the metadata file's extension; it is
    # popped so that it is not forwarded to pandas' read_csv.
    extension = kwargs.pop('metadata_extn', None)

    # If the extension is not specified then default to '.metadata'.
    if extension is None:
        extension = '.metadata'

    # Format the extension to include a '.' in front if the user has not
    # given one.
    if not extension.startswith('.'):
        extension = '.' + extension

    # If the metadata file is present, then read the metadata from it.
    if _is_metadata_file_present(file_path, extension=extension):
        file_name, _ = os.path.splitext(file_path)
        file_name = ''.join([file_name, extension])
        metadata, _ = _get_metadata_from_file(file_name)
    # Else issue a warning that the metadata file is not present.
    else:
        logger.warning('Metadata file is not present in the given path; '
                       'proceeding to read the csv file.')
        metadata = {}

    # Update the metadata with the key-value pairs given in the command. The
    # function _update_metadata_for_read_cmd takes care of updating the
    # metadata with only the key-value pairs specific to read_csv_metadata
    # and hands back the remaining kwargs for pandas.
    metadata, kwargs = _update_metadata_for_read_cmd(metadata, **kwargs)

    # Validate the metadata.
    _check_metadata_for_read_cmd(metadata)

    # Read the csv file using pandas read_csv method.
    data_frame = pd.read_csv(file_path, **kwargs)

    # Get the value for the 'key' property and update the catalog.
    key = metadata.pop('key', None)
    if key is not None:
        cm.set_key(data_frame, key)

    fk_ltable = metadata.pop('fk_ltable', None)
    if fk_ltable is not None:
        cm.set_fk_ltable(data_frame, fk_ltable)

    # Each foreign key is guarded by its own value: fk_rtable must be set
    # whenever it was supplied, independently of fk_ltable.
    fk_rtable = metadata.pop('fk_rtable', None)
    if fk_rtable is not None:
        cm.set_fk_rtable(data_frame, fk_rtable)

    # Update the catalog with the remaining properties.
    for property_name, property_value in six.iteritems(metadata):
        cm.set_property(data_frame, property_name, property_value)

    # Make sure the DataFrame has a catalog entry even when no metadata
    # was set above.
    if not cm.is_dfinfo_present(data_frame):
        cm.init_properties(data_frame)

    # Return the DataFrame.
    return data_frame
def get_false_negatives_as_df(table, eval_summary, verbose=False):
    """
    Return the false negatives of an evaluation as a DataFrame.

    The rows are selected from the input table using the 'false_neg_ls'
    entry of the evaluation summary, and the input table's catalog
    properties are copied to the result.

    Args:
        table (DataFrame): The input table (pandas DataFrame) that was used
            for evaluation.
        eval_summary (dictionary): A Python dictionary containing evaluation
            results, typically from 'eval_matches' command. Its
            'false_neg_ls' entry is used.
        verbose (boolean): A flag to indicate whether progress messages
            should be logged (defaults to False).

    Returns:
        A pandas DataFrame containing only the false negatives from
        the input table.

        Further, this function sets the output DataFrame's properties same
        as input DataFrame.

    Examples:
        >>> import py_entitymatching as em
        >>> # G is the labeled data used for development purposes, match_f is the feature table
        >>> H = em.extract_feature_vecs(G, feature_table=match_f, attrs_after='gold_labels')
        >>> dt = em.DTMatcher()
        >>> dt.fit(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels')
        >>> pred_table = dt.predict(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], append=True, target_attr='predicted_labels')
        >>> eval_summary = em.eval_matches(pred_table, 'gold_labels', 'predicted_labels')
        >>> false_neg_df = em.get_false_negatives_as_df(H, eval_summary)
    """
    # The input must be a pandas DataFrame.
    validate_object_type(table, pd.DataFrame, error_prefix='Input cand.set')

    # Tell the user which metadata this command relies on.
    required_metadata_msg = ('Required metadata: cand.set key, fk ltable, '
                             'fk rtable, '
                             'ltable, rtable, ltable key, rtable key')
    ch.log_info(logger, required_metadata_msg, verbose)

    # Look the metadata up in the catalog.
    ch.log_info(logger, 'Getting metadata from the catalog', verbose)
    candset_metadata = cm.get_metadata_for_candset(table, logger, verbose)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = candset_metadata

    # Make sure the metadata is consistent before using the table.
    ch.log_info(logger, 'Validating metadata', verbose)
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Pick the false-negative rows out of the input table.
    false_negative_ids = eval_summary['false_neg_ls']
    false_negatives = _get_dataframe(table, false_negative_ids)

    # Register the result in the catalog with the input table's properties.
    ch.log_info(logger, 'Updating catalog', verbose)
    cm.init_properties(false_negatives)
    cm.copy_properties(table, false_negatives)

    # Hand the result back.
    ch.log_info(logger, 'Returning the dataframe', verbose)
    return false_negatives
def sample_table(table, sample_size, replace=False, verbose=False):
    """
    Draw a uniform random sample of rows from a candidate set (typically
    for labeling purposes).

    The input DataFrame is expected to carry the metadata of a candidate
    set (key, fk_ltable, fk_rtable, ltable, rtable) in the catalog. Row
    positions are drawn with numpy's `random.choice`, sorted, and used to
    select the rows; the input DataFrame's properties are then copied to
    the sampled DataFrame.

    Args:
        table (DataFrame): The input DataFrame to be sampled.
            Specifically, a DataFrame containing the metadata of a
            candidate set (such as key, fk_ltable, fk_rtable, ltable,
            rtable) in the catalog.
        sample_size (int): The number of samples to be picked from the
            input DataFrame.
        replace (boolean): A flag to indicate whether sampling should be
            done with replacement or not (defaults to False).
        verbose (boolean): A flag to indicate whether more detailed
            information about the execution steps should be printed out
            (defaults to False).

    Returns:
        A new DataFrame with 'sample_size' number of rows.

        Further, this function sets the output DataFrame's properties same
        as input DataFrame.

    Raises:
        AssertionError: If `table` is not of type pandas DataFrame.
        AssertionError: If the size of `table` is 0.
        AssertionError: If the `sample_size` is greater than the input
            DataFrame size.

    Examples:
        >>> import py_entitymatching as em
        >>> S = em.sample_table(C, sample_size=450) # C is the candidate set to be sampled from.

    Note:
        When `replace` is True the sample can contain duplicate keys. For
        that reason the 'key' property is applied through `cm.set_key`
        instead of being copied as-is, and it is up to the user to fix the
        key after the function returns if it could not be set.
    """
    # The input must be a pandas DataFrame.
    validate_object_type(table, pd.DataFrame)

    # There has to be at least one row to sample from.
    n_rows = len(table)
    if n_rows == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')

    # The sample cannot be larger than the input table.
    if sample_size > n_rows:
        logger.error('Sample size is larger than the input table size')
        raise AssertionError('Sample size is larger than the input table size')

    # The metadata is validated because it is copied to the output below.
    # First, state what is required.
    required_metadata_msg = ('Required metadata: cand.set key, fk ltable, '
                             'fk rtable, ltable, rtable, ltable key, '
                             'rtable key')
    ch.log_info(logger, required_metadata_msg, verbose)

    # Then fetch it from the catalog and validate it.
    candset_metadata = cm.get_metadata_for_candset(table, logger, verbose)
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = candset_metadata
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Draw the row positions and keep them in ascending order.
    row_positions = sorted(
        np.random.choice(len(table), sample_size, replace=replace))
    sample = table.iloc[row_positions]

    # Register the sample in the catalog.
    cm.init_properties(sample)

    # Without replacement the properties can be copied over directly.
    if not replace:
        cm.copy_properties(table, sample)
        return sample

    # With replacement the key goes through cm.set_key so that its validity
    # is checked; every other property is copied as-is.
    for name, value in six.iteritems(cm.get_all_properties(table)):
        if name == 'key':
            cm.set_key(sample, value)
        else:
            cm.set_property(sample, name, value)
    return sample
def sample_table(table, sample_size, replace=False, verbose=False):
    """
    Samples a candidate set of tuple pairs (for labeling purposes).

    This function samples a DataFrame, typically used for labeling
    purposes. This function expects the input DataFrame containing the
    metadata of a candidate set (such as key, fk_ltable, fk_rtable, ltable,
    rtable). Specifically, this function samples row positions using
    uniform random sampling (numpy's `random.choice`), selects those rows in
    ascending position order and returns them as the sampled DataFrame.
    Further, it also copies the properties from the input DataFrame to the
    output DataFrame.

    Args:
        table (DataFrame): The input DataFrame to be sampled.
            Specifically, a DataFrame containing the metadata of a
            candidate set (such as key, fk_ltable, fk_rtable, ltable,
            rtable) in the catalog.
        sample_size (int): The number of samples to be picked from the
            input DataFrame.
        replace (boolean): A flag to indicate whether sampling should be
            done with replacement or not (defaults to False).
        verbose (boolean): A flag to indicate whether more detailed
            information about the execution steps should be printed out
            (defaults to False).

    Returns:
        A new DataFrame with 'sample_size' number of rows.

        Further, this function sets the output DataFrame's properties same
        as input DataFrame.

    Raises:
        AssertionError: If `table` is not of type pandas DataFrame.
        AssertionError: If the size of `table` is 0.
        AssertionError: If the `sample_size` is greater than the input
            DataFrame size.

    Examples:
        >>> import py_entitymatching as em
        >>> S = em.sample_table(C, sample_size=450) # C is the candidate set to be sampled from.

    Note:
        As mentioned in the above description, the output DataFrame is
        updated (in the catalog) with the properties from the input
        DataFrame. A subtle point to note here is, when the replace flag is
        set to True, then the output DataFrame can contain duplicate keys.
        In that case the key is applied through `cm.set_key` (which is
        expected to check its validity) rather than copied, and it is up to
        the user to fix it after the function returns.
    """
    # `pandas.np` was deprecated in pandas 1.0 and removed in pandas 2.0, so
    # numpy is used directly. The import is local because this block cannot
    # rely on a module-level `np` being available.
    import numpy as np

    # Validate input parameters.
    # The input DataFrame is expected to be of type pandas DataFrame.
    validate_object_type(table, pd.DataFrame)

    # There should be at least one row to sample from.
    if len(table) == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')

    # The sample size should be less than or equal to the number of rows in
    # the input DataFrame.
    if len(table) < sample_size:
        logger.error('Sample size is larger than the input table size')
        raise AssertionError('Sample size is larger than the input table size')

    # Now, validate the metadata for the input DataFrame as we have to copy
    # these properties to the output DataFrame.

    # First, display what metadata is required for this function.
    ch.log_info(logger, 'Required metadata: cand.set key, fk ltable, '
                        'fk rtable, ltable, rtable, ltable key, rtable key',
                verbose)

    # Second, get the metadata.
    key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
        cm.get_metadata_for_candset(table, logger, verbose)

    # Third, validate the metadata.
    cm._validate_metadata_for_candset(table, key, fk_ltable, fk_rtable,
                                      ltable, rtable, l_key, r_key,
                                      logger, verbose)

    # Draw the row positions for the output table.
    sample_indices = np.random.choice(len(table), sample_size,
                                      replace=replace)
    # Keep the sampled rows in their original relative order.
    sample_indices = sorted(sample_indices)
    sampled_table = table.iloc[sample_indices]

    # Copy the properties.
    cm.init_properties(sampled_table)

    if replace:
        # Sampling with replacement can duplicate key values, so the key is
        # set through cm.set_key instead of being copied blindly.
        properties = cm.get_all_properties(table)
        for property_name, property_value in six.iteritems(properties):
            if property_name == 'key':
                cm.set_key(sampled_table, property_value)
            else:
                # Copy the other properties as is.
                cm.set_property(sampled_table, property_name, property_value)
    else:
        cm.copy_properties(table, sampled_table)

    # Return the sampled table.
    return sampled_table