def rename_col(df, old_col_name, new_col_name):
    # Rename the column in a copy of the input DataFrame.
    new_df = df.rename(columns={old_col_name: new_col_name})
    if cm.is_dfinfo_present(df):
        # Carry over the catalog metadata to the new DataFrame.
        cm.init_properties(new_df)
        cm.copy_properties(df, new_df)
        if _is_table_or_candset(df):
            if not _is_table(df):
                # The DataFrame is a candidate set; update the key and the
                # foreign-key properties if they refer to the renamed column.
                key, fk_ltable, fk_rtable, ltable, rtable, l_key, r_key = \
                    cm.get_metadata_for_candset(df, logger, False)
                if key == old_col_name:
                    cm.set_key(new_df, new_col_name)
                elif fk_ltable == old_col_name:
                    cm.set_fk_ltable(new_df, new_col_name)
                elif fk_rtable == old_col_name:
                    cm.set_fk_rtable(new_df, new_col_name)
            else:
                # The DataFrame is a base table; only the key can be affected.
                key = cm.get_key(df)
                if key == old_col_name:
                    cm.set_key(new_df, new_col_name)
    return new_df
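# A minimal usage sketch for rename_col (the file path and key column are
# hypothetical; assumes py_entitymatching is imported as `em`). It
# illustrates that the catalog key follows the renamed column.
import py_entitymatching as em

A = em.read_csv_metadata('tableA.csv', key='ID')  # hypothetical CSV
A_renamed = rename_col(A, 'ID', 'person_id')
assert em.get_key(A_renamed) == 'person_id'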
def test_blocker_combiner_valid_8(self):
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    C1 = read_csv_metadata(os.sep.join([bc_datasets_path, 'C4_ex_1.csv']),
                           ltable=A, rtable=B)
    C1.rename(columns={'l_ID': 'ltable_ID'}, inplace=True)
    C1.rename(columns={'r_ID': 'rtable_ID'}, inplace=True)
    cm.set_fk_ltable(C1, 'ltable_ID')
    cm.set_fk_rtable(C1, 'rtable_ID')
    C2 = read_csv_metadata(os.sep.join([bc_datasets_path, 'C4_ex_2.csv']),
                           ltable=A, rtable=B)
    C2.rename(columns={'l_ID': 'ltable_ID'}, inplace=True)
    C2.rename(columns={'r_ID': 'rtable_ID'}, inplace=True)
    cm.set_fk_ltable(C2, 'ltable_ID')
    cm.set_fk_rtable(C2, 'rtable_ID')
    C = combine_blocker_outputs_via_union([C1, C2], 'l_', 'r_')
    C_exp = read_csv_metadata(os.sep.join([bc_datasets_path, 'C_ex_4.csv']),
                              ltable=A, rtable=B)
    C_exp.rename(columns={'l_ID': 'ltable_ID'}, inplace=True)
    C_exp.rename(columns={'r_ID': 'rtable_ID'}, inplace=True)
    cm.set_fk_ltable(C_exp, 'ltable_ID')
    cm.set_fk_rtable(C_exp, 'rtable_ID')
    # C_exp.sort_values(['l_ID', 'r_ID'], inplace=True)
    # C_exp.reset_index(inplace=True, drop=True)
    # C_exp['_id'] = six.moves.range(0, len(C_exp))
    # C_exp.drop('r_address', axis=1, inplace=True)
    if os.name != 'nt':
        self.assertEqual(C.equals(C_exp), True)
    p1 = cm.get_all_properties(C)
    p2 = cm.get_all_properties(C_exp)
    self.assertEqual(p1, p2)
def test_set_fk_rtable_invalid_col(self):
    C = pd.read_csv(path_c)
    cm.set_fk_rtable(C, 'rtable_ID1')

def test_set_fk_rtable_invalid_df(self):
    cm.set_fk_rtable(None, 'rtable_ID')

def test_set_fk_rtable_valid(self):
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b)
    C = pd.read_csv(path_c)
    cm.set_fk_rtable(C, 'rtable_ID')
    self.assertEqual(cm.get_fk_rtable(C), 'rtable_ID')
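# The three tests above pin down the contract that the catalog manager's
# set_fk_rtable is expected to honor. Below is a hypothetical sketch of that
# contract (not the library's implementation): reject a non-DataFrame input,
# reject a column name absent from the DataFrame, and otherwise record the
# 'fk_rtable' property so that get_fk_rtable can return it.
import pandas as pd

def set_fk_rtable_sketch(catalog, df, col_name):
    if not isinstance(df, pd.DataFrame):  # the 'invalid_df' case above
        raise AssertionError('Input object is not of type pandas DataFrame')
    if col_name not in df.columns:        # the 'invalid_col' case above
        raise KeyError('Column %s is not in the DataFrame' % col_name)
    # The valid case: store the property keyed by the DataFrame's identity.
    catalog.setdefault(id(df), {})['fk_rtable'] = col_name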
def read_csv_metadata(file_path, **kwargs):
    """
    Reads a CSV (comma-separated values) file into a pandas DataFrame and
    updates the catalog with the metadata. The CSV files typically contain
    data for the input tables or a candidate set.

    Specifically, this function first reads the CSV file from the given file
    path into a pandas DataFrame, using pandas' built-in 'read_csv' method.
    Then, it updates the catalog with the metadata. There are three ways to
    update the metadata: (1) using a metadata file, (2) using the key-value
    parameters supplied to the function, and (3) using both a metadata file
    and key-value parameters.

    To update the metadata in the catalog using a metadata file, the
    function looks for a file in the same directory with the same file name
    but with a specific extension. This extension can optionally be given by
    the user (defaults to '.metadata'). If the metadata file is present, the
    function reads it and updates the catalog appropriately. If the metadata
    file is not present, the function issues a warning to that effect.

    The metadata information can also be given as parameters to the function
    (see the description of the arguments for more details). If given, the
    function updates the catalog with the given information.

    Further, the metadata can reside partly in the metadata file and partly
    in the supplied parameters. The function takes a union of the two and
    updates the catalog appropriately. If the same metadata is given both in
    the metadata file and as a function parameter, the function parameter
    takes precedence over the metadata given in the file.

    Args:
        file_path(string): The CSV file path.

        kwargs(dictionary): A Python dictionary containing key-value
            arguments. There are a few key-value pairs that are specific to
            read_csv_metadata; all the other key-value pairs are passed to
            pandas' read_csv method.

    Returns:
        A pandas DataFrame read from the input CSV file.

    Raises:
        AssertionError: If `file_path` is not of type string.
        AssertionError: If a file does not exist in the given `file_path`.

    Examples:
        *Example 1:* Read from a CSV file and set metadata

        >>> A = em.read_csv_metadata('path_to_csv_file', key='id')
        >>> em.get_key(A)
        # 'id'

        *Example 2:* Read from a CSV file (with a metadata file in the same
        directory). Let the metadata file contain the following contents:

            #key = id

        >>> A = em.read_csv_metadata('path_to_csv_file')
        >>> em.get_key(A)
        # 'id'

    See Also:
        :meth:`~py_entitymatching.to_csv_metadata`
    """
    # Validate the input parameters.
    # # File path is expected to be of type string.
    if not isinstance(file_path, six.string_types):
        logger.error('Input file path is not of type string')
        raise AssertionError('Input file path is not of type string')

    # # Check if the given path is valid.
    if not os.path.exists(file_path):
        logger.error('File does not exist at path %s' % file_path)
        raise AssertionError('File does not exist at path %s' % file_path)

    # Check if the user has specified the metadata file's extension.
    extension = kwargs.pop('metadata_extn', None)
    # If the extension is not specified, then set it to '.metadata'.
    if extension is None:
        extension = '.metadata'
    # Prepend a '.' to the extension if the user has not given one.
    if not extension.startswith('.'):
        extension = '.' + extension

    # If the metadata file is present, then update the metadata from the file.
    if _is_metadata_file_present(file_path, extension=extension):
        file_name, _ = os.path.splitext(file_path)
        file_name = ''.join([file_name, extension])
        metadata, _ = _get_metadata_from_file(file_name)
    # Else issue a warning that the metadata file is not present.
    else:
        logger.warning('Metadata file is not present in the given path; '
                       'proceeding to read the csv file.')
        metadata = {}

    # Update the metadata with the key-value pairs given in the command. The
    # function _update_metadata_for_read_cmd takes care of updating the
    # metadata with only the key-value pairs specific to the
    # read_csv_metadata method.
    metadata, kwargs = _update_metadata_for_read_cmd(metadata, **kwargs)

    # Validate the metadata.
    _check_metadata_for_read_cmd(metadata)

    # Read the csv file using pandas' read_csv method.
    data_frame = pd.read_csv(file_path, **kwargs)

    # Get the value for the 'key' property and update the catalog.
    key = metadata.pop('key', None)
    if key is not None:
        cm.set_key(data_frame, key)

    fk_ltable = metadata.pop('fk_ltable', None)
    if fk_ltable is not None:
        cm.set_fk_ltable(data_frame, fk_ltable)

    fk_rtable = metadata.pop('fk_rtable', None)
    if fk_rtable is not None:
        cm.set_fk_rtable(data_frame, fk_rtable)

    # Update the catalog with the other properties.
    for property_name, property_value in six.iteritems(metadata):
        cm.set_property(data_frame, property_name, property_value)

    if not cm.is_dfinfo_present(data_frame):
        cm.init_properties(data_frame)

    # Return the DataFrame.
    return data_frame
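# A minimal usage sketch for read_csv_metadata, assuming py_entitymatching
# is imported as `em`. 'person_table.csv', its companion file
# 'person_table.metadata', and the 'ssn' column are all hypothetical.
#
# Contents of person_table.metadata:
#     #key = id
#
import py_entitymatching as em

# Metadata file alone: the key is picked up from person_table.metadata.
A = em.read_csv_metadata('person_table.csv')
assert em.get_key(A) == 'id'

# A keyword argument takes precedence over the metadata file.
B = em.read_csv_metadata('person_table.csv', key='ssn')
assert em.get_key(B) == 'ssn'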
def block_tables(self, ltable, rtable, l_overlap_attr, r_overlap_attr,
                 rem_stop_words=False, q_val=None, word_level=True,
                 overlap_size=1, l_output_attrs=None, r_output_attrs=None,
                 l_output_prefix='ltable_', r_output_prefix='rtable_',
                 allow_missing=False, verbose=False, show_progress=True,
                 n_ltable_chunks=1, n_rtable_chunks=1):
    """
    WARNING: THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN
    RISK.

    Blocks two tables based on the overlap of token sets of attribute
    values. Finds tuple pairs from the left and right tables such that the
    overlap between (a) the set of tokens obtained by tokenizing the value
    of attribute l_overlap_attr of a tuple from the left table, and (b) the
    set of tokens obtained by tokenizing the value of attribute
    r_overlap_attr of a tuple from the right table, is above a certain
    threshold.

    Args:
        ltable (DataFrame): The left input table.

        rtable (DataFrame): The right input table.

        l_overlap_attr (string): The overlap attribute in the left table.

        r_overlap_attr (string): The overlap attribute in the right table.

        rem_stop_words (boolean): A flag to indicate whether stop words
            (e.g., a, an, the) should be removed from the token sets of the
            overlap attribute values (defaults to False).

        q_val (int): The value of q to use if the overlap attribute values
            are to be tokenized as qgrams (defaults to None).

        word_level (boolean): A flag to indicate whether the overlap
            attributes should be tokenized as words (i.e., using whitespace
            as the delimiter) (defaults to True).

        overlap_size (int): The minimum number of tokens that must overlap
            (defaults to 1).

        l_output_attrs (list): A list of attribute names from the left
            table to be included in the output candidate set (defaults to
            None).

        r_output_attrs (list): A list of attribute names from the right
            table to be included in the output candidate set (defaults to
            None).

        l_output_prefix (string): The prefix to be used for the attribute
            names coming from the left table in the output candidate set
            (defaults to 'ltable\_').

        r_output_prefix (string): The prefix to be used for the attribute
            names coming from the right table in the output candidate set
            (defaults to 'rtable\_').

        allow_missing (boolean): A flag to indicate whether tuple pairs
            with missing value in at least one of the blocking attributes
            should be included in the output candidate set (defaults to
            False). If this flag is set to True, a tuple in ltable with
            missing value in the blocking attribute will be matched with
            every tuple in rtable and vice versa.

        verbose (boolean): A flag to indicate whether debug information
            should be logged (defaults to False).

        show_progress (boolean): A flag to indicate whether progress should
            be displayed to the user (defaults to True).

        n_ltable_chunks (int): The number of partitions to split the left
            table into (defaults to 1). If it is set to -1, the number of
            partitions is set to the number of cores in the machine.

        n_rtable_chunks (int): The number of partitions to split the right
            table into (defaults to 1). If it is set to -1, the number of
            partitions is set to the number of cores in the machine.

    Returns:
        A candidate set of tuple pairs that survived blocking (DataFrame).

    Raises:
        AssertionError: If `ltable` is not of type pandas DataFrame.
        AssertionError: If `rtable` is not of type pandas DataFrame.
        AssertionError: If `l_overlap_attr` is not of type string.
        AssertionError: If `r_overlap_attr` is not of type string.
        AssertionError: If `l_output_attrs` is not of type list.
        AssertionError: If `r_output_attrs` is not of type list.
        AssertionError: If the values in `l_output_attrs` are not of type
            string.
        AssertionError: If the values in `r_output_attrs` are not of type
            string.
        AssertionError: If `l_output_prefix` is not of type string.
        AssertionError: If `r_output_prefix` is not of type string.
        AssertionError: If `q_val` is not of type int.
        AssertionError: If `word_level` is not of type boolean.
        AssertionError: If `overlap_size` is not of type int.
        AssertionError: If `verbose` is not of type boolean.
        AssertionError: If `allow_missing` is not of type boolean.
        AssertionError: If `show_progress` is not of type boolean.
        AssertionError: If `n_ltable_chunks` is not of type int.
        AssertionError: If `n_rtable_chunks` is not of type int.
        AssertionError: If `l_overlap_attr` is not in the ltable columns.
        AssertionError: If `r_overlap_attr` is not in the rtable columns.
        AssertionError: If `l_output_attrs` are not in the ltable.
        AssertionError: If `r_output_attrs` are not in the rtable.
        SyntaxError: If `q_val` is set to a valid value and `word_level` is
            set to True.
        SyntaxError: If `q_val` is set to None and `word_level` is set to
            False.

    Examples:
        >>> from py_entitymatching.dask.dask_overlap_blocker import DaskOverlapBlocker
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> ob = DaskOverlapBlocker()
        # Use all cores
        # # Use word-level tokenizer
        >>> C1 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], word_level=True, overlap_size=1, n_ltable_chunks=-1, n_rtable_chunks=-1)
        # # Use q-gram tokenizer
        >>> C2 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], word_level=False, q_val=2, n_ltable_chunks=-1, n_rtable_chunks=-1)
        # # Include all possible missing values
        >>> C3 = ob.block_tables(A, B, 'address', 'address', l_output_attrs=['name'], r_output_attrs=['name'], allow_missing=True, n_ltable_chunks=-1, n_rtable_chunks=-1)
    """
    logger.warning("WARNING THIS COMMAND IS EXPERIMENTAL AND NOT TESTED. "
                   "USE AT YOUR OWN RISK.")
    # Input validations.
    self.validate_types_params_tables(ltable, rtable, l_output_attrs,
                                      r_output_attrs, l_output_prefix,
                                      r_output_prefix, verbose,
                                      n_ltable_chunks, n_rtable_chunks)
    self.validate_types_other_params(l_overlap_attr, r_overlap_attr,
                                     rem_stop_words, q_val, word_level,
                                     overlap_size)
    self.validate_allow_missing(allow_missing)
    self.validate_show_progress(show_progress)
    self.validate_overlap_attrs(ltable, rtable, l_overlap_attr,
                                r_overlap_attr)
    self.validate_output_attrs(ltable, rtable, l_output_attrs,
                               r_output_attrs)
    self.validate_word_level_qval(word_level, q_val)

    log_info(logger, 'Required metadata: ltable key, rtable key', verbose)

    l_key, r_key = cm.get_keys_for_ltable_rtable(ltable, rtable, logger,
                                                 verbose)

    # Validate metadata.
    cm._validate_metadata_for_table(ltable, l_key, 'ltable', logger, verbose)
    cm._validate_metadata_for_table(rtable, r_key, 'rtable', logger, verbose)

    # Validate input table chunks.
    validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks')
    validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks')
    validate_chunks(n_ltable_chunks)
    validate_chunks(n_rtable_chunks)

    if n_ltable_chunks == -1:
        n_ltable_chunks = multiprocessing.cpu_count()

    # Split the left table into chunks (assumes numpy is imported as np at
    # the module level; the deprecated pd.np alias is avoided here).
    ltable_chunks = np.array_split(ltable, n_ltable_chunks)

    # Preprocess/tokenize ltable.
    if word_level:
        tokenizer = WhitespaceTokenizer(return_set=True)
    else:
        tokenizer = QgramTokenizer(qval=q_val, return_set=True)

    preprocessed_tokenized_ltbl = []

    # Construct a DAG for preprocessing/tokenizing the ltable chunks.
    start_row_id = 0
    for i in range(len(ltable_chunks)):
        result = delayed(self.process_tokenize_block_attr)(
            ltable_chunks[i][l_overlap_attr], start_row_id, rem_stop_words,
            tokenizer)
        preprocessed_tokenized_ltbl.append(result)
        start_row_id += len(ltable_chunks[i])
    preprocessed_tokenized_ltbl = delayed(wrap)(preprocessed_tokenized_ltbl)

    # Execute the DAG.
    if show_progress:
        with ProgressBar():
            logger.info('Preprocessing/tokenizing ltable')
            preprocessed_tokenized_ltbl_vals = \
                preprocessed_tokenized_ltbl.compute(
                    scheduler="processes",
                    num_workers=multiprocessing.cpu_count())
    else:
        preprocessed_tokenized_ltbl_vals = \
            preprocessed_tokenized_ltbl.compute(
                scheduler="processes",
                num_workers=multiprocessing.cpu_count())

    ltable_processed_dict = {}
    for i in range(len(preprocessed_tokenized_ltbl_vals)):
        ltable_processed_dict.update(preprocessed_tokenized_ltbl_vals[i])

    # Build an inverted index over the tokenized ltable values.
    inverted_index = self.build_inverted_index(ltable_processed_dict)

    if n_rtable_chunks == -1:
        n_rtable_chunks = multiprocessing.cpu_count()

    rtable_chunks = np.array_split(rtable, n_rtable_chunks)

    # Construct the DAG for probing.
    probe_result = []
    start_row_id = 0
    for i in range(len(rtable_chunks)):
        result = delayed(self.probe)(rtable_chunks[i][r_overlap_attr],
                                     inverted_index, start_row_id,
                                     rem_stop_words, tokenizer, overlap_size)
        probe_result.append(result)
        start_row_id += len(rtable_chunks[i])
    probe_result = delayed(wrap)(probe_result)

    # Execute the DAG for probing.
    if show_progress:
        with ProgressBar():
            logger.info('Probing using rtable')
            probe_result = probe_result.compute(
                scheduler="processes",
                num_workers=multiprocessing.cpu_count())
    else:
        probe_result = probe_result.compute(
            scheduler="processes",
            num_workers=multiprocessing.cpu_count())

    # Construct a minimal DataFrame that can be used to add more attributes.
    flat_list = [item for sublist in probe_result for item in sublist]
    tmp = pd.DataFrame(flat_list, columns=['fk_ltable_rid', 'fk_rtable_rid'])
    fk_ltable = ltable.iloc[tmp.fk_ltable_rid][l_key].values
    fk_rtable = rtable.iloc[tmp.fk_rtable_rid][r_key].values
    id_vals = list(range(len(flat_list)))

    candset = pd.DataFrame.from_dict(
        {'_id': id_vals,
         l_output_prefix + l_key: fk_ltable,
         r_output_prefix + r_key: fk_rtable})

    # Set the properties for the candidate set. The foreign-key columns use
    # the output prefixes so that the metadata matches the columns actually
    # created above (the original hardcoded 'ltable_'/'rtable_').
    cm.set_key(candset, '_id')
    cm.set_fk_ltable(candset, l_output_prefix + l_key)
    cm.set_fk_rtable(candset, r_output_prefix + r_key)
    cm.set_ltable(candset, ltable)
    cm.set_rtable(candset, rtable)

    ret_candset = gh.add_output_attributes(candset,
                                           l_output_attrs=l_output_attrs,
                                           r_output_attrs=r_output_attrs,
                                           l_output_prefix=l_output_prefix,
                                           r_output_prefix=r_output_prefix,
                                           validate=False)

    # Handle missing values.
    if allow_missing:
        missing_value_pairs = get_pairs_with_missing_value(
            ltable, rtable, l_key, r_key, l_overlap_attr, r_overlap_attr,
            l_output_attrs, r_output_attrs, l_output_prefix,
            r_output_prefix, False, False)
        missing_value_pairs.insert(
            0, '_id',
            range(len(ret_candset),
                  len(ret_candset) + len(missing_value_pairs)))

        if len(missing_value_pairs) > 0:
            ret_candset = pd.concat([ret_candset, missing_value_pairs],
                                    ignore_index=True, sort=False)
            cm.set_key(ret_candset, '_id')
            cm.set_fk_ltable(ret_candset, l_output_prefix + l_key)
            cm.set_fk_rtable(ret_candset, r_output_prefix + r_key)
            cm.set_ltable(ret_candset, ltable)
            cm.set_rtable(ret_candset, rtable)

    # Return the final candidate set to the user.
    return ret_candset
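# The blocker above parallelizes two core steps with Dask: building an
# inverted index from the tokenized left table, and probing that index with
# the tokenized right table. Below is a minimal, serial sketch of those two
# steps (a hypothetical helper using whitespace tokenization, not the
# DaskOverlapBlocker implementation).
def overlap_block_sketch(l_values, r_values, overlap_size=1):
    # l_values, r_values: lists of strings (the overlap attribute values).
    # Returns (l_row_id, r_row_id) pairs sharing >= overlap_size tokens.
    inverted_index = {}
    for l_id, value in enumerate(l_values):
        for token in set(str(value).split()):
            inverted_index.setdefault(token, set()).add(l_id)
    pairs = []
    for r_id, value in enumerate(r_values):
        # Count, per left row, how many of this right value's tokens it shares.
        overlap_counts = {}
        for token in set(str(value).split()):
            for l_id in inverted_index.get(token, ()):
                overlap_counts[l_id] = overlap_counts.get(l_id, 0) + 1
        pairs.extend((l_id, r_id) for l_id, count in overlap_counts.items()
                     if count >= overlap_size)
    return pairs

# Example: '12 main st' and 'main st apt 2' share the tokens {'main', 'st'},
# so the pair survives blocking with overlap_size=2.
assert (0, 0) in overlap_block_sketch(['12 main st'], ['main st apt 2'],
                                      overlap_size=2)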