def test_validate_and_set_fk_rtable_err_case_2(self):
    """Setting fk_rtable must fail when the rtable has an invalid key column."""
    candidate_set = pd.read_csv(path_c)
    invalid_table_path = os.sep.join([catalog_datasets_path, 'A_inv_fk.csv'])
    right_table = pd.read_csv(invalid_table_path)
    # The validation should reject the foreign key and leave the candidate
    # set unregistered in the catalog.
    result = cm.validate_and_set_fk_rtable(candidate_set, 'ltable_ID',
                                           right_table, 'ID')
    self.assertEqual(result, False)
    self.assertEqual(cm.is_dfinfo_present(candidate_set), False)
def test_copy_properties_valid_1(self):
    """Copying properties onto a plain DataFrame registers it with identical metadata."""
    source = read_csv_metadata(path_a)
    target = pd.read_csv(path_a)
    cm.copy_properties(source, target)
    # The target must now be in the catalog with the same properties and key.
    self.assertEqual(cm.is_dfinfo_present(target), True)
    self.assertEqual(cm.get_all_properties(source),
                     cm.get_all_properties(target))
    self.assertEqual(cm.get_key(target), cm.get_key(source))
def test_copy_properties_valid_2(self):
    """Copying a candidate set's properties must carry over all metadata.

    Verifies key, ltable/rtable pointers and foreign-key columns are copied.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b)
    C = read_csv_metadata(path_c, ltable=A, rtable=B)
    C1 = pd.read_csv(path_c)
    cm.copy_properties(C, C1)
    self.assertEqual(cm.is_dfinfo_present(C1), True)
    # Bug fix: compare the source table's properties with the copy's.
    # The original fetched C1's properties twice and compared C1 with
    # itself, which is vacuously true and tests nothing.
    p = cm.get_all_properties(C)
    p1 = cm.get_all_properties(C1)
    self.assertEqual(p, p1)
    self.assertEqual(cm.get_key(C1), cm.get_key(C))
    self.assertEqual(cm.get_ltable(C1).equals(A), True)
    self.assertEqual(cm.get_rtable(C1).equals(B), True)
    self.assertEqual(cm.get_fk_ltable(C1), cm.get_fk_ltable(C))
    self.assertEqual(cm.get_fk_rtable(C1), cm.get_fk_rtable(C))
def _write_metadata(data_frame, file_path):
    """
    Write the catalog metadata of the given DataFrame to disk.

    Args:
        data_frame (DataFrame): DataFrame whose catalog properties should be
            persisted.
        file_path (string): Path of the file the metadata is written to.

    Returns:
        A boolean value of True if the metadata was written to disk, False
        otherwise (the DataFrame is not in the catalog, or it has no
        properties to write).
    """
    # Metadata dictionary preserving property insertion order.
    metadata_dict = collections.OrderedDict()

    # Get all the properties for the input data frame. If the data frame is
    # not in the catalog, then return immediately.
    if cm.is_dfinfo_present(data_frame) is True:
        properties = cm.get_all_properties(data_frame)
    else:
        return False

    # If the properties are present in the catalog, then write them to disk.
    if len(properties) > 0:
        for property_name, property_value in six.iteritems(properties):
            # If the property value is not of type string, then just write it
            # as 'POINTER'. This is useful while writing candidate sets to
            # disk: a candidate set has properties such as ltable and rtable
            # which are DataFrames, and there is no simple way to write them
            # to disk and link them back to the candidate set when reading it
            # back. 'POINTER' is the special value marking such non-string
            # objects.
            if isinstance(property_value, six.string_types) is False:
                metadata_dict[property_name] = 'POINTER'
            else:
                metadata_dict[property_name] = property_value

        # Write the properties to a file on disk, one property per line,
        # using the syntax:
        # #property_name=property_value
        with open(file_path, 'w') as file_handler:
            for property_name, property_value in six.iteritems(metadata_dict):
                file_handler.write('#%s=%s\n' % (property_name,
                                                 property_value))
        return True

    # Bug fix: the DataFrame is in the catalog but has no properties; the
    # original fell through and implicitly returned None here. Return an
    # explicit False (equally falsy for callers) to honor the boolean
    # contract.
    return False
def test_is_dfinfo_present_invalid(self):
    # Invalid input: None is not a DataFrame. No assertion follows, so this
    # test presumably relies on a raises-style decorator (defined outside
    # this view) to expect an exception — TODO confirm against the test
    # class definition.
    cm.is_dfinfo_present(None)
def test_valid_path_wi_invalidmetadata_replace_key(self):
    """An explicit key= argument replaces the key from the metadata file."""
    cm.del_catalog()
    csv_path = os.sep.join([io_datasets_path, 'A_key_zipcode.csv'])
    table = read_csv_metadata(csv_path, key='ID')
    # The table is registered and ends up with a 'key' property set.
    self.assertEqual(cm.is_dfinfo_present(table), True)
    self.assertEqual(cm.has_property(table, 'key'), True)
def test_valid_path_wo_metadata(self):
    """Reading a CSV with no metadata file still registers it in the catalog."""
    cm.del_catalog()
    table = read_csv_metadata(path_b)
    expected = pd.read_csv(path_b)
    # The data must be identical to a plain pandas read, and the table must
    # still be tracked by the catalog.
    self.assertEqual(table.equals(expected), True)
    self.assertEqual(cm.is_dfinfo_present(table), True)
def test_del_all_properties_valid_1(self):
    """Deleting all properties removes the DataFrame from the catalog."""
    table = read_csv_metadata(path_a)
    cm.del_all_properties(table)
    self.assertEqual(cm.is_dfinfo_present(table), False)
def test_valid_path_wi_metadata_unknownprop(self):
    """Unknown properties found in the metadata file are stored verbatim."""
    cm.del_catalog()
    csv_path = os.sep.join([io_datasets_path, 'InvalidMetadata1.csv'])
    table = read_csv_metadata(csv_path)
    self.assertEqual(cm.is_dfinfo_present(table), True)
    # 'key1' is not a recognized metadata name, but it is still recorded.
    self.assertEqual(cm.get_property(table, 'key1'), 'ID')
def test_validpath_metadata_set_to_none_1(self):
    """Reading with key=None registers the table; get_key is then exercised."""
    cm.del_catalog()
    del_files_in_dir(sndbx_path)
    table = read_csv_metadata(path_a, key=None)
    self.assertEqual(cm.is_dfinfo_present(table), True)
    # Exercise get_key on a table whose key was explicitly set to None
    # (the expected outcome is presumably checked by a decorator — no
    # assertion follows in the original either).
    cm.get_key(table)
def read_csv_metadata(file_path, **kwargs):
    """
    Read CSV (comma-separated) file into DataFrame, and update the catalog
    with the metadata read from the same file name with an extension
    specified by the user (with the default value set to '.metadata') or the
    metadata given as key-value arguments.

    Reads the CSV file from the given file path into a pandas DataFrame.
    This function uses 'read_csv' method from pandas to read the CSV file
    into a pandas DataFrame. Further it looks for a file with same file name
    but with a specific extension. This extension can be given by the user,
    with the default value being '.metadata'. If the metadata file is
    present, the function will read and update the catalog. If the metadata
    file is not present, the function will issue a warning that the metadata
    file is not present and will read the CSV file into a pandas DataFrame.

    The metadata information can also be given as parameters to the function
    (see description of arguments for more details). If given, the function
    will update the catalog with the given information. Further, the
    metadata given in the function takes precedence over the metadata given
    in the file.

    Args:
        file_path (string): CSV file path.

        kwargs (dict): A python dictionary containing key-value arguments.
            There are a few key-value pairs that are specific to
            read_csv_metadata and all the other key-value pairs are passed
            to pandas read_csv method. The keys that are specific to
            read_csv_metadata are: (1) metadata_extn, (2) key,
            (3) fk_ltable, (4) fk_rtable, (5) ltable, and (6) rtable. Here
            the metadata_extn is the expected metadata extension (with the
            default value set to '.metadata'), and all the others are
            metadata related to the DataFrame read from the CSV file.

    Returns:
        A pandas DataFrame read from the given CSV file.

    Raises:
        AssertionError: If the input file path is not of type string.
        AssertionError: If a file does not exist in the given file path.
    """
    # Validate the input parameters.
    # # File path is expected to be of type string.
    if not isinstance(file_path, six.string_types):
        logger.error('Input file path is not of type string')
        raise AssertionError('Input file path is not of type string')
    # # Check if the given path is valid.
    if not os.path.exists(file_path):
        logger.error('File does not exist at path %s', file_path)
        # Bug fix: interpolate the path into the message. The original
        # passed file_path as a second constructor argument, which leaves
        # the '%s' placeholder unformatted in the error text.
        raise AssertionError('File does not exist at path %s' % file_path)

    # Check if the user has specified the metadata file's extension.
    extension = kwargs.pop('metadata_extn', None)
    # If the extension is not specified then set the extension to
    # '.metadata'.
    if extension is None:
        extension = '.metadata'
    # Format the extension to include a '.' in front if the user has not
    # given one.
    if not extension.startswith('.'):
        extension = '.' + extension

    # If the metadata file is present, then update metadata from file.
    if _is_metadata_file_present(file_path, extension=extension):
        file_name, _ = os.path.splitext(file_path)
        file_name = ''.join([file_name, extension])
        metadata, _ = _get_metadata_from_file(file_name)
    # Else issue a warning that the metadata file is not present.
    else:
        logger.warning('Metadata file is not present in the given path; '
                       'proceeding to read the csv file.')
        metadata = {}

    # Update the metadata with the key-value pairs given in the command. The
    # function _update_metadata_for_read_cmd takes care of updating the
    # metadata with only the key-value pairs specific to read_csv_metadata
    # method.
    metadata, kwargs = _update_metadata_for_read_cmd(metadata, **kwargs)

    # Validate the metadata.
    _check_metadata_for_read_cmd(metadata)

    # Read the csv file using pandas read_csv method (remaining kwargs are
    # forwarded untouched).
    data_frame = pd.read_csv(file_path, **kwargs)

    # Get the value for 'key' property and update the catalog.
    key = metadata.pop('key', None)
    if key is not None:
        cm.set_key(data_frame, key)

    # Update the catalog with the remaining properties.
    for property_name, property_value in six.iteritems(metadata):
        cm.set_property(data_frame, property_name, property_value)

    # Ensure the DataFrame is registered in the catalog even when no
    # metadata at all was set above.
    if not cm.is_dfinfo_present(data_frame):
        cm.init_properties(data_frame)

    # Return the DataFrame.
    return data_frame
def test_init_properties_valid(self):
    """init_properties registers a plain DataFrame in the catalog."""
    table = pd.read_csv(path_a)
    cm.init_properties(table)
    self.assertEqual(cm.is_dfinfo_present(table), True)
def test_del_all_properties_valid_2(self):
    """Deleting all properties of a candidate set removes it from the catalog."""
    left = read_csv_metadata(path_a)
    right = read_csv_metadata(path_b)
    candidate_set = read_csv_metadata(path_c, ltable=left, rtable=right)
    cm.del_all_properties(candidate_set)
    self.assertEqual(cm.is_dfinfo_present(candidate_set), False)
def test_is_dfinfo_present_valid_1(self):
    """is_dfinfo_present returns True for a table read via read_csv_metadata."""
    table = read_csv_metadata(path_a)
    self.assertEqual(cm.is_dfinfo_present(table), True)
def test_is_dfinfo_present_valid_2(self):
    """is_dfinfo_present returns False for a plain, unregistered DataFrame."""
    table = pd.read_csv(path_a)
    self.assertEqual(cm.is_dfinfo_present(table), False)
def save_table(data_frame, file_path, metadata_ext='.pklmetadata'):
    """
    Save the DataFrame to disk along with the metadata.

    This function saves the DataFrame to disk along with the metadata from
    the catalog. Specifically, this function saves the DataFrame in the
    given file_path, and saves the metadata in the same directory (as the
    file_path) but with a different extension. This extension can be given
    by the user, if not a default extension of 'pklmetadata' is used.

    Args:
        data_frame (DataFrame): DataFrame that should be saved.

        file_path (string): File path where the DataFrame must be stored.

        metadata_ext (string): Metadata extension that should be used while
            storing the metadata information. The default value is
            '.pklmetadata'.

    Returns:
        A boolean value of True is returned if the DataFrame is successfully
        saved.

    Raises:
        AssertionError: If the input object is not of type pandas DataFrame.
        AssertionError: If the file path or the metadata extension is not of
            type string.
        AssertionError: If the given file path cannot be written to.

    See Also:
        save_object, to_csv_metadata.

    Notes:
        This function is bit different from to_csv_metadata, where the
        DataFrame is stored in a CSV file format. The CSV file format can
        be viewed with a text editor. But save_table is stored in a special
        format, which cannot be viewed with a text editor. The reason we
        have save_table is, for larger DataFrames it is efficient to pickle
        the DataFrame to disk than writing the DataFrame in CSV format.
    """
    # Validate the input parameters.
    # # data_frame is expected to be of type pandas DataFrame.
    if not isinstance(data_frame, pd.DataFrame):
        # Bug fix: use the module-level 'logger' consistently (the original
        # called 'logging.error' here, bypassing the module logger used in
        # every other validation in this function).
        logger.error('Input object is not of type pandas DataFrame')
        raise AssertionError('Input object is not of type pandas DataFrame')
    # # file_path is expected to be of type string (the original comment
    # incorrectly said "pandas DataFrame").
    if not isinstance(file_path, six.string_types):
        logger.error('Input file path is not of type string')
        raise AssertionError('Input file path is not of type string')
    # # metadata_ext is expected to be of type string.
    if not isinstance(metadata_ext, six.string_types):
        logger.error('Input metadata ext is not of type string')
        raise AssertionError('Input metadata ext is not of type string')

    # Get the file_name (without extension) and the extension from the
    # given file path. For example if the file_path was /Users/foo/file.csv
    # then the file_name will be /Users/foo/file and the extension will be
    # '.csv'.
    file_name, _ = os.path.splitext(file_path)

    # The metadata file name is the same file name but with the extension
    # given by the user.
    metadata_filename = file_name + metadata_ext

    # Check if the file exists in the file_path and whether we have
    # sufficient access privileges to write in that path.
    can_write, file_exists = ps._check_file_path(file_path)
    if not can_write:
        # Looks like we cannot write the file in the given path; raise an
        # error in this case.
        logger.error('Cannot write in the file path %s; Exiting', file_path)
        # Bug fix: interpolate the path into the message. The original
        # passed file_path as a second constructor argument, leaving the
        # '%s' placeholder unformatted in the error text.
        raise AssertionError('Cannot write in the file path %s' % file_path)

    # If the file already exists then issue a warning before overwriting.
    if file_exists:
        logger.warning('File already exists at %s; Overwriting it', file_path)
    # Open the file in binary mode, as we are writing in binary (pickle)
    # format. (The original duplicated this identical write in both
    # branches of the file_exists check.)
    with open(file_path, 'wb') as file_handler:
        cloudpickle.dump(data_frame, file_handler)

    # Once we are done with writing the DataFrame, we will write the
    # metadata now.

    # Initialize a metadata dictionary to hold the metadata of DataFrame
    # from the catalog.
    metadata_dict = collections.OrderedDict()

    # Get all the properties for the input data frame, if the DataFrame
    # information is present in the catalog.
    properties = {}
    if cm.is_dfinfo_present(data_frame) is True:
        properties = cm.get_all_properties(data_frame)

    # If the properties are present in the catalog, collect only the
    # string-valued ones (non-string properties such as ltable/rtable
    # DataFrames are skipped here, matching the original behavior).
    if len(properties) > 0:
        for property_name, property_value in six.iteritems(properties):
            if isinstance(property_value, six.string_types) is True:
                metadata_dict[property_name] = property_value

    # Try to save metadata.
    can_write, file_exists = ps._check_file_path(metadata_filename)
    if can_write:
        # If the file already exists, then issue a warning before
        # overwriting the file.
        if file_exists:
            logger.warning(
                'Metadata file already exists at %s. Overwriting it',
                metadata_filename)
        # Write metadata contents.
        with open(metadata_filename, 'wb') as file_handler:
            cloudpickle.dump(metadata_dict, file_handler)
    else:
        # Metadata cannot be written; warn and skip (best-effort — the
        # DataFrame itself was already saved, so do not fail here).
        logger.warning(
            'Cannot write metadata at the file path %s. Skip writing metadata '
            'file', metadata_filename)

    return True