def deserialize( filepath: str, source_directory: str = "filestore", destination_directory: str = "system_temp", **kwargs, ) -> Dict[str, str]: DiskIOMethods.copy_file( join(FILEPATH_REGISTRY.get(source_directory), filepath), join(FILEPATH_REGISTRY.get(destination_directory), filepath), ) return { "filepath": filepath, "source_directory": destination_directory }
def serialize(
    obj: ddDataFrame,
    filepath: str,
    format_directory: str = JSON_DIRECTORY,
    format_extension: str = ".jsonl",
    destination_directory: str = "system_temp",
    **kwargs,
) -> Dict[str, str]:
    # Append the filepath to the storage directory
    # read_json method expects a * format
    destination_folder = FILEPATH_REGISTRY.get(destination_directory)
    filename_format = join(format_directory, filepath + "-*" + format_extension)
    full_path = join(destination_folder, filename_format)
    DaskPersistenceMethods.to_json(obj, full_path)

    written_filepaths = glob.glob(full_path)
    # strip out root path to keep relative to directory
    filepaths = []
    for written_path in written_filepaths:
        relative_path = written_path.split(destination_folder)[1]
        # strip the preceding /
        if relative_path[0] == "/":
            relative_path = relative_path[1:]
        filepaths.append(relative_path)

    return {"filepaths": filepaths, "source_directory": destination_directory}

def test_pandas_datasets(self):
    registered_name = "PandasFileBasedDataset"
    name = "titanic-regression-{test_name}"
    save_patterns = {"dataset": ["{save_pattern}"]}
    for save_pattern in [
        "pandas_disk_json",
        "pandas_disk_csv",
        "pandas_disk_parquet",
    ]:
        with self.subTest(
            "Pandas Dataset Regression with Titanic", save_pattern=save_pattern
        ):
            save_patterns["dataset"] = [save_pattern]
            regression_dataset = self.get_regression_artifact(
                "dataset", name=name.format(test_name=save_pattern)
            )
            with FILEPATH_REGISTRY.context_register("filestore", ARTIFACTS_PATH):
                new_dataset = DatasetCreator.create(
                    registered_name=registered_name,
                    name=name.format(test_name=f"{save_pattern}_new"),
                    save_patterns=save_patterns,
                    **dataset_kwargs_template,
                )
                self.compare_datasets(new_dataset, regression_dataset)

def deserialize(
    filepaths: List[str], source_directory: str = "system_temp", **kwargs
) -> Dict[str, Any]:
    # Resolve each partition filepath relative to the source directory
    full_paths = [
        join(FILEPATH_REGISTRY.get(source_directory), filepath)
        for filepath in filepaths
    ]
    return {"obj": DaskPersistenceMethods.read_json(full_paths)}

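
# Hedged round-trip sketch for the Dask JSON serialize/deserialize pair above.
# The DataFrame and filepath are illustrative assumptions; the "*" in the write
# path is replaced with the partition number, so serialize returns one relative
# path per written partition, which deserialize then reads back.
import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(pd.DataFrame({"a": range(10)}), npartitions=2)
result = serialize(ddf, filepath="example")
# result["filepaths"] might look like ["json/example-0.jsonl", "json/example-1.jsonl"]
restored = deserialize(
    filepaths=result["filepaths"], source_directory=result["source_directory"]
)["obj"]
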
def serialize(
    obj: Any,
    filepath: str,
    format_directory: str = HDF5_DIRECTORY,
    format_extension: str = ".h5",
    destination_directory: str = "system_temp",
    **kwargs,
) -> Dict[str, str]:
    # Append the filepath to the storage directory
    filepath = join(format_directory, filepath + format_extension)
    full_path = join(FILEPATH_REGISTRY.get(destination_directory), filepath)
    KerasPersistenceMethods.save_model(obj, full_path, save_format="h5")
    return {"filepath": filepath, "source_directory": destination_directory}

def serialize(
    obj: ddDataFrame,
    filepath: str,
    format_directory: str = ORC_DIRECTORY,
    format_extension: str = ".orc",
    destination_directory: str = "system_temp",
    **kwargs,
) -> Dict[str, str]:
    # Append the filepath to the storage directory
    filepath = join(format_directory, filepath + format_extension)
    full_path = join(FILEPATH_REGISTRY.get(destination_directory), filepath)
    DaskPersistenceMethods.to_orc(obj, full_path)
    return {"filepath": filepath, "source_directory": destination_directory}

def serialize(
    obj: Any,
    filepath: str,
    format_directory: str = PICKLE_DIRECTORY,
    format_extension: str = ".pkl",
    destination_directory: str = "system_temp",
    **kwargs,
) -> Dict[str, str]:
    # Append the filepath to the pickle storage directory
    filepath = join(format_directory, filepath + format_extension)
    full_path = join(FILEPATH_REGISTRY.get(destination_directory), filepath)
    # make sure the directory exists
    makedirs(dirname(full_path), exist_ok=True)
    PicklePersistenceMethods.dump_object(obj, full_path)
    return {"filepath": filepath, "source_directory": destination_directory}

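
# Hedged usage sketch for the pickle serializer above: any picklable object can
# be persisted; the returned relative filepath is what downstream save patterns
# and deserializers consume. The object and filepath are illustrative.
artifact = {"weights": [0.1, 0.2, 0.3]}
result = serialize(artifact, filepath="example_artifact")
# result == {"filepath": "pickle/example_artifact.pkl", "source_directory": "system_temp"}
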
def deserialize( filepath: str, source_directory: str = "system_temp", **kwargs ) -> Dict[str, Any]: full_path = join(FILEPATH_REGISTRY.get(source_directory), filepath) return {"obj": KerasPersistenceMethods.load_model(full_path)}
def deserialize(
    filepath: str, source_directory: str = "system_temp", **kwargs
) -> Dict[str, pd.DataFrame]:
    full_path = join(FILEPATH_REGISTRY.get(source_directory), filepath)
    return {"obj": PandasPersistenceMethods.read_json(full_path)}

def get_regression_artifact(self, persistable_type, **filters):
    # Temporarily point "filestore" at the saved regression artifacts and
    # load the requested persistable along with its external files
    with FILEPATH_REGISTRY.context_register("filestore", ARTIFACTS_PATH):
        persistable = getattr(PersistableLoader, f"load_{persistable_type}")(**filters)
        persistable.load_external_files()
        return persistable

LIBCLOUD_ROOT_PATH = "" LIBCLOUD_CONFIG_SECTION = None # Reference paths PICKLE_DIRECTORY = "pickle/" HDF5_DIRECTORY = "HDF5/" PARQUET_DIRECTORY = "parquet/" CSV_DIRECTORY = "csv/" ORC_DIRECTORY = "orc/" JSON_DIRECTORY = "json/" TENSORFLOW_SAVED_MODEL_DIRECTORY = "saved_model/" FILESTORE_DIRECTORY = os.path.join(SIMPLEML_DIRECTORY, "filestore/") SYSTEM_TEMP_DIRECTORY = tempfile.gettempdir() # register paths for consistent reference FILEPATH_REGISTRY.register("filestore", FILESTORE_DIRECTORY) FILEPATH_REGISTRY.register("system_temp", SYSTEM_TEMP_DIRECTORY) FILEPATH_REGISTRY.register("libcloud_root_path", LIBCLOUD_ROOT_PATH) # Create Paths if they don't exist - use try/excepts to catch race conditions def safe_makedirs(dir): try: os.makedirs(dir) except OSError as e: if e.errno != errno.EEXIST: raise if not os.path.exists(SIMPLEML_DIRECTORY): safe_makedirs(SIMPLEML_DIRECTORY)