Example #1
    def deserialize(
        filepath: str,
        source_directory: str = "filestore",
        destination_directory: str = "system_temp",
        **kwargs,
    ) -> Dict[str, str]:
        DiskIOMethods.copy_file(
            join(FILEPATH_REGISTRY.get(source_directory), filepath),
            join(FILEPATH_REGISTRY.get(destination_directory), filepath),
        )

        # the destination of this copy becomes the source directory
        # for whatever consumes the staged file next
        return {
            "filepath": filepath,
            "source_directory": destination_directory
        }
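These serialize/deserialize methods all return a dict whose keys match the parameters of the next persistence step, which suggests they are chained: the destination of one stage becomes the source of the next. A hypothetical sketch of that pattern (run_steps and the step signature are invented for illustration, not SimpleML API):

    from typing import Any, Callable, Dict, List

    def run_steps(steps: List[Callable[..., Dict[str, Any]]],
                  **initial_kwargs) -> Dict[str, Any]:
        # merge each step's returned dict into the kwargs of the next,
        # so "source_directory" set by one stage is read by the following
        kwargs = dict(initial_kwargs)
        for step in steps:
            kwargs.update(step(**kwargs))
        return kwargs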
Example #2
    def serialize(
        obj: ddDataFrame,
        filepath: str,
        format_directory: str = JSON_DIRECTORY,
        format_extension: str = ".jsonl",
        destination_directory: str = "system_temp",
        **kwargs,
    ) -> Dict[str, str]:
        # Append the filepath to the storage directory;
        # Dask's read_json expects a "*" glob pattern
        destination_folder = FILEPATH_REGISTRY.get(destination_directory)
        filename_format = join(format_directory,
                               filepath + "-*" + format_extension)
        full_path = join(destination_folder, filename_format)
        DaskPersistenceMethods.to_json(obj, full_path)

        written_filepaths = glob.glob(full_path)

        # strip the root path so returned paths stay relative to the directory
        filepaths = []
        for written_path in written_filepaths:
            relative_path = written_path.split(destination_folder)[1]
            # strip the leading "/"
            if relative_path[0] == "/":
                relative_path = relative_path[1:]
            filepaths.append(relative_path)

        return {
            "filepaths": filepaths,
            "source_directory": destination_directory
        }
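For reference, the root-stripping loop above can also be expressed with os.path.relpath, which removes the root directory and the separator in one call. A self-contained sketch with made-up paths:

    import glob
    import os

    destination_folder = "/tmp/simpleml-demo"
    pattern = os.path.join(destination_folder, "json", "example-*.jsonl")
    # relpath strips the root directory and the leading separator at once
    filepaths = [os.path.relpath(path, destination_folder)
                 for path in glob.glob(pattern)]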
Example #3
    def test_pandas_datasets(self):
        registered_name = "PandasFileBasedDataset"
        name = "titanic-regression-{test_name}"
        save_patterns = {"dataset": ["{save_pattern}"]}

        for save_pattern in [
                "pandas_disk_json",
                "pandas_disk_csv",
                "pandas_disk_parquet",
        ]:
            with self.subTest("Pandas Dataset Regression with Titanic",
                              save_pattern=save_pattern):
                save_patterns["dataset"] = [save_pattern]
                regression_dataset = self.get_regression_artifact(
                    "dataset", name=name.format(test_name=save_pattern))
                with FILEPATH_REGISTRY.context_register(
                        "filestore", ARTIFACTS_PATH):
                    new_dataset = DatasetCreator.create(
                        registered_name=registered_name,
                        name=name.format(test_name=f"{save_pattern}_new"),
                        save_patterns=save_patterns,
                        **dataset_kwargs_template,
                    )

                self.compare_datasets(new_dataset, regression_dataset)
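The subTest context manager is what keeps one failing save pattern from masking the others. A minimal standalone illustration (test names are made up):

    import unittest

    class SavePatternTest(unittest.TestCase):
        def test_patterns(self):
            for pattern in ["csv", "json", "parquet"]:
                # each parameterization is reported as its own sub-test
                # rather than aborting the method on the first failure
                with self.subTest(pattern=pattern):
                    self.assertTrue(pattern.isalpha())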
Example #4
 def deserialize(filepaths: List[str],
                 source_directory: str = "system_temp",
                 **kwargs) -> Dict[str, Any]:
     full_paths = [
         join(FILEPATH_REGISTRY.get(source_directory), filepath)
         for filepath in filepaths
     ]
     return {"obj": DaskPersistenceMethods.read_json(full_paths)}
Example #5
    def serialize(
        obj: Any,
        filepath: str,
        format_directory: str = HDF5_DIRECTORY,
        format_extension: str = ".h5",
        destination_directory: str = "system_temp",
        **kwargs,
    ) -> Dict[str, str]:

        # Append the filepath to the storage directory
        filepath = join(format_directory, filepath + format_extension)
        full_path = join(FILEPATH_REGISTRY.get(destination_directory), filepath)
        KerasPersistenceMethods.save_model(obj, full_path, save_format="h5")
        return {"filepath": filepath, "source_directory": destination_directory}
Example #6
    def serialize(
        obj: ddDataFrame,
        filepath: str,
        format_directory: str = ORC_DIRECTORY,
        format_extension: str = ".orc",
        destination_directory: str = "system_temp",
        **kwargs,
    ) -> Dict[str, str]:

        # Append the filepath to the storage directory
        filepath = join(format_directory, filepath + format_extension)
        full_path = join(FILEPATH_REGISTRY.get(destination_directory),
                         filepath)
        DaskPersistenceMethods.to_orc(obj, full_path)
        return {
            "filepath": filepath,
            "source_directory": destination_directory
        }
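A plain-Dask equivalent of the ORC write, plus the read back, assuming a recent Dask with pyarrow installed (paths are made up):

    import glob

    import dask.dataframe as dd
    import pandas as pd

    ddf = dd.from_pandas(pd.DataFrame({"a": range(10)}), npartitions=2)
    ddf.to_orc("/tmp/orc-demo")  # writes one ORC file per partition
    restored = dd.read_orc(sorted(glob.glob("/tmp/orc-demo/*.orc")))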
Example #7
    def serialize(
        obj: Any,
        filepath: str,
        format_directory: str = PICKLE_DIRECTORY,
        format_extension: str = ".pkl",
        destination_directory: str = "system_temp",
        **kwargs,
    ) -> Dict[str, str]:

        # Append the filepath to the pickle storage directory
        filepath = join(format_directory, filepath + format_extension)
        full_path = join(FILEPATH_REGISTRY.get(destination_directory), filepath)
        # make sure the directory exists
        makedirs(dirname(full_path), exist_ok=True)

        PicklePersistenceMethods.dump_object(obj, full_path)

        return {"filepath": filepath, "source_directory": destination_directory}
Example #8
 def deserialize(
     filepath: str, source_directory: str = "system_temp", **kwargs
 ) -> Dict[str, Any]:
     full_path = join(FILEPATH_REGISTRY.get(source_directory), filepath)
     return {"obj": KerasPersistenceMethods.load_model(full_path)}
Example #9
 def deserialize(filepath: str,
                 source_directory: str = "system_temp",
                 **kwargs) -> Dict[str, pd.DataFrame]:
     full_path = join(FILEPATH_REGISTRY.get(source_directory), filepath)
     return {"obj": PandasPersistenceMethods.read_json(full_path)}
Example #10
 def get_regression_artifact(self, persistable_type, **filters):
     with FILEPATH_REGISTRY.context_register("filestore", ARTIFACTS_PATH):
         persistable = getattr(PersistableLoader,
                               f"load_{persistable_type}")(**filters)
         persistable.load_external_files()
         return persistable
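context_register appears to temporarily override a registered path and restore it on exit. A hypothetical minimal registry with those semantics (the class and its internals are invented, not SimpleML's implementation):

    from contextlib import contextmanager

    class PathRegistry:
        def __init__(self):
            self._paths = {}

        def register(self, name, path):
            self._paths[name] = path

        def get(self, name):
            return self._paths[name]

        @contextmanager
        def context_register(self, name, path):
            # override for the duration of the block, then restore
            previous = self._paths.get(name)
            self.register(name, path)
            try:
                yield self
            finally:
                if previous is None:
                    self._paths.pop(name, None)
                else:
                    self._paths[name] = previous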
Example #11
LIBCLOUD_ROOT_PATH = ""
LIBCLOUD_CONFIG_SECTION = None

# Reference paths
PICKLE_DIRECTORY = "pickle/"
HDF5_DIRECTORY = "HDF5/"
PARQUET_DIRECTORY = "parquet/"
CSV_DIRECTORY = "csv/"
ORC_DIRECTORY = "orc/"
JSON_DIRECTORY = "json/"
TENSORFLOW_SAVED_MODEL_DIRECTORY = "saved_model/"
FILESTORE_DIRECTORY = os.path.join(SIMPLEML_DIRECTORY, "filestore/")
SYSTEM_TEMP_DIRECTORY = tempfile.gettempdir()

# register paths for consistent reference
FILEPATH_REGISTRY.register("filestore", FILESTORE_DIRECTORY)
FILEPATH_REGISTRY.register("system_temp", SYSTEM_TEMP_DIRECTORY)
FILEPATH_REGISTRY.register("libcloud_root_path", LIBCLOUD_ROOT_PATH)


# Create Paths if they don't exist - use try/excepts to catch race conditions
def safe_makedirs(dir_path):
    try:
        os.makedirs(dir_path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise


if not os.path.exists(SIMPLEML_DIRECTORY):
    safe_makedirs(SIMPLEML_DIRECTORY)
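On Python 3.2+, the same race-safe behavior is available directly from the standard library:

    os.makedirs(SIMPLEML_DIRECTORY, exist_ok=True)  # no error if it exists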