def download_raw_dataset(self):
    """Fetch every raw dataset file listed in ``self.download_url`` into the cache.

    Each URL's final path segment is used as the local filename inside the
    staging directory provided by ``upload_output_directory``.
    """
    with upload_output_directory(self.raw_dataset_path) as (staging_dir, _):
        for file_url in self.download_url:
            # Local name mirrors the last component of the remote URL.
            target_name = file_url.split("/")[-1]
            remote_fs, _ = get_fs_and_path(file_url)
            remote_fs.get(file_url, os.path.join(staging_dir, target_name), recursive=True)
def download_artifacts(
    bench_config: Dict[str, Any],
    base_experiment: str,
    experimental_experiment: str,
    download_base_path: str,
) -> List[Union[Tuple[str, str], Any]]:
    """Download benchmarking artifacts for two experiments.

    bench_config: bench config file. Can be the same one that was used to run
        these experiments.
    base_experiment: name of the experiment we're comparing against.
    experimental_experiment: name of the experiment we're comparing.
    download_base_path: base path under which live the stored artifacts of the
        benchmarking experiments.

    Returns the list of results from the per-experiment downloads; entries may
    be exceptions because the downloads are gathered with
    ``return_exceptions=True``.
    """
    fs, _ = get_fs_and_path(download_base_path)
    local_dir = os.path.join(os.getcwd(), "visualize-temp")
    os.makedirs(local_dir, exist_ok=True)

    # One download coroutine per (dataset, experiment) pair.
    coroutines = []
    for experiment in bench_config["datasets"]:
        dataset_name = experiment["dataset_name"]
        for experiment_name in [base_experiment, experimental_experiment]:
            coroutines.append(download_one(fs, download_base_path, dataset_name, experiment_name, local_dir))

    async def _gather_all():
        # return_exceptions=True: one failed download must not cancel the rest.
        return await asyncio.gather(*coroutines, return_exceptions=True)

    # asyncio.run creates and tears down its own event loop. The previous
    # get_event_loop()/loop.close() pattern is deprecated since Python 3.10
    # and closed a loop that other code in the process might still need.
    return asyncio.run(_gather_all())
def create_file(url):
    """Create an empty local file mirroring *url*'s path and return that path.

    The file is created inside a temporary directory that is deleted on exit,
    so only the returned relative ``path`` string survives — presumably the
    caller uses it as a key/destination for a later upload (TODO confirm
    against callers).
    """
    _, path = get_fs_and_path(url)
    logging.info(f"saving url '{url}' to path '{path}'")
    with tempfile.TemporaryDirectory() as tmpdir:
        file_path = os.path.join(tmpdir, path)
        # exist_ok=True: when `path` has no directory component, dirname()
        # resolves to tmpdir itself, which already exists — the previous bare
        # makedirs() raised FileExistsError in that case.
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        # Touch the file (opened for write, immediately closed).
        with open(file_path, "w"):
            pass
    return path
def export_artifacts(experiment: Dict[str, str], report_path: str, experiment_output_directory: str, export_base_path: str) -> None:
    """Save the experiment artifacts to the `bench_export_directory`.

    experiment: experiment dict that contains "dataset_name" (e.g. ames_housing),
        "experiment_name" (specified by user), and "config_path" (path to experiment
        config. Relative to ludwig/benchmarks/configs).
    report_path: path where the experiment metrics report is saved.
    experiment_output_directory: path where the model, data, and logs of the
        experiment are saved.
    export_base_path: remote or local path (directory) where artifacts are exported.
        (e.g. s3://benchmarking.us-west-2.ludwig.com/bench/ or your/local/bench/)
    """
    fs, _ = get_fs_and_path(export_base_path)
    try:
        export_full_path = os.path.join(export_base_path, experiment["dataset_name"], experiment["experiment_name"])
        # Metrics report, experiment config, and model hyperparameters.
        fs.put(report_path, os.path.join(export_full_path, REPORT_JSON), recursive=True)
        fs.put(
            os.path.join("configs", experiment["config_path"]),
            os.path.join(export_full_path, CONFIG_YAML),
            recursive=True,
        )
        fs.put(
            os.path.join(experiment["dataset_name"], EXPERIMENT_RUN, "model", MODEL_HYPERPARAMETERS_FILE_NAME),
            os.path.join(export_full_path, MODEL_HYPERPARAMETERS_FILE_NAME),
            recursive=True,
        )

        # zip experiment directory to export
        archive_path = None
        try:
            # make_archive returns the path of the archive it created; use it
            # instead of assuming "artifacts.zip" in the current directory.
            archive_path = shutil.make_archive("artifacts", "zip", experiment_output_directory)
            fs.put(archive_path, os.path.join(export_full_path, "artifacts.zip"), recursive=True)
        except Exception as e:
            logging.error(
                f"Couldn't export '{experiment_output_directory}' to bucket")
            logging.error(e)
        finally:
            # Always remove the local zip — previously it was leaked whenever
            # the upload raised, since the remove only ran on success.
            if archive_path and os.path.exists(archive_path):
                os.remove(archive_path)

        print("Uploaded metrics report and experiment config to\n\t", export_full_path)
    except ClientError as e:
        logging.error(translate_boto_error(e))
def read_remote_parquet(path: str):
    """Read a (possibly remote) parquet file via an fsspec-backed pyarrow filesystem."""
    filesystem, resolved_path = get_fs_and_path(path)
    # Wrap the fsspec filesystem so pyarrow's reader can use it directly.
    arrow_fs = PyFileSystem(FSSpecHandler(filesystem))
    return read_parquet(resolved_path, filesystem=arrow_fs)