def test_unexpected_params() -> None:
    config = {"type": "noop", "base_path": "test", "require": "value", "optional": "test"}
    with pytest.raises(TypeError, match="unexpected keyword argument 'require'"):
        storage.build(config)

def to_delete(request: Any, config: Dict[str, Any]) -> List[Dict[str, Any]]:
    manager = storage.build(config["checkpoint_storage"])
    metadata = [manager.store(StorableFixture()) for _ in range(request.param)]

    host_path = config["checkpoint_storage"]["host_path"]
    assert len(os.listdir(host_path)) == request.param

    return [simplejson.loads(util.json_encode(m)) for m in metadata]

def download(master: str, trial_id: int, step_id: int, output_dir: str) -> None:
    q = api.GraphQLQuery(master)
    step = q.op.steps_by_pk(trial_id=trial_id, id=step_id)
    step.checkpoint.labels()
    step.checkpoint.resources()
    step.checkpoint.uuid()
    step.trial.experiment.config(path="checkpoint_storage")
    step.trial.experiment_id()

    resp = q.send()

    step = resp.steps_by_pk
    if not step:
        raise ValueError("Trial {} step {} not found".format(trial_id, step_id))
    if not step.checkpoint:
        raise ValueError("Trial {} step {} has no checkpoint".format(trial_id, step_id))

    storage_config = step.trial.experiment.config
    manager = storage.build(storage_config)
    if not (
        isinstance(manager, storage.S3StorageManager)
        or isinstance(manager, storage.GCSStorageManager)
    ):
        raise AssertionError(
            "Downloading from S3 or GCS requires the experiment to be configured with "
            "S3 or GCS checkpointing, {} found instead".format(storage_config["type"])
        )

    metadata = storage.StorageMetadata.from_json(step.checkpoint.__to_json_value__())
    manager.download(metadata, output_dir)

def build_and_run_training_pipeline(env: det.EnvContext) -> None:

    # Create the socket manager. The socket manager will connect to the master and read messages
    # until it receives the rendezvous_info.
    #
    # TODO(ryan): Pull profiler hooks out of SocketManager and into their own layer.
    with layers.SocketManager(env) as socket_mgr:

        # Create the storage manager. This is used to download the initial checkpoint here in
        # build_training_pipeline and also used by the workload manager to create and store
        # checkpoints during training.
        storage_mgr = storage.build(env.experiment_config["checkpoint_storage"])

        [tensorboard_mgr, tensorboard_writer] = load.prepare_tensorboard(env)

        # Create the workload manager. The workload manager will receive workloads from the
        # socket_mgr, and augment them with some additional arguments. Additionally, the
        # workload manager is responsible for some generic workload hooks for things like timing
        # workloads, preparing checkpoints, and uploading completed checkpoints. Finally, the
        # workload manager does some sanity checks on response messages that originate from the
        # trial.
        #
        # TODO(ryan): Refactor WorkloadManager into separate layers that do each separate task.
        workload_mgr = layers.build_workload_manager(
            env,
            iter(socket_mgr),
            socket_mgr.get_rendezvous_info(),
            storage_mgr,
            tensorboard_mgr,
            tensorboard_writer,
        )

        hvd_config = horovod.HorovodContext.from_configs(
            env.experiment_config, socket_mgr.get_rendezvous_info(), env.hparams
        )
        logging.info(f"Horovod config: {hvd_config.__dict__}.")

        # Load the checkpoint, if necessary. Any possible sinks to this pipeline will need access
        # to this checkpoint.
        with maybe_load_checkpoint(storage_mgr, env.latest_checkpoint) as load_path:

            # Horovod distributed training is done inside subprocesses.
            if hvd_config.use:
                subproc = layers.SubprocessLauncher(
                    env, iter(workload_mgr), load_path, socket_mgr.get_rendezvous_info(), hvd_config
                )
                subproc.run()
            else:
                if env.experiment_config.debug_enabled():
                    faulthandler.dump_traceback_later(30, repeat=True)

                controller = load.prepare_controller(
                    env,
                    iter(workload_mgr),
                    load_path,
                    socket_mgr.get_rendezvous_info(),
                    hvd_config,
                )
                controller.run()

def test_setting_optional_variable() -> None:
    config = {"type": "noop", "base_path": "test", "required": "value", "optional": "test"}
    manager = storage.build(config)
    assert isinstance(manager, NoopStorageManager)
    assert manager.required == "value"
    assert manager.optional == "test"

def main(argv: List[str]) -> None:
    parser = argparse.ArgumentParser(description="Determined checkpoint GC")

    parser.add_argument(
        "--version",
        action="version",
        version="Determined checkpoint GC, version {}".format(det.__version__),
    )
    parser.add_argument(
        "--log-level",
        default=os.getenv("DET_LOG_LEVEL", "INFO"),
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        help="Set the logging level",
    )
    parser.add_argument(
        "--experiment-config",
        type=json_file_arg,
        default=os.getenv("DET_EXPERIMENT_CONFIG", {}),
        help="Experiment config (JSON-formatted file)",
    )
    parser.add_argument(
        "--delete",
        type=json_file_arg,
        default=os.getenv("DET_DELETE", []),
        help="Checkpoints to delete (JSON-formatted file)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        default=("DET_DRY_RUN" in os.environ),
        help="Do not actually delete any checkpoints from storage",
    )
    args = parser.parse_args(argv)

    logging.basicConfig(
        level=args.log_level, format="%(asctime)s:%(module)s:%(levelname)s: %(message)s"
    )
    logging.info("Determined checkpoint GC, version {}".format(det.__version__))

    storage_config = args.experiment_config["checkpoint_storage"]
    logging.info("Using checkpoint storage: {}".format(storage_config))

    manager = storage.build(storage_config, container_path=constants.SHARED_FS_CONTAINER_PATH)
    delete_checkpoints(manager, args.delete["checkpoints"], dry_run=args.dry_run)

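The entry point above is driven entirely by the flags it registers. A minimal invocation sketch, assuming the two JSON files exist and match whatever layout json_file_arg expects (the file names here are placeholders, not taken from the source):

# Hypothetical invocation of the checkpoint-GC entry point; file names are
# placeholders and must follow the formats expected by json_file_arg.
main(
    [
        "--log-level", "DEBUG",
        "--experiment-config", "experiment_config.json",
        "--delete", "checkpoints_to_delete.json",
        "--dry-run",
    ]
)
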
def download(self, path: Optional[str] = None) -> str:
    """
    Download the checkpoint from checkpoint storage to a local directory.

    Arguments:
        path (string, optional): Top level directory to place the
            checkpoint under. If this parameter is not set, the checkpoint
            will be downloaded to ``checkpoints/<checkpoint_uuid>`` relative
            to the current working directory.
    """
    if path is not None:
        local_ckpt_dir = pathlib.Path(path)
    else:
        local_ckpt_dir = pathlib.Path("checkpoints", self.uuid)

    # If the target directory doesn't already appear to contain a
    # checkpoint, attempt to fetch one.
    #
    # We used MLflow's MLmodel checkpoint format in the past for
    # serializing pytorch models. We now use our own format that contains a
    # metadata.json file. We are checking for checkpoint existence by
    # looking for both checkpoint formats in the output directory.
    potential_metadata_paths = [
        local_ckpt_dir.joinpath(f) for f in ["metadata.json", "MLmodel"]
    ]
    if not any(p.exists() for p in potential_metadata_paths):
        if self.storage_config["type"] == "shared_fs":
            src_ckpt_dir = self._find_shared_fs_path()
            shutil.copytree(str(src_ckpt_dir), str(local_ckpt_dir))
        else:
            local_ckpt_dir.mkdir(parents=True, exist_ok=True)
            manager = storage.build(self.storage_config)
            if not isinstance(manager, (storage.S3StorageManager, storage.GCSStorageManager)):
                raise AssertionError(
                    "Downloading from S3 or GCS requires the experiment to be configured with "
                    "S3 or GCS checkpointing, {} found instead".format(self.storage_config["type"])
                )

            metadata = storage.StorageMetadata.from_json(
                {"uuid": self.uuid, "resources": self.resources}
            )
            manager.download(metadata, str(local_ckpt_dir))

    return str(local_ckpt_dir)

def run_gc_checkpoints_test(checkpoint_storage: Dict[str, str]) -> None:
    fixtures = [
        (
            conf.fixtures_path("no_op/gc_checkpoints_decreasing.yaml"),
            {"COMPLETED": {8, 9, 10}, "DELETED": {1, 2, 3, 4, 5, 6, 7}},
        ),
        (
            conf.fixtures_path("no_op/gc_checkpoints_increasing.yaml"),
            {"COMPLETED": {1, 2, 3, 9, 10}, "DELETED": {4, 5, 6, 7, 8}},
        ),
    ]

    all_checkpoints = []
    for base_conf_path, result in fixtures:
        config = conf.load_config(str(base_conf_path))
        config["checkpoint_storage"].update(checkpoint_storage)

        with tempfile.NamedTemporaryFile() as tf:
            with open(tf.name, "w") as f:
                yaml.dump(config, f)
            experiment_id = exp.create_experiment(tf.name, conf.fixtures_path("no_op"))

        exp.wait_for_experiment_state(experiment_id, "COMPLETED")

        # Checkpoints are not marked as deleted until the gc_checkpoint task starts.
        retries = 5
        for retry in range(retries):
            trials = exp.experiment_trials(experiment_id)
            assert len(trials) == 1

            checkpoints = sorted(
                (step["checkpoint"] for step in trials[0]["steps"]),
                key=operator.itemgetter("step_id"),
            )
            assert len(checkpoints) == 10

            by_state = {}  # type: Dict[str, Set[int]]
            for checkpoint in checkpoints:
                by_state.setdefault(checkpoint["state"], set()).add(checkpoint["step_id"])

            if by_state == result:
                all_checkpoints.append((config, checkpoints))
                break

            if retry + 1 == retries:
                assert by_state == result

            time.sleep(1)

    # Check that the actual checkpoint storage (for shared_fs) reflects the
    # deletions. We want to wait for the GC containers to exit, so check
    # repeatedly with a timeout.
    max_checks = 30
    for i in range(max_checks):
        time.sleep(1)
        try:
            for config, checkpoints in all_checkpoints:
                checkpoint_config = config["checkpoint_storage"]

                if checkpoint_config["type"] == "shared_fs":
                    deleted_exception = check.CheckFailedError
                elif checkpoint_config["type"] == "s3":
                    deleted_exception = botocore.exceptions.ClientError
                else:
                    raise NotImplementedError(
                        f'unsupported storage type {checkpoint_config["type"]}'
                    )

                storage_manager = storage.build(checkpoint_config, container_path=None)
                for checkpoint in checkpoints:
                    metadata = storage.StorageMetadata.from_json(checkpoint)
                    if checkpoint["state"] == "COMPLETED":
                        with storage_manager.restore_path(metadata):
                            pass
                    elif checkpoint["state"] == "DELETED":
                        try:
                            with storage_manager.restore_path(metadata):
                                raise AssertionError("checkpoint not deleted")
                        except deleted_exception:
                            pass
        except AssertionError:
            if i == max_checks - 1:
                raise
        else:
            break

def test_unknown_type() -> None:
    config = {"type": "unknown"}
    with pytest.raises(TypeError, match="Unknown storage type: unknown"):
        storage.build(config)

def test_build_with_container_path() -> None:
    config = {"type": "shared_fs", "host_path": "/host_path", "storage_path": "storage_path"}

    manager = storage.build(config)
    assert manager._base_path == "/host_path/storage_path"

    manager = storage.build(config, container_path="/container_path")
    assert manager._base_path == "/container_path/storage_path"

def test_illegal_type() -> None:
    config = {"type": 4}
    with pytest.raises(CheckFailedError, match="must be a string"):
        storage.build(config)

def test_missing_type() -> None:
    with pytest.raises(CheckFailedError, match="Missing 'type' parameter"):
        storage.build({})

def download(self, path: Optional[str] = None) -> str:
    """
    Download checkpoint to local storage.

    Arguments:
        path (string, optional): Top level directory to place the
            checkpoint under. If this parameter is not set, the checkpoint
            will be downloaded to ``checkpoints/<checkpoint_uuid>`` relative
            to the current working directory.
    """
    if path is not None:
        local_ckpt_dir = pathlib.Path(path)
    else:
        local_ckpt_dir = pathlib.Path("checkpoints", self.uuid)

    # Backward compatibility: we used MLflow's MLmodel checkpoint format for
    # serializing pytorch models. We now use our own format that contains a
    # metadata.json file. We are checking for checkpoint existence by
    # looking for both checkpoint formats in the output directory.
    potential_metadata_paths = [
        local_ckpt_dir.joinpath(f) for f in ["metadata.json", "MLmodel"]
    ]
    if not any(p.exists() for p in potential_metadata_paths):
        # If the target directory doesn't already appear to contain a
        # checkpoint, attempt to fetch one.
        if self.experiment_config["checkpoint_storage"]["type"] == "shared_fs":
            src_ckpt_dir = self._find_shared_fs_path()
            shutil.copytree(str(src_ckpt_dir), str(local_ckpt_dir))
        else:
            local_ckpt_dir.mkdir(parents=True, exist_ok=True)
            manager = storage.build(
                self.experiment_config["checkpoint_storage"],
                container_path=None,
            )
            if not isinstance(manager, (storage.S3StorageManager, storage.GCSStorageManager)):
                raise AssertionError(
                    "Downloading from S3 or GCS requires the experiment to be configured with "
                    "S3 or GCS checkpointing, {} found instead".format(
                        self.experiment_config["checkpoint_storage"]["type"]
                    )
                )

            metadata = storage.StorageMetadata.from_json(
                {"uuid": self.uuid, "resources": self.resources}
            )
            manager.download(metadata, str(local_ckpt_dir))

    if not local_ckpt_dir.joinpath("metadata.json").exists():
        with open(local_ckpt_dir.joinpath("metadata.json"), "w") as f:
            json.dump(
                {
                    "determined_version": self.determined_version,
                    "framework": self.framework,
                    "format": self.format,
                    "experiment_id": self.experiment_id,
                    "trial_id": self.trial_id,
                    "hparams": self.hparams,
                    "experiment_config": self.experiment_config,
                    "metadata": self.metadata,
                },
                f,
                indent=2,
            )

    return str(local_ckpt_dir)

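As a usage sketch only (the ckpt handle is hypothetical and stands in for a Checkpoint object obtained from the surrounding API), download() always returns the local checkpoint directory as a string:

# Hypothetical usage; `ckpt` is a Checkpoint object obtained elsewhere.
default_dir = ckpt.download()                    # -> "checkpoints/<checkpoint_uuid>"
custom_dir = ckpt.download(path="/tmp/my_ckpt")  # -> "/tmp/my_ckpt"
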
def test_missing_required_variable() -> None:
    config = {"type": "noop", "base_path": "test"}
    with pytest.raises(TypeError, match="missing 1 required positional argument: 'required'"):
        storage.build(config)

def test_getting_manager_instance() -> None:
    config = {"type": "noop", "base_path": "test", "required": "value"}
    manager = storage.build(config)
    assert isinstance(manager, NoopStorageManager)
    assert manager.required == "value"
    assert manager.optional == "default"

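The unit tests above exercise only storage.build's type dispatch and argument handling; the noop fixture itself is not shown here. A minimal sketch consistent with these assertions might look like the following, assuming a StorageManager base class whose constructor takes base_path (both the base class and its signature are assumptions, not taken from the snippets above):

# Hypothetical test fixture; the real NoopStorageManager lives in the test
# suite and may differ. The StorageManager base class and its constructor
# signature are assumed.
class NoopStorageManager(storage.StorageManager):
    def __init__(self, base_path: str, required: str, optional: str = "default") -> None:
        super().__init__(base_path)
        self.required = required
        self.optional = optional
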
def list(args: Namespace) -> None:
    q = api.GraphQLQuery(args.master)
    q.op.experiments_by_pk(id=args.experiment_id).config(path="checkpoint_storage")

    order_by = [
        gql.checkpoints_order_by(
            validation=gql.validations_order_by(
                metric_values=gql.validation_metrics_order_by(signed=gql.order_by.asc)
            )
        )
    ]

    limit = None
    if args.best is not None:
        if args.best < 0:
            raise AssertionError("--best must be a non-negative integer")
        limit = args.best

    checkpoints = q.op.checkpoints(
        where=gql.checkpoints_bool_exp(
            step=gql.steps_bool_exp(
                trial=gql.trials_bool_exp(
                    experiment_id=gql.Int_comparison_exp(_eq=args.experiment_id)
                )
            )
        ),
        order_by=order_by,
        limit=limit,
    )
    checkpoints.end_time()
    checkpoints.labels()
    checkpoints.resources()
    checkpoints.start_time()
    checkpoints.state()
    checkpoints.step_id()
    checkpoints.trial_id()
    checkpoints.uuid()
    checkpoints.step.validation.metric_values.raw()

    resp = q.send()
    config = resp.experiments_by_pk.config

    headers = ["Trial ID", "Step ID", "State", "Validation Metric", "UUID", "Resources", "Size"]
    values = [
        [
            c.trial_id,
            c.step_id,
            c.state,
            c.step.validation.metric_values.raw
            if c.step.validation and c.step.validation.metric_values
            else None,
            c.uuid,
            render.format_resources(c.resources),
            render.format_resource_sizes(c.resources),
        ]
        for c in resp.checkpoints
    ]
    render.tabulate_or_csv(headers, values, args.csv)

    if args.download_dir is not None:
        manager = storage.build(config)
        if not (
            isinstance(manager, storage.S3StorageManager)
            or isinstance(manager, storage.GCSStorageManager)
        ):
            print(
                "Downloading from S3 or GCS requires the experiment to be configured with "
                "S3 or GCS checkpointing, {} found instead".format(config["type"])
            )
            sys.exit(1)

        for checkpoint in resp.checkpoints:
            metadata = storage.StorageMetadata.from_json(checkpoint.__to_json_value__())
            ckpt_dir = args.download_dir.joinpath(
                "exp-{}-trial-{}-step-{}".format(
                    args.experiment_id, checkpoint.trial_id, checkpoint.step_id
                )
            )
            print("Downloading checkpoint {} to {}".format(checkpoint.uuid, ckpt_dir))
            manager.download(metadata, ckpt_dir)