def test_build_with_container_path() -> None:
    config = {"type": "shared_fs", "host_path": "/host_path", "storage_path": "storage_path"}

    manager = storage.build(config, container_path=None)
    assert manager._base_path == "/host_path/storage_path"

    manager = storage.build(config, container_path="/container_path")
    assert manager._base_path == "/container_path/storage_path"
def main(argv: List[str]) -> None:
    parser = argparse.ArgumentParser(description="Determined checkpoint GC")

    parser.add_argument(
        "--version",
        action="version",
        version="Determined checkpoint GC, version {}".format(det.__version__),
    )
    parser.add_argument("--experiment-id", help="The experiment ID to run the GC job for")
    parser.add_argument(
        "--log-level",
        default=os.getenv("DET_LOG_LEVEL", "INFO"),
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        help="Set the logging level",
    )
    parser.add_argument(
        "--storage-config",
        type=json_file_arg,
        default=os.getenv("DET_STORAGE_CONFIG", {}),
        help="Storage config (JSON-formatted file)",
    )
    parser.add_argument(
        "--delete",
        type=json_file_arg,
        default=os.getenv("DET_DELETE", []),
        help="Checkpoints to delete (JSON-formatted file)",
    )
    parser.add_argument(
        "--delete-tensorboards",
        action="store_true",
        default=os.getenv("DET_DELETE_TENSORBOARDS", False),
        help="Delete Tensorboards from storage",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        default=("DET_DRY_RUN" in os.environ),
        help="Do not actually delete any checkpoints from storage",
    )
    args = parser.parse_args(argv)

    logging.basicConfig(
        level=args.log_level, format="%(asctime)s:%(module)s:%(levelname)s: %(message)s"
    )
    logging.info("Determined checkpoint GC, version {}".format(det.__version__))

    storage_config = args.storage_config
    logging.info("Using checkpoint storage: {}".format(storage_config))

    manager = storage.build(storage_config, container_path=constants.SHARED_FS_CONTAINER_PATH)

    storage_ids = [c["uuid"] for c in args.delete["checkpoints"]]
    delete_checkpoints(manager, storage_ids, dry_run=args.dry_run)

    if args.delete_tensorboards:
        tb_manager = tensorboard.build(
            os.environ["DET_CLUSTER_ID"],
            args.experiment_id,
            None,
            storage_config,
            container_path=constants.SHARED_FS_CONTAINER_PATH,
        )
        delete_tensorboards(tb_manager, dry_run=args.dry_run)
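A hedged usage sketch of the GC entry point above: ``main`` can be driven directly with an argv list, and the ``--delete`` file is expected to contain a JSON object with a ``checkpoints`` list of ``{"uuid": ...}`` entries (see ``args.delete["checkpoints"]``). The experiment ID and file paths below are placeholders, not values from the source.

# Hedged example invocation of the checkpoint GC entry point. The experiment ID and
# file paths are placeholders; --storage-config and --delete must point at JSON files,
# and the --delete file must contain {"checkpoints": [{"uuid": "..."}], ...}.
main(
    [
        "--experiment-id", "42",
        "--storage-config", "/tmp/storage_config.json",
        "--delete", "/tmp/checkpoints_to_delete.json",
        "--dry-run",
    ]
)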
def build_and_run_training_pipeline(env: det.EnvContext) -> None:
    # Create the socket manager. The socket manager will connect to the master and read messages
    # until it receives the rendezvous_info.
    #
    # TODO(ryan): Pull profiler hooks out of SocketManager and into their own layer.
    with layers.SocketManager(env) as socket_mgr:

        # Create the storage manager. This is used to download the initial checkpoint here in
        # build_training_pipeline and also used by the workload manager to create and store
        # checkpoints during training.
        storage_mgr = storage.build(
            env.experiment_config["checkpoint_storage"],
            container_path=constants.SHARED_FS_CONTAINER_PATH,
        )

        [tensorboard_mgr, tensorboard_writer] = load.prepare_tensorboard(
            env, constants.SHARED_FS_CONTAINER_PATH
        )

        # Create the workload manager. The workload manager will receive workloads from the
        # socket_mgr, and augment them with some additional arguments. Additionally, the
        # workload manager is responsible for some generic workload hooks for things like timing
        # workloads, preparing checkpoints, and uploading completed checkpoints. Finally, the
        # workload manager does some sanity checks on response messages that originate from the
        # trial.
        #
        # TODO(ryan): Refactor WorkloadManager into separate layers that do each separate task.
        workload_mgr = layers.build_workload_manager(
            env,
            iter(socket_mgr),
            socket_mgr.get_rendezvous_info(),
            storage_mgr,
            tensorboard_mgr,
            tensorboard_writer,
        )

        workloads = iter(workload_mgr)

        hvd_config = horovod.HorovodContext.from_configs(
            env.experiment_config, socket_mgr.get_rendezvous_info(), env.hparams
        )
        logging.info(f"Horovod config: {hvd_config.__dict__}.")

        # Load the checkpoint, if necessary. Any possible sinks to this pipeline will need access
        # to this checkpoint.
        with maybe_load_checkpoint(storage_mgr, env.latest_checkpoint) as load_path:

            # Horovod distributed training is done inside subprocesses.
            if hvd_config.use:
                subproc = layers.SubprocessLauncher(
                    env, workloads, load_path, socket_mgr.get_rendezvous_info(), hvd_config
                )
                subproc.run()
            else:
                if env.experiment_config.debug_enabled():
                    faulthandler.dump_traceback_later(30, repeat=True)

                with det._catch_sys_exit():
                    with det._catch_init_invalid_hp(workloads):
                        controller = load.prepare_controller(
                            env,
                            workloads,
                            load_path,
                            socket_mgr.get_rendezvous_info(),
                            hvd_config,
                        )
                    controller.run()
def download(self, path: Optional[str] = None) -> str:
    """
    Download checkpoint to local storage.

    Arguments:
        path (string, optional): Top level directory to place the checkpoint under.
            If this parameter is not set, the checkpoint will be downloaded to
            ``checkpoints/<checkpoint_uuid>`` relative to the current working directory.
    """
    if path is not None:
        local_ckpt_dir = pathlib.Path(path)
    else:
        local_ckpt_dir = pathlib.Path("checkpoints", self.uuid)

    # Backward compatibility: we used MLflow's MLmodel checkpoint format for
    # serializing pytorch models. We now use our own format that contains a
    # metadata.json file. We are checking for checkpoint existence by
    # looking for both checkpoint formats in the output directory.
    potential_metadata_paths = [
        local_ckpt_dir.joinpath(f) for f in ["metadata.json", "MLmodel"]
    ]
    if not any(p.exists() for p in potential_metadata_paths):
        # If the target directory doesn't already appear to contain a
        # checkpoint, attempt to fetch one.
        if self.experiment_config["checkpoint_storage"]["type"] == "shared_fs":
            src_ckpt_dir = self._find_shared_fs_path()
            shutil.copytree(str(src_ckpt_dir), str(local_ckpt_dir))
        else:
            local_ckpt_dir.mkdir(parents=True, exist_ok=True)
            manager = storage.build(
                self.experiment_config["checkpoint_storage"],
                container_path=None,
            )
            if not isinstance(
                manager, (storage.S3StorageManager, storage.GCSStorageManager)
            ):
                raise AssertionError(
                    "Downloading from S3 or GCS requires the experiment to be configured with "
                    "S3 or GCS checkpointing, {} found instead".format(
                        self.experiment_config["checkpoint_storage"]["type"]
                    )
                )

            metadata = storage.StorageMetadata.from_json(
                {"uuid": self.uuid, "resources": self.resources}
            )
            manager.download(metadata, str(local_ckpt_dir))

    if not local_ckpt_dir.joinpath("metadata.json").exists():
        with open(local_ckpt_dir.joinpath("metadata.json"), "w") as f:
            json.dump(
                {
                    "determined_version": self.determined_version,
                    "framework": self.framework,
                    "format": self.format,
                    "experiment_id": self.experiment_id,
                    "trial_id": self.trial_id,
                    "hparams": self.hparams,
                    "experiment_config": self.experiment_config,
                    "metadata": self.metadata,
                },
                f,
                indent=2,
            )

    return str(local_ckpt_dir)
def download(self, path: Optional[str] = None) -> str:
    """
    Download checkpoint to local storage.

    See also:

      - :func:`determined.pytorch.load_trial_from_checkpoint_path`
      - :func:`determined.keras.load_model_from_checkpoint_path`
      - :func:`determined.estimator.load_estimator_from_checkpoint_path`

    Arguments:
        path (string, optional): Top level directory to place the checkpoint under.
            If this parameter is not set, the checkpoint will be downloaded to
            ``checkpoints/<checkpoint_uuid>`` relative to the current working directory.
    """
    if path is not None:
        local_ckpt_dir = pathlib.Path(path)
    else:
        local_ckpt_dir = pathlib.Path("checkpoints", self.uuid)

    # Backward compatibility: we used MLflow's MLmodel checkpoint format for
    # serializing pytorch models. We now use our own format that contains a
    # metadata.json file. We are checking for checkpoint existence by
    # looking for both checkpoint formats in the output directory.
    potential_metadata_paths = [
        local_ckpt_dir.joinpath(f) for f in ["metadata.json", "MLmodel"]
    ]
    if not any(p.exists() for p in potential_metadata_paths):
        # If the target directory doesn't already appear to contain a
        # checkpoint, attempt to fetch one.
        if self.training is None:
            raise NotImplementedError("Non-training checkpoints cannot be downloaded")

        checkpoint_storage = self.training.experiment_config["checkpoint_storage"]
        if checkpoint_storage["type"] == "shared_fs":
            src_ckpt_dir = self._find_shared_fs_path(checkpoint_storage)
            shutil.copytree(str(src_ckpt_dir), str(local_ckpt_dir))
        else:
            local_ckpt_dir.mkdir(parents=True, exist_ok=True)
            manager = storage.build(
                checkpoint_storage,
                container_path=None,
            )
            if not isinstance(
                manager,
                (
                    storage.S3StorageManager,
                    storage.GCSStorageManager,
                    storage.AzureStorageManager,
                ),
            ):
                raise AssertionError(
                    "Downloading from Azure, S3 or GCS requires the experiment to be "
                    "configured with Azure, S3 or GCS checkpointing, {} found instead".format(
                        checkpoint_storage["type"]
                    )
                )

            manager.download(self.uuid, str(local_ckpt_dir))

    # As of v0.18.0, we write metadata.json once at upload time. Checkpoints uploaded prior to
    # 0.18.0 will not have a metadata.json present. Unfortunately, checkpoints earlier than
    # 0.17.7 depended on this file existing in order to be loaded. Therefore, when we detect
    # that the metadata.json file is not present, we write it to make sure those checkpoints
    # can still load.
    metadata_path = local_ckpt_dir.joinpath("metadata.json")
    if not metadata_path.exists():
        self.write_metadata_file(str(metadata_path))

    return str(local_ckpt_dir)
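For context, a minimal sketch of how ``download()`` is typically reached from the Python SDK. The master address and experiment ID below are placeholders, and the availability of ``top_checkpoint()`` on the experiment object is an assumption about the SDK surface rather than something stated in the snippet above.

# Hedged usage sketch: fetch a checkpoint object via the SDK and download it locally.
# The master address and experiment ID are placeholders.
from determined.experimental import client

client.login(master="http://localhost:8080")
checkpoint = client.get_experiment(42).top_checkpoint()
ckpt_dir = checkpoint.download()  # defaults to checkpoints/<checkpoint_uuid>
print(f"checkpoint downloaded to {ckpt_dir}")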
def test_delete_checkpoints() -> None:
    base_conf_path = conf.fixtures_path("no_op/single-default-ckpt.yaml")
    config = conf.load_config(str(base_conf_path))
    config["checkpoint_storage"] = {
        "type": "shared_fs",
        "host_path": "/tmp",
        "storage_path": "delete-checkpoints-e2etest",
        "save_trial_latest": 10,
    }
    config["min_checkpoint_period"] = {"batches": 10}

    exp_id_1 = exp.run_basic_test_with_temp_config(
        config, model_def_path=conf.fixtures_path("no_op"), expected_trials=1
    )
    exp_id_2 = exp.run_basic_test_with_temp_config(
        config, model_def_path=conf.fixtures_path("no_op"), expected_trials=1
    )

    wait_for_gc_to_finish(exp_id_1)
    wait_for_gc_to_finish(exp_id_2)

    test_session = exp.determined_test_session()
    exp_1_checkpoints = bindings.get_GetExperimentCheckpoints(
        session=test_session, id=exp_id_1
    ).checkpoints
    exp_2_checkpoints = bindings.get_GetExperimentCheckpoints(
        session=test_session, id=exp_id_2
    ).checkpoints

    assert len(exp_1_checkpoints) > 0, f"no checkpoints found in experiment with ID:{exp_id_1}"
    assert len(exp_2_checkpoints) > 0, f"no checkpoints found in experiment with ID:{exp_id_2}"

    d_exp_1_checkpoint_uuids = [
        exp_1_checkpoints[d_index].uuid
        for d_index in random.sample(range(len(exp_1_checkpoints)), 2)
    ]
    d_exp_2_checkpoint_uuids = [
        exp_2_checkpoints[d_index].uuid
        for d_index in random.sample(range(len(exp_2_checkpoints)), 2)
    ]

    d_checkpoint_uuids = d_exp_1_checkpoint_uuids + d_exp_2_checkpoint_uuids
    print(f"checkpoint uuids to be deleted: {d_checkpoint_uuids}")

    # Ensure the checkpoint directories exist before requesting deletion.
    checkpoint_config = config["checkpoint_storage"]
    storage_manager = storage.build(checkpoint_config, container_path=None)
    for uuid in d_checkpoint_uuids:
        try:
            with storage_manager.restore_path(uuid):
                pass
        except errors.CheckpointNotFound:
            pytest.fail(f"checkpoint directory with uuid: {uuid} was not created.")

    delete_body = bindings.v1DeleteCheckpointsRequest(checkpointUuids=d_checkpoint_uuids)
    bindings.delete_DeleteCheckpoints(session=test_session, body=delete_body)

    wait_for_gc_to_finish(exp_id_1)
    wait_for_gc_to_finish(exp_id_2)

    for d_c in d_checkpoint_uuids:
        ensure_checkpoint_deleted(test_session, d_c, storage_manager)
def run_gc_checkpoints_test(checkpoint_storage: Dict[str, str]) -> None:
    fixtures = [
        (
            conf.fixtures_path("no_op/gc_checkpoints_decreasing.yaml"),
            {
                (bindings.determinedexperimentv1State.STATE_COMPLETED.value): {800, 900, 1000},
                (bindings.determinedexperimentv1State.STATE_DELETED.value): {
                    100,
                    200,
                    300,
                    400,
                    500,
                    600,
                    700,
                },
            },
        ),
        (
            conf.fixtures_path("no_op/gc_checkpoints_increasing.yaml"),
            {
                (bindings.determinedexperimentv1State.STATE_COMPLETED.value): {
                    100,
                    200,
                    300,
                    900,
                    1000,
                },
                (bindings.determinedexperimentv1State.STATE_DELETED.value): {
                    400,
                    500,
                    600,
                    700,
                    800,
                },
            },
        ),
    ]

    all_checkpoints: List[Tuple[Any, List[bindings.v1CheckpointWorkload]]] = []
    for base_conf_path, result in fixtures:
        config = conf.load_config(str(base_conf_path))
        config["checkpoint_storage"].update(checkpoint_storage)

        with tempfile.NamedTemporaryFile() as tf:
            with open(tf.name, "w") as f:
                yaml.dump(config, f)
            experiment_id = exp.create_experiment(tf.name, conf.fixtures_path("no_op"))

        exp.wait_for_experiment_state(
            experiment_id, bindings.determinedexperimentv1State.STATE_COMPLETED
        )

        # In some configurations, checkpoint GC will run on an auxiliary machine, which may have
        # to be spun up still. So we'll wait for it to run.
        wait_for_gc_to_finish(experiment_id)

        # Checkpoints are not marked as deleted until the gc_checkpoint task starts.
        retries = 5
        for retry in range(retries):
            trials = exp.experiment_trials(experiment_id)
            assert len(trials) == 1

            cpoints = exp.workloads_with_checkpoint(trials[0].workloads)
            sorted_checkpoints = sorted(
                cpoints,
                key=lambda ckp: int(ckp.totalBatches),
            )
            assert len(sorted_checkpoints) == 10
            by_state = {}  # type: Dict[str, Set[int]]
            for ckpt in sorted_checkpoints:
                by_state.setdefault(ckpt.state.value, set()).add(ckpt.totalBatches)

            if by_state == result:
                all_checkpoints.append((config, sorted_checkpoints))
                break

            if retry + 1 == retries:
                assert by_state == result

            time.sleep(1)

    # Check that the actual checkpoint storage (for shared_fs) reflects the
    # deletions. We want to wait for the GC containers to exit, so check
    # repeatedly with a timeout.
    max_checks = 30
    for i in range(max_checks):
        time.sleep(1)
        try:
            storage_states = []
            for config, checkpoints in all_checkpoints:
                checkpoint_config = config["checkpoint_storage"]
                storage_manager = storage.build(checkpoint_config, container_path=None)
                storage_state = {}  # type: Dict[str, Any]
                for checkpoint in checkpoints:
                    assert checkpoint.uuid is not None
                    storage_id = checkpoint.uuid
                    storage_state[storage_id] = {}
                    if checkpoint.state == bindings.determinedcheckpointv1State.STATE_COMPLETED:
                        storage_state[storage_id]["found"] = False
                        try:
                            with storage_manager.restore_path(storage_id):
                                storage_state[storage_id]["found"] = True
                        except errors.CheckpointNotFound:
                            pass
                    elif checkpoint.state == bindings.determinedcheckpointv1State.STATE_DELETED:
                        storage_state[storage_id] = {"deleted": False, "checkpoint": checkpoint}
                        try:
                            with storage_manager.restore_path(storage_id):
                                pass
                        except errors.CheckpointNotFound:
                            storage_state[storage_id]["deleted"] = True
                storage_states.append(storage_state)

            for storage_state in storage_states:
                for state in storage_state.values():
                    if state.get("deleted", None) is False:
                        json_states = json.dumps(storage_states)
                        raise AssertionError(
                            f"Some checkpoints were not deleted: JSON:{json_states}"
                        )
                    if state.get("found", None) is False:
                        json_states = json.dumps(storage_states)
                        raise AssertionError(
                            f"Some checkpoints were not found: JSON:{json_states}"
                        )
        except AssertionError:
            if i == max_checks - 1:
                raise
        else:
            break
def run_gc_checkpoints_test(checkpoint_storage: Dict[str, str]) -> None:
    fixtures = [
        (
            conf.fixtures_path("no_op/gc_checkpoints_decreasing.yaml"),
            {"COMPLETED": {800, 900, 1000}, "DELETED": {100, 200, 300, 400, 500, 600, 700}},
        ),
        (
            conf.fixtures_path("no_op/gc_checkpoints_increasing.yaml"),
            {"COMPLETED": {100, 200, 300, 900, 1000}, "DELETED": {400, 500, 600, 700, 800}},
        ),
    ]

    all_checkpoints = []
    for base_conf_path, result in fixtures:
        config = conf.load_config(str(base_conf_path))
        config["checkpoint_storage"].update(checkpoint_storage)

        with tempfile.NamedTemporaryFile() as tf:
            with open(tf.name, "w") as f:
                yaml.dump(config, f)
            experiment_id = exp.create_experiment(tf.name, conf.fixtures_path("no_op"))

        exp.wait_for_experiment_state(experiment_id, "COMPLETED")

        # Checkpoints are not marked as deleted until the gc_checkpoint task starts.
        retries = 5
        for retry in range(retries):
            trials = exp.experiment_trials(experiment_id)
            assert len(trials) == 1

            checkpoints = sorted(
                (step["checkpoint"] for step in trials[0]["steps"]),
                key=operator.itemgetter("total_batches"),
            )
            assert len(checkpoints) == 10
            by_state = {}  # type: Dict[str, Set[int]]
            for checkpoint in checkpoints:
                by_state.setdefault(checkpoint["state"], set()).add(checkpoint["total_batches"])

            if by_state == result:
                all_checkpoints.append((config, checkpoints))
                break

            if retry + 1 == retries:
                assert by_state == result

            time.sleep(1)

    # Check that the actual checkpoint storage (for shared_fs) reflects the
    # deletions. We want to wait for the GC containers to exit, so check
    # repeatedly with a timeout.
    max_checks = 30
    for i in range(max_checks):
        time.sleep(1)
        try:
            for config, checkpoints in all_checkpoints:
                checkpoint_config = config["checkpoint_storage"]
                if checkpoint_config["type"] == "shared_fs":
                    deleted_exception = check.CheckFailedError
                elif checkpoint_config["type"] == "s3":
                    deleted_exception = botocore.exceptions.ClientError
                else:
                    raise NotImplementedError(
                        f'unsupported storage type {checkpoint_config["type"]}'
                    )

                storage_manager = storage.build(checkpoint_config, container_path=None)
                for checkpoint in checkpoints:
                    metadata = storage.StorageMetadata.from_json(checkpoint)
                    if checkpoint["state"] == "COMPLETED":
                        with storage_manager.restore_path(metadata):
                            pass
                    elif checkpoint["state"] == "DELETED":
                        try:
                            with storage_manager.restore_path(metadata):
                                raise AssertionError("checkpoint not deleted")
                        except deleted_exception:
                            pass
        except AssertionError:
            if i == max_checks - 1:
                raise
        else:
            break
def init(
    *,
    distributed: Optional[core.DistributedContext] = None,
    # TODO: figure out a better way to deal with checkpointing in the local training case.
    storage_manager: Optional[storage.StorageManager] = None,
    preempt_mode: core.PreemptMode = core.PreemptMode.WorkersAskChief,
    tensorboard_mode: core.TensorboardMode = core.TensorboardMode.AUTO,
) -> Context:
    """
    ``core.init()`` builds a :class:`core.Context <determined.core.Context>` for use with the
    Core API.

    Always use ``with core.init() as context`` instead of instantiating a ``core.Context``
    directly. Certain components of the Core API may be configured by passing arguments to
    ``core.init()``. The only arg that is required is a ``DistributedContext``, and even that is
    only required for multi-slot tasks.

    All of your training must occur within the scope of the ``with core.init() as core_context``,
    as there are resources necessary for training which start in the ``core.Context``'s
    ``__enter__`` method and must be cleaned up in its ``__exit__()`` method.

    Arguments:
        distributed (``core.DistributedContext``, optional): Passing a ``DistributedContext`` is
            required for multi-slot training, but unnecessary for single-slot training. Defaults
            to ``None``.
        preempt_mode (``core.PreemptMode``, optional): Configure the calling pattern for the
            ``core_context.preempt.should_preempt()`` method. See
            :class:`~determined.core.PreemptMode` for more detail. Defaults to
            ``WorkersAskChief``.
        storage_manager: Internal use only.
        tensorboard_mode (``core.TensorboardMode``, optional): Define how Tensorboard metrics and
            profiling data are retained. See :class:`~determined.core.TensorboardMode` for more
            detail. Defaults to ``AUTO``.
    """
    info = det.get_cluster_info()
    if info is None:
        return _dummy_init(distributed=distributed, storage_manager=storage_manager)

    # We are on the cluster.
    cert = certs.default_load(info.master_url)
    session = Session(info.master_url, None, None, cert, max_retries=get_max_retries_config())

    if distributed is None:
        if len(info.container_addrs) > 1 or len(info.slot_ids) > 1:
            raise ValueError("you must provide a valid DistributedContext for a multi-slot task")

    distributed = distributed or core.DummyDistributedContext()

    preempt = core.PreemptContext(session, info.allocation_id, distributed, preempt_mode)

    # At present, we only support tensorboards in Trial tasks.
    tbd_writer = None

    train = None
    searcher = None

    if info.task_type == "TRIAL":
        # Prepare the tensorboard hooks.
        tensorboard_manager = tensorboard.build(
            info.cluster_id,
            str(info.trial.experiment_id),
            str(info.trial.trial_id),
            info.trial._config["checkpoint_storage"],
            container_path=constants.SHARED_FS_CONTAINER_PATH,
        )
        if tensorboard_mode == core.TensorboardMode.AUTO:
            tbd_writer = tensorboard.get_metric_writer()

        train = core.TrainContext(
            session,
            info.trial.trial_id,
            info.trial._trial_run_id,
            info.trial.experiment_id,
            distributed,
            tensorboard_mode,
            tensorboard_manager,
            tbd_writer,
        )
        units = core._parse_searcher_units(info.trial._config)
        searcher = core.SearcherContext(
            session,
            distributed,
            info.trial.trial_id,
            info.trial._trial_run_id,
            info.allocation_id,
            units,
        )

        if storage_manager is None:
            storage_manager = storage.build(
                info.trial._config["checkpoint_storage"],
                container_path=constants.SHARED_FS_CONTAINER_PATH,
            )

        checkpoint = core.CheckpointContext(
            distributed,
            storage_manager,
            session,
            info.task_id,
            info.allocation_id,
            tensorboard_mode,
            tensorboard_manager,
        )
    else:
        # TODO: support checkpointing for non-trial tasks.
        if storage_manager is None:
            base_path = appdirs.user_data_dir("determined")
            logger.info(f"no storage_manager provided; storing checkpoints in {base_path}")
            storage_manager = storage.SharedFSStorageManager(base_path)
        checkpoint = core.DummyCheckpointContext(distributed, storage_manager)

    _install_stacktrace_on_sigusr1()

    return Context(
        distributed=distributed,
        checkpoint=checkpoint,
        preempt=preempt,
        train=train,
        searcher=searcher,
    )
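A minimal sketch of the calling pattern the docstring describes, assuming the current Core API surface (in particular ``checkpoint.store_path()`` yielding a path and a storage ID); the metadata dict and the file written into the checkpoint directory are placeholders.

# Hedged usage sketch of core.init(): all work happens inside the context manager,
# and a placeholder checkpoint is stored through the checkpoint context.
import determined as det


def main() -> None:
    with det.core.init() as core_context:
        with core_context.checkpoint.store_path({"steps_completed": 1}) as (path, storage_id):
            (path / "state.txt").write_text("hello")  # placeholder artifact
            print(f"stored checkpoint {storage_id}")


if __name__ == "__main__":
    main()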
def test_unknown_type() -> None:
    config = {"type": "unknown"}
    with pytest.raises(TypeError, match="Unknown storage type: unknown"):
        storage.build(config, container_path=None)
def test_illegal_type() -> None:
    config = {"type": 4}
    with pytest.raises(CheckFailedError, match="must be a string"):
        storage.build(config, container_path=None)
def test_missing_type() -> None:
    with pytest.raises(CheckFailedError, match="Missing 'type' parameter"):
        storage.build({}, container_path=None)
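For contrast with the error-path tests above, a hedged sketch of a successful build. Whether a bare ``{"type": "s3", "bucket": ...}`` config is sufficient depends on the storage backend and credentials available, so treat the minimal config and bucket name as assumptions; ``storage.S3StorageManager`` itself is referenced in the download code earlier in this section.

# Hedged counterpart to the failure cases above: assumes a minimal s3 config with only
# a bucket name is accepted; the bucket name is a placeholder.
def test_build_s3() -> None:
    config = {"type": "s3", "bucket": "my-checkpoint-bucket"}
    manager = storage.build(config, container_path=None)
    assert isinstance(manager, storage.S3StorageManager)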