def test_illegal_type() -> None:
    checkpoint_config = {"type": 4}
    with pytest.raises(TypeError, match="must be a string"):
        env = test_util.get_dummy_env()
        tensorboard.build(
            env.det_cluster_id, env.det_experiment_id, env.det_trial_id, checkpoint_config
        )

def test_unknown_type() -> None:
    checkpoint_config = {
        "type": "unknown",
        "host_path": HOST_PATH,
    }
    with pytest.raises(TypeError, match="Unknown storage type: unknown"):
        tensorboard.build(test_util.get_dummy_env(), checkpoint_config)

def test_s3_build_missing_param() -> None:
    conf = copy.deepcopy(default_conf)
    del conf["bucket"]
    with pytest.raises(KeyError):
        env = test_util.get_dummy_env()
        tensorboard.build(env.det_cluster_id, env.det_experiment_id, env.det_trial_id, conf)

def test_invalid_prefix(monkeypatch: monkeypatch.MonkeyPatch) -> None:
    env = test_util.get_dummy_env()
    conf = copy.deepcopy(default_conf)
    conf["prefix"] = "my/invalid/../prefix"
    with pytest.raises(ValueError):
        tensorboard.build(env.det_cluster_id, env.det_experiment_id, env.det_trial_id, conf)

def test_unknown_type() -> None:
    checkpoint_config = {
        "type": "unknown",
        "host_path": HOST_PATH,
    }
    with pytest.raises(TypeError, match="Unknown storage type: unknown"):
        env = test_util.get_dummy_env()
        tensorboard.build(
            env.det_cluster_id, env.det_experiment_id, env.det_trial_id, checkpoint_config
        )

# Without a parametrize decorator, pytest would treat `prefix` as a missing
# fixture; the values below are illustrative.
@pytest.mark.parametrize("prefix", [None, "my/test/prefix"])
def test_s3_build(prefix: Optional[str]) -> None:
    env = test_util.get_dummy_env()
    conf = copy.deepcopy(default_conf)
    conf["prefix"] = prefix
    manager = tensorboard.build(env.det_cluster_id, env.det_experiment_id, env.det_trial_id, conf)
    assert isinstance(manager, tensorboard.S3TensorboardManager)

def test_getting_manager_instance(tmp_path: pathlib.Path) -> None:
    checkpoint_config = {"type": "shared_fs", "host_path": HOST_PATH}
    env = test_util.get_dummy_env()
    manager = tensorboard.build(
        env.det_cluster_id, env.det_experiment_id, env.det_trial_id, checkpoint_config
    )
    assert isinstance(manager, tensorboard.SharedFSTensorboardManager)

def prepare_tensorboard(
    env: det.EnvContext,
    container_path: Optional[str] = None,
) -> Tuple[tensorboard.TensorboardManager, tensorboard.BatchMetricWriter]:
    tensorboard_mgr = tensorboard.build(
        env.det_cluster_id,
        env.det_experiment_id,
        env.det_trial_id,
        env.experiment_config["checkpoint_storage"],
        container_path,
    )
    try:
        # Prefer the TensorFlow writer; fall back to the PyTorch writer if
        # TensorFlow is not installed.
        from determined.tensorboard.metric_writers import tensorflow

        writer: tensorboard.MetricWriter = tensorflow.TFWriter()
    except ModuleNotFoundError:
        logging.warning("Tensorflow writer not found")
        from determined.tensorboard.metric_writers import pytorch

        writer = pytorch.TorchWriter()

    return (
        tensorboard_mgr,
        tensorboard.BatchMetricWriter(writer),
    )

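# A minimal usage sketch (illustrative, not part of this module): the pair
# returned above is used together -- metrics are reported through the
# BatchMetricWriter, and the TensorboardManager syncs the resulting tfevents
# files to checkpoint storage.
#
#   tensorboard_mgr, batch_writer = prepare_tensorboard(env)
#   ...  # training code reports metrics through batch_writer
#   tensorboard_mgr.sync()
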
def test_getting_manager_instance(tmp_path: pathlib.Path) -> None:
    checkpoint_config = {
        "type": "shared_fs",
        "host_path": HOST_PATH,
        "container_path": tmp_path,
    }
    manager = tensorboard.build(test_util.get_dummy_env(), checkpoint_config)
    assert isinstance(manager, tensorboard.SharedFSTensorboardManager)

def test_s3_faulty_lifecycle(monkeypatch: monkeypatch.MonkeyPatch) -> None:
    monkeypatch.setattr("boto3.client", s3.s3_faulty_client)
    env = test_util.get_dummy_env()
    manager = tensorboard.build(
        env.det_cluster_id, env.det_experiment_id, env.det_trial_id, default_conf
    )
    with pytest.raises(exceptions.S3UploadFailedError):
        manager.sync()

def test_setting_optional_variable(tmp_path: pathlib.Path) -> None:
    checkpoint_config = {
        "type": "shared_fs",
        "base_path": "test_value",
        "host_path": HOST_PATH,
    }
    manager = tensorboard.build(test_util.get_dummy_env(), checkpoint_config)
    assert isinstance(manager, tensorboard.SharedFSTensorboardManager)
    assert manager.base_path == pathlib.Path("test_value/tensorboard")

def test_setting_storage_path(tmp_path: pathlib.Path) -> None:
    checkpoint_config = {
        "type": "shared_fs",
        "host_path": str(HOST_PATH),
        "storage_path": str(STORAGE_PATH),
    }
    manager = tensorboard.build(test_util.get_dummy_env(), checkpoint_config)
    assert isinstance(manager, tensorboard.SharedFSTensorboardManager)
    assert manager.storage_path == STORAGE_PATH

def test_s3_lifecycle(monkeypatch: monkeypatch.MonkeyPatch) -> None:
    monkeypatch.setattr("boto3.client", s3.s3_client)
    manager = tensorboard.build(test_util.get_dummy_env(), default_conf)
    assert isinstance(manager, tensorboard.S3TensorboardManager)

    manager.sync()

    expected = (
        "s3_bucket",
        "uuid-123/tensorboard/experiment/1/trial/1/events.out.tfevents.example",
    )
    assert expected in manager.client.objects

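# The s3.s3_client stub patched in above lives in the test utilities and is
# not shown in this section. A minimal sketch of such a stub, assuming the
# manager uploads through boto3's upload_file and the tests inspect a set of
# (bucket, key) tuples on `.objects`; all names here are hypothetical, and
# Any/Set/Tuple are assumed to be imported from typing.
class _FakeS3Client:
    def __init__(self) -> None:
        self.objects: Set[Tuple[str, str]] = set()

    def upload_file(self, filename: str, bucket: str, key: str) -> None:
        # Record the upload instead of talking to S3.
        self.objects.add((bucket, key))


def _fake_s3_client(*args: Any, **kwargs: Any) -> _FakeS3Client:
    # Stand-in for boto3.client, suitable for monkeypatch.setattr.
    return _FakeS3Client()
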
def test_list_nonexistent_directory(tmp_path: pathlib.Path) -> None:
    base_path = "/non-existent-directory"
    checkpoint_config = {
        "type": "shared_fs",
        "base_path": base_path,
        "host_path": HOST_PATH,
        "container_path": tmp_path,
    }
    manager = tensorboard.build(test_util.get_dummy_env(), checkpoint_config)
    assert not pathlib.Path(base_path).exists()
    assert manager.list_tfevents() == []

def test_list_directory(tmp_path: pathlib.Path) -> None:
    checkpoint_config = {
        "type": "shared_fs",
        "base_path": BASE_PATH,
        "host_path": HOST_PATH,
        "container_path": tmp_path,
    }
    manager = tensorboard.build(test_util.get_dummy_env(), checkpoint_config)
    full_event_path = BASE_PATH.joinpath("tensorboard", "events.out.tfevents.example")
    assert set(manager.list_tfevents()) == {full_event_path}

def test_build_with_container_path(tmp_path: pathlib.Path) -> None:
    checkpoint_config = {
        "type": "shared_fs",
        "host_path": str(HOST_PATH),
        "storage_path": str(STORAGE_PATH),
    }
    env = test_util.get_dummy_env()
    manager = tensorboard.build(
        env.det_cluster_id,
        env.det_experiment_id,
        env.det_trial_id,
        checkpoint_config,
        container_path=str(tmp_path),
    )
    assert isinstance(manager, tensorboard.SharedFSTensorboardManager)
    assert manager.storage_path == tmp_path.joinpath("test_storage_path")

def prepare_tensorboard(
    env: det.EnvContext,
) -> Tuple[tensorboard.TensorboardManager, tensorboard.BatchMetricWriter]:
    tensorboard_mgr = tensorboard.build(env, env.experiment_config["checkpoint_storage"])
    try:
        # Prefer the PyTorch writer; fall back to the TensorFlow writer if
        # PyTorch is not installed.
        from determined.tensorboard.metric_writers import pytorch

        writer: tensorboard.MetricWriter = pytorch.TorchWriter()
    except ImportError:
        logging.warning("PyTorch writer not found")
        from determined.tensorboard.metric_writers import tensorflow

        writer = tensorflow.TFWriter()

    return (
        tensorboard_mgr,
        tensorboard.BatchMetricWriter(writer, env.experiment_config.batches_per_step()),
    )

# Without a parametrize decorator, pytest would treat `prefix` as a missing
# fixture; the values below are illustrative.
@pytest.mark.parametrize("prefix", [None, "my/test/prefix"])
def test_s3_lifecycle(monkeypatch: monkeypatch.MonkeyPatch, prefix: Optional[str]) -> None:
    monkeypatch.setattr("boto3.client", s3.s3_client)
    env = test_util.get_dummy_env()
    conf = copy.deepcopy(default_conf)
    conf["prefix"] = prefix
    manager = tensorboard.build(env.det_cluster_id, env.det_experiment_id, env.det_trial_id, conf)
    assert isinstance(manager, tensorboard.S3TensorboardManager)

    tfevents_path = "uuid-123/tensorboard/experiment/1/trial/1/events.out.tfevents.example"
    manager.sync()

    if prefix is not None:
        # Normalize the prefix and prepend it to the expected object key.
        tfevents_path = os.path.join(os.path.normpath(prefix).lstrip("/"), tfevents_path)

    expected = (
        "s3_bucket",
        tfevents_path,
    )
    assert expected in manager.client.objects

def test_missing_type() -> None:
    with pytest.raises(TypeError, match="Missing 'type' parameter"):
        tensorboard.build(test_util.get_dummy_env(), {})

def test_s3_build_missing_param() -> None:
    conf = copy.deepcopy(default_conf)
    del conf["bucket"]
    with pytest.raises(KeyError):
        tensorboard.build(test_util.get_dummy_env(), conf)

def test_missing_type() -> None:
    with pytest.raises(TypeError, match="Missing 'type' parameter"):
        env = test_util.get_dummy_env()
        tensorboard.build(env.det_cluster_id, env.det_experiment_id, env.det_trial_id, {})

def test_s3_build() -> None:
    manager = tensorboard.build(test_util.get_dummy_env(), default_conf)
    assert isinstance(manager, tensorboard.S3TensorboardManager)

def test_illegal_type() -> None:
    checkpoint_config = {"type": 4}
    with pytest.raises(TypeError, match="must be a string"):
        tensorboard.build(test_util.get_dummy_env(), checkpoint_config)

def main(argv: List[str]) -> None:
    parser = argparse.ArgumentParser(description="Determined checkpoint GC")

    parser.add_argument(
        "--version",
        action="version",
        version="Determined checkpoint GC, version {}".format(det.__version__),
    )
    parser.add_argument("--experiment-id", help="The experiment ID to run the GC job for")
    parser.add_argument(
        "--log-level",
        default=os.getenv("DET_LOG_LEVEL", "INFO"),
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        help="Set the logging level",
    )
    parser.add_argument(
        "--storage-config",
        type=json_file_arg,
        default=os.getenv("DET_STORAGE_CONFIG", {}),
        help="Storage config (JSON-formatted file)",
    )
    parser.add_argument(
        "--delete",
        type=json_file_arg,
        default=os.getenv("DET_DELETE", []),
        help="Checkpoints to delete (JSON-formatted file)",
    )
    parser.add_argument(
        "--delete-tensorboards",
        action="store_true",
        default=os.getenv("DET_DELETE_TENSORBOARDS", False),
        help="Delete Tensorboards from storage",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        default=("DET_DRY_RUN" in os.environ),
        help="Do not actually delete any checkpoints from storage",
    )
    args = parser.parse_args(argv)

    logging.basicConfig(
        level=args.log_level, format="%(asctime)s:%(module)s:%(levelname)s: %(message)s"
    )
    logging.info("Determined checkpoint GC, version {}".format(det.__version__))

    storage_config = args.storage_config
    logging.info("Using checkpoint storage: {}".format(storage_config))

    manager = storage.build(storage_config, container_path=constants.SHARED_FS_CONTAINER_PATH)

    storage_ids = [c["uuid"] for c in args.delete["checkpoints"]]
    delete_checkpoints(manager, storage_ids, dry_run=args.dry_run)

    if args.delete_tensorboards:
        tb_manager = tensorboard.build(
            os.environ["DET_CLUSTER_ID"],
            args.experiment_id,
            None,
            storage_config,
            container_path=constants.SHARED_FS_CONTAINER_PATH,
        )
        delete_tensorboards(tb_manager, dry_run=args.dry_run)

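# json_file_arg is referenced by the parser above but defined elsewhere. A
# minimal sketch of such a helper, assuming it simply parses a JSON-formatted
# file given its path (per the --storage-config/--delete help text); the name
# and body here are illustrative, not the actual implementation, and `json`
# and `Any` are assumed to be imported.
def _json_file_arg_sketch(path: str) -> Any:
    with open(path) as f:
        return json.load(f)
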
def init(
    *,
    distributed: Optional[core.DistributedContext] = None,
    # TODO: figure out a better way to deal with checkpointing in the local training case.
    storage_manager: Optional[storage.StorageManager] = None,
    preempt_mode: core.PreemptMode = core.PreemptMode.WorkersAskChief,
    tensorboard_mode: core.TensorboardMode = core.TensorboardMode.AUTO,
) -> Context:
    """
    ``core.init()`` builds a :class:`core.Context <determined.core.Context>` for use with the
    Core API.

    Always use ``with core.init() as context`` instead of instantiating a ``core.Context``
    directly. Certain components of the Core API may be configured by passing arguments to
    ``core.init()``. The only arg that is required is a ``DistributedContext``, and even that is
    only required for multi-slot tasks.

    All of your training must occur within the scope of the ``with core.init() as core_context``,
    as there are resources necessary for training which start in the ``core.Context``'s
    ``__enter__`` method and must be cleaned up in its ``__exit__()`` method.

    Arguments:
        distributed (``core.DistributedContext``, optional): Passing a ``DistributedContext`` is
            required for multi-slot training, but unnecessary for single-slot training. Defaults
            to ``None``.
        preempt_mode (``core.PreemptMode``, optional): Configure the calling pattern for the
            ``core_context.preempt.should_preempt()`` method. See
            :class:`~determined.core.PreemptMode` for more detail. Defaults to
            ``WorkersAskChief``.
        storage_manager: Internal use only.
        tensorboard_mode (``core.TensorboardMode``, optional): Define how Tensorboard metrics and
            profiling data are retained. See :class:`~determined.core.TensorboardMode` for more
            detail. Defaults to ``AUTO``.
    """
    info = det.get_cluster_info()
    if info is None:
        return _dummy_init(distributed=distributed, storage_manager=storage_manager)

    # We are on the cluster.
    cert = certs.default_load(info.master_url)
    session = Session(info.master_url, None, None, cert, max_retries=get_max_retries_config())

    if distributed is None:
        if len(info.container_addrs) > 1 or len(info.slot_ids) > 1:
            raise ValueError("you must provide a valid DistributedContext for a multi-slot task")

    distributed = distributed or core.DummyDistributedContext()

    preempt = core.PreemptContext(session, info.allocation_id, distributed, preempt_mode)

    # At present, we only support tensorboards in Trial tasks.
    tbd_writer = None

    train = None
    searcher = None

    if info.task_type == "TRIAL":
        # Prepare the tensorboard hooks.
        tensorboard_manager = tensorboard.build(
            info.cluster_id,
            str(info.trial.experiment_id),
            str(info.trial.trial_id),
            info.trial._config["checkpoint_storage"],
            container_path=constants.SHARED_FS_CONTAINER_PATH,
        )
        if tensorboard_mode == core.TensorboardMode.AUTO:
            tbd_writer = tensorboard.get_metric_writer()

        train = core.TrainContext(
            session,
            info.trial.trial_id,
            info.trial._trial_run_id,
            info.trial.experiment_id,
            distributed,
            tensorboard_mode,
            tensorboard_manager,
            tbd_writer,
        )
        units = core._parse_searcher_units(info.trial._config)
        searcher = core.SearcherContext(
            session,
            distributed,
            info.trial.trial_id,
            info.trial._trial_run_id,
            info.allocation_id,
            units,
        )

        if storage_manager is None:
            storage_manager = storage.build(
                info.trial._config["checkpoint_storage"],
                container_path=constants.SHARED_FS_CONTAINER_PATH,
            )

        checkpoint = core.CheckpointContext(
            distributed,
            storage_manager,
            session,
            info.task_id,
            info.allocation_id,
            tensorboard_mode,
            tensorboard_manager,
        )
    else:
        # TODO: support checkpointing for non-trial tasks.
        if storage_manager is None:
            base_path = appdirs.user_data_dir("determined")
            logger.info(f"no storage_manager provided; storing checkpoints in {base_path}")
            storage_manager = storage.SharedFSStorageManager(base_path)
        checkpoint = core.DummyCheckpointContext(distributed, storage_manager)

    _install_stacktrace_on_sigusr1()

    return Context(
        distributed=distributed,
        checkpoint=checkpoint,
        preempt=preempt,
        train=train,
        searcher=searcher,
    )
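

# A minimal usage sketch (illustrative), following the docstring's guidance
# that all training happen inside the context-manager scope:
#
#   import determined as det
#
#   with det.core.init() as core_context:
#       ...  # train, report metrics, checkpoint, and check for preemption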