def test_build_with_container_path() -> None:
    config = {"type": "shared_fs", "host_path": "/host_path", "storage_path": "storage_path"}

    manager = storage.build(config, container_path=None)
    assert manager._base_path == "/host_path/storage_path"

    manager = storage.build(config, container_path="/container_path")
    assert manager._base_path == "/container_path/storage_path"
def main(argv: List[str]) -> None:
    parser = argparse.ArgumentParser(description="Determined checkpoint GC")

    parser.add_argument(
        "--version",
        action="version",
        version="Determined checkpoint GC, version {}".format(det.__version__),
    )
    parser.add_argument("--experiment-id", help="The experiment ID to run the GC job for")
    parser.add_argument(
        "--log-level",
        default=os.getenv("DET_LOG_LEVEL", "INFO"),
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        help="Set the logging level",
    )
    parser.add_argument(
        "--storage-config",
        type=json_file_arg,
        default=os.getenv("DET_STORAGE_CONFIG", {}),
        help="Storage config (JSON-formatted file)",
    )
    parser.add_argument(
        "--delete",
        type=json_file_arg,
        default=os.getenv("DET_DELETE", []),
        help="Checkpoints to delete (JSON-formatted file)",
    )
    parser.add_argument(
        "--delete-tensorboards",
        action="store_true",
        default=os.getenv("DET_DELETE_TENSORBOARDS", False),
        help="Delete Tensorboards from storage",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        default=("DET_DRY_RUN" in os.environ),
        help="Do not actually delete any checkpoints from storage",
    )
    args = parser.parse_args(argv)

    logging.basicConfig(
        level=args.log_level, format="%(asctime)s:%(module)s:%(levelname)s: %(message)s"
    )
    logging.info("Determined checkpoint GC, version {}".format(det.__version__))

    storage_config = args.storage_config
    logging.info("Using checkpoint storage: {}".format(storage_config))

    manager = storage.build(storage_config, container_path=constants.SHARED_FS_CONTAINER_PATH)

    storage_ids = [c["uuid"] for c in args.delete["checkpoints"]]
    delete_checkpoints(manager, storage_ids, dry_run=args.dry_run)

    if args.delete_tensorboards:
        tb_manager = tensorboard.build(
            os.environ["DET_CLUSTER_ID"],
            args.experiment_id,
            None,
            storage_config,
            container_path=constants.SHARED_FS_CONTAINER_PATH,
        )
        delete_tensorboards(tb_manager, dry_run=args.dry_run)
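A hedged usage sketch of the GC entry point above: ``main`` can be driven directly with an argv list, and the ``--delete`` file is expected to contain a JSON object with a ``checkpoints`` list of ``{"uuid": ...}`` entries (see ``args.delete["checkpoints"]``). The experiment ID and file paths below are placeholders, not values from the source.

# Hedged example invocation of the checkpoint GC entry point. The experiment ID and
# file paths are placeholders; --storage-config and --delete must point at JSON files,
# and the --delete file must contain {"checkpoints": [{"uuid": "..."}], ...}.
main(
    [
        "--experiment-id", "42",
        "--storage-config", "/tmp/storage_config.json",
        "--delete", "/tmp/checkpoints_to_delete.json",
        "--dry-run",
    ]
)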
def build_and_run_training_pipeline(env: det.EnvContext) -> None:
    # Create the socket manager. The socket manager will connect to the master and read messages
    # until it receives the rendezvous_info.
    #
    # TODO(ryan): Pull profiler hooks out of SocketManager and into their own layer.
    with layers.SocketManager(env) as socket_mgr:

        # Create the storage manager. This is used to download the initial checkpoint here in
        # build_training_pipeline and also used by the workload manager to create and store
        # checkpoints during training.
        storage_mgr = storage.build(
            env.experiment_config["checkpoint_storage"],
            container_path=constants.SHARED_FS_CONTAINER_PATH,
        )

        [tensorboard_mgr, tensorboard_writer] = load.prepare_tensorboard(
            env, constants.SHARED_FS_CONTAINER_PATH
        )

        # Create the workload manager. The workload manager will receive workloads from the
        # socket_mgr, and augment them with some additional arguments. Additionally, the
        # workload manager is responsible for some generic workload hooks for things like timing
        # workloads, preparing checkpoints, and uploading completed checkpoints. Finally, the
        # workload manager does some sanity checks on response messages that originate from the
        # trial.
        #
        # TODO(ryan): Refactor WorkloadManager into separate layers that do each separate task.
        workload_mgr = layers.build_workload_manager(
            env,
            iter(socket_mgr),
            socket_mgr.get_rendezvous_info(),
            storage_mgr,
            tensorboard_mgr,
            tensorboard_writer,
        )

        workloads = iter(workload_mgr)

        hvd_config = horovod.HorovodContext.from_configs(
            env.experiment_config, socket_mgr.get_rendezvous_info(), env.hparams
        )
        logging.info(f"Horovod config: {hvd_config.__dict__}.")

        # Load the checkpoint, if necessary. Any possible sinks to this pipeline will need access
        # to this checkpoint.
        with maybe_load_checkpoint(storage_mgr, env.latest_checkpoint) as load_path:

            # Horovod distributed training is done inside subprocesses.
            if hvd_config.use:
                subproc = layers.SubprocessLauncher(
                    env, workloads, load_path, socket_mgr.get_rendezvous_info(), hvd_config
                )
                subproc.run()
            else:
                if env.experiment_config.debug_enabled():
                    faulthandler.dump_traceback_later(30, repeat=True)

                with det._catch_sys_exit():
                    with det._catch_init_invalid_hp(workloads):
                        controller = load.prepare_controller(
                            env,
                            workloads,
                            load_path,
                            socket_mgr.get_rendezvous_info(),
                            hvd_config,
                        )
                    controller.run()
def download(self, path: Optional[str] = None) -> str:
    """
    Download checkpoint to local storage.

    Arguments:
        path (string, optional): Top level directory to place the checkpoint under.
            If this parameter is not set, the checkpoint will be downloaded to
            ``checkpoints/<checkpoint_uuid>`` relative to the current working directory.
    """
    if path is not None:
        local_ckpt_dir = pathlib.Path(path)
    else:
        local_ckpt_dir = pathlib.Path("checkpoints", self.uuid)

    # Backward compatibility: we used MLflow's MLmodel checkpoint format for
    # serializing pytorch models. We now use our own format that contains a
    # metadata.json file. We are checking for checkpoint existence by
    # looking for both checkpoint formats in the output directory.
    potential_metadata_paths = [
        local_ckpt_dir.joinpath(f) for f in ["metadata.json", "MLmodel"]
    ]
    if not any(p.exists() for p in potential_metadata_paths):
        # If the target directory doesn't already appear to contain a
        # checkpoint, attempt to fetch one.
        if self.experiment_config["checkpoint_storage"]["type"] == "shared_fs":
            src_ckpt_dir = self._find_shared_fs_path()
            shutil.copytree(str(src_ckpt_dir), str(local_ckpt_dir))
        else:
            local_ckpt_dir.mkdir(parents=True, exist_ok=True)
            manager = storage.build(
                self.experiment_config["checkpoint_storage"],
                container_path=None,
            )
            if not isinstance(
                manager, (storage.S3StorageManager, storage.GCSStorageManager)
            ):
                raise AssertionError(
                    "Downloading from S3 or GCS requires the experiment to be configured with "
                    "S3 or GCS checkpointing, {} found instead".format(
                        self.experiment_config["checkpoint_storage"]["type"]
                    )
                )

            metadata = storage.StorageMetadata.from_json(
                {"uuid": self.uuid, "resources": self.resources}
            )
            manager.download(metadata, str(local_ckpt_dir))

    if not local_ckpt_dir.joinpath("metadata.json").exists():
        with open(local_ckpt_dir.joinpath("metadata.json"), "w") as f:
            json.dump(
                {
                    "determined_version": self.determined_version,
                    "framework": self.framework,
                    "format": self.format,
                    "experiment_id": self.experiment_id,
                    "trial_id": self.trial_id,
                    "hparams": self.hparams,
                    "experiment_config": self.experiment_config,
                    "metadata": self.metadata,
                },
                f,
                indent=2,
            )

    return str(local_ckpt_dir)
def download(self, path: Optional[str] = None) -> str:
    """
    Download checkpoint to local storage.

    See also:

      - :func:`determined.pytorch.load_trial_from_checkpoint_path`
      - :func:`determined.keras.load_model_from_checkpoint_path`
      - :func:`determined.estimator.load_estimator_from_checkpoint_path`

    Arguments:
        path (string, optional): Top level directory to place the checkpoint under.
            If this parameter is not set, the checkpoint will be downloaded to
            ``checkpoints/<checkpoint_uuid>`` relative to the current working directory.
    """
    if path is not None:
        local_ckpt_dir = pathlib.Path(path)
    else:
        local_ckpt_dir = pathlib.Path("checkpoints", self.uuid)

    # Backward compatibility: we used MLflow's MLmodel checkpoint format for
    # serializing pytorch models. We now use our own format that contains a
    # metadata.json file. We are checking for checkpoint existence by
    # looking for both checkpoint formats in the output directory.
    potential_metadata_paths = [
        local_ckpt_dir.joinpath(f) for f in ["metadata.json", "MLmodel"]
    ]
    if not any(p.exists() for p in potential_metadata_paths):
        # If the target directory doesn't already appear to contain a
        # checkpoint, attempt to fetch one.
        if self.training is None:
            raise NotImplementedError("Non-training checkpoints cannot be downloaded")

        checkpoint_storage = self.training.experiment_config["checkpoint_storage"]
        if checkpoint_storage["type"] == "shared_fs":
            src_ckpt_dir = self._find_shared_fs_path(checkpoint_storage)
            shutil.copytree(str(src_ckpt_dir), str(local_ckpt_dir))
        else:
            local_ckpt_dir.mkdir(parents=True, exist_ok=True)
            manager = storage.build(
                checkpoint_storage,
                container_path=None,
            )
            if not isinstance(
                manager,
                (
                    storage.S3StorageManager,
                    storage.GCSStorageManager,
                    storage.AzureStorageManager,
                ),
            ):
                raise AssertionError(
                    "Downloading from Azure, S3 or GCS requires the experiment to be "
                    "configured with Azure, S3 or GCS checkpointing, {} found instead".format(
                        checkpoint_storage["type"]
                    )
                )

            manager.download(self.uuid, str(local_ckpt_dir))

    # As of v0.18.0, we write metadata.json once at upload time. Checkpoints uploaded prior to
    # 0.18.0 will not have a metadata.json present. Unfortunately, checkpoints earlier than
    # 0.17.7 depended on this file existing in order to be loaded. Therefore, when we detect
    # that the metadata.json file is not present, we write it to make sure those checkpoints
    # can still load.
    metadata_path = local_ckpt_dir.joinpath("metadata.json")
    if not metadata_path.exists():
        self.write_metadata_file(str(metadata_path))

    return str(local_ckpt_dir)
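For context, a minimal sketch of how ``download()`` is typically reached from the Python SDK. The master address and experiment ID below are placeholders, and the availability of ``top_checkpoint()`` on the experiment object is an assumption about the SDK surface rather than something stated in the snippet above.

# Hedged usage sketch: fetch a checkpoint object via the SDK and download it locally.
# The master address and experiment ID are placeholders.
from determined.experimental import client

client.login(master="http://localhost:8080")
checkpoint = client.get_experiment(42).top_checkpoint()
ckpt_dir = checkpoint.download()  # defaults to checkpoints/<checkpoint_uuid>
print(f"checkpoint downloaded to {ckpt_dir}")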
def test_delete_checkpoints() -> None:
    base_conf_path = conf.fixtures_path("no_op/single-default-ckpt.yaml")
    config = conf.load_config(str(base_conf_path))
    config["checkpoint_storage"] = {
        "type": "shared_fs",
        "host_path": "/tmp",
        "storage_path": "delete-checkpoints-e2etest",
        "save_trial_latest": 10,
    }
    config["min_checkpoint_period"] = {"batches": 10}

    exp_id_1 = exp.run_basic_test_with_temp_config(
        config, model_def_path=conf.fixtures_path("no_op"), expected_trials=1
    )
    exp_id_2 = exp.run_basic_test_with_temp_config(
        config, model_def_path=conf.fixtures_path("no_op"), expected_trials=1
    )

    wait_for_gc_to_finish(exp_id_1)
    wait_for_gc_to_finish(exp_id_2)

    test_session = exp.determined_test_session()
    exp_1_checkpoints = bindings.get_GetExperimentCheckpoints(
        session=test_session, id=exp_id_1
    ).checkpoints
    exp_2_checkpoints = bindings.get_GetExperimentCheckpoints(
        session=test_session, id=exp_id_2
    ).checkpoints

    assert len(exp_1_checkpoints) > 0, f"no checkpoints found in experiment with ID:{exp_id_1}"
    assert len(exp_2_checkpoints) > 0, f"no checkpoints found in experiment with ID:{exp_id_2}"

    d_exp_1_checkpoint_uuids = [
        exp_1_checkpoints[d_index].uuid
        for d_index in random.sample(range(len(exp_1_checkpoints)), 2)
    ]
    d_exp_2_checkpoint_uuids = [
        exp_2_checkpoints[d_index].uuid
        for d_index in random.sample(range(len(exp_2_checkpoints)), 2)
    ]

    d_checkpoint_uuids = d_exp_1_checkpoint_uuids + d_exp_2_checkpoint_uuids
    print(f"checkpoint uuids to be deleted: {d_checkpoint_uuids}")

    # Ensure the checkpoint directories exist before requesting deletion.
    checkpoint_config = config["checkpoint_storage"]
    storage_manager = storage.build(checkpoint_config, container_path=None)
    for uuid in d_checkpoint_uuids:
        try:
            with storage_manager.restore_path(uuid):
                pass
        except errors.CheckpointNotFound:
            pytest.fail(f"checkpoint directory with uuid: {uuid} was not created.")

    delete_body = bindings.v1DeleteCheckpointsRequest(checkpointUuids=d_checkpoint_uuids)
    bindings.delete_DeleteCheckpoints(session=test_session, body=delete_body)

    wait_for_gc_to_finish(exp_id_1)
    wait_for_gc_to_finish(exp_id_2)

    for d_c in d_checkpoint_uuids:
        ensure_checkpoint_deleted(test_session, d_c, storage_manager)
def run_gc_checkpoints_test(checkpoint_storage: Dict[str, str]) -> None:
    fixtures = [
        (
            conf.fixtures_path("no_op/gc_checkpoints_decreasing.yaml"),
            {
                (bindings.determinedexperimentv1State.STATE_COMPLETED.value): {800, 900, 1000},
                (bindings.determinedexperimentv1State.STATE_DELETED.value): {
                    100,
                    200,
                    300,
                    400,
                    500,
                    600,
                    700,
                },
            },
        ),
        (
            conf.fixtures_path("no_op/gc_checkpoints_increasing.yaml"),
            {
                (bindings.determinedexperimentv1State.STATE_COMPLETED.value): {
                    100,
                    200,
                    300,
                    900,
                    1000,
                },
                (bindings.determinedexperimentv1State.STATE_DELETED.value): {
                    400,
                    500,
                    600,
                    700,
                    800,
                },
            },
        ),
    ]

    all_checkpoints: List[Tuple[Any, List[bindings.v1CheckpointWorkload]]] = []
    for base_conf_path, result in fixtures:
        config = conf.load_config(str(base_conf_path))
        config["checkpoint_storage"].update(checkpoint_storage)

        with tempfile.NamedTemporaryFile() as tf:
            with open(tf.name, "w") as f:
                yaml.dump(config, f)
            experiment_id = exp.create_experiment(tf.name, conf.fixtures_path("no_op"))

        exp.wait_for_experiment_state(
            experiment_id, bindings.determinedexperimentv1State.STATE_COMPLETED
        )

        # In some configurations, checkpoint GC will run on an auxiliary machine, which may have
        # to be spun up still. So we'll wait for it to run.
        wait_for_gc_to_finish(experiment_id)

        # Checkpoints are not marked as deleted until the gc_checkpoint task starts.
        retries = 5
        for retry in range(retries):
            trials = exp.experiment_trials(experiment_id)
            assert len(trials) == 1

            cpoints = exp.workloads_with_checkpoint(trials[0].workloads)
            sorted_checkpoints = sorted(
                cpoints,
                key=lambda ckp: int(ckp.totalBatches),
            )
            assert len(sorted_checkpoints) == 10
            by_state = {}  # type: Dict[str, Set[int]]
            for ckpt in sorted_checkpoints:
                by_state.setdefault(ckpt.state.value, set()).add(ckpt.totalBatches)

            if by_state == result:
                all_checkpoints.append((config, sorted_checkpoints))
                break

            if retry + 1 == retries:
                assert by_state == result

            time.sleep(1)

    # Check that the actual checkpoint storage (for shared_fs) reflects the
    # deletions. We want to wait for the GC containers to exit, so check
    # repeatedly with a timeout.
    max_checks = 30
    for i in range(max_checks):
        time.sleep(1)
        try:
            storage_states = []
            for config, checkpoints in all_checkpoints:
                checkpoint_config = config["checkpoint_storage"]
                storage_manager = storage.build(checkpoint_config, container_path=None)
                storage_state = {}  # type: Dict[str, Any]
                for checkpoint in checkpoints:
                    assert checkpoint.uuid is not None
                    storage_id = checkpoint.uuid
                    storage_state[storage_id] = {}
                    if checkpoint.state == bindings.determinedcheckpointv1State.STATE_COMPLETED:
                        storage_state[storage_id]["found"] = False
                        try:
                            with storage_manager.restore_path(storage_id):
                                storage_state[storage_id]["found"] = True
                        except errors.CheckpointNotFound:
                            pass
                    elif checkpoint.state == bindings.determinedcheckpointv1State.STATE_DELETED:
                        storage_state[storage_id] = {"deleted": False, "checkpoint": checkpoint}
                        try:
                            with storage_manager.restore_path(storage_id):
                                pass
                        except errors.CheckpointNotFound:
                            storage_state[storage_id]["deleted"] = True
                storage_states.append(storage_state)

            for storage_state in storage_states:
                for state in storage_state.values():
                    if state.get("deleted", None) is False:
                        json_states = json.dumps(storage_states)
                        raise AssertionError(
                            f"Some checkpoints were not deleted: JSON:{json_states}"
                        )
                    if state.get("found", None) is False:
                        json_states = json.dumps(storage_states)
                        raise AssertionError(
                            f"Some checkpoints were not found: JSON:{json_states}"
                        )
        except AssertionError:
            if i == max_checks - 1:
                raise
        else:
            break
def run_gc_checkpoints_test(checkpoint_storage: Dict[str, str]) -> None:
    fixtures = [
        (
            conf.fixtures_path("no_op/gc_checkpoints_decreasing.yaml"),
            {"COMPLETED": {800, 900, 1000}, "DELETED": {100, 200, 300, 400, 500, 600, 700}},
        ),
        (
            conf.fixtures_path("no_op/gc_checkpoints_increasing.yaml"),
            {"COMPLETED": {100, 200, 300, 900, 1000}, "DELETED": {400, 500, 600, 700, 800}},
        ),
    ]

    all_checkpoints = []
    for base_conf_path, result in fixtures:
        config = conf.load_config(str(base_conf_path))
        config["checkpoint_storage"].update(checkpoint_storage)

        with tempfile.NamedTemporaryFile() as tf:
            with open(tf.name, "w") as f:
                yaml.dump(config, f)
            experiment_id = exp.create_experiment(tf.name, conf.fixtures_path("no_op"))

        exp.wait_for_experiment_state(experiment_id, "COMPLETED")

        # Checkpoints are not marked as deleted until the gc_checkpoint task starts.
        retries = 5
        for retry in range(retries):
            trials = exp.experiment_trials(experiment_id)
            assert len(trials) == 1

            checkpoints = sorted(
                (step["checkpoint"] for step in trials[0]["steps"]),
                key=operator.itemgetter("total_batches"),
            )
            assert len(checkpoints) == 10
            by_state = {}  # type: Dict[str, Set[int]]
            for checkpoint in checkpoints:
                by_state.setdefault(checkpoint["state"], set()).add(checkpoint["total_batches"])

            if by_state == result:
                all_checkpoints.append((config, checkpoints))
                break

            if retry + 1 == retries:
                assert by_state == result

            time.sleep(1)

    # Check that the actual checkpoint storage (for shared_fs) reflects the
    # deletions. We want to wait for the GC containers to exit, so check
    # repeatedly with a timeout.
    max_checks = 30
    for i in range(max_checks):
        time.sleep(1)
        try:
            for config, checkpoints in all_checkpoints:
                checkpoint_config = config["checkpoint_storage"]
                if checkpoint_config["type"] == "shared_fs":
                    deleted_exception = check.CheckFailedError
                elif checkpoint_config["type"] == "s3":
                    deleted_exception = botocore.exceptions.ClientError
                else:
                    raise NotImplementedError(
                        f'unsupported storage type {checkpoint_config["type"]}'
                    )

                storage_manager = storage.build(checkpoint_config, container_path=None)
                for checkpoint in checkpoints:
                    metadata = storage.StorageMetadata.from_json(checkpoint)
                    if checkpoint["state"] == "COMPLETED":
                        with storage_manager.restore_path(metadata):
                            pass
                    elif checkpoint["state"] == "DELETED":
                        try:
                            with storage_manager.restore_path(metadata):
                                raise AssertionError("checkpoint not deleted")
                        except deleted_exception:
                            pass
        except AssertionError:
            if i == max_checks - 1:
                raise
        else:
            break
def init(
    *,
    distributed: Optional[core.DistributedContext] = None,
    # TODO: figure out a better way to deal with checkpointing in the local training case.
    storage_manager: Optional[storage.StorageManager] = None,
    preempt_mode: core.PreemptMode = core.PreemptMode.WorkersAskChief,
    tensorboard_mode: core.TensorboardMode = core.TensorboardMode.AUTO,
) -> Context:
    """
    ``core.init()`` builds a :class:`core.Context <determined.core.Context>` for use with the
    Core API.

    Always use ``with core.init() as context`` instead of instantiating a ``core.Context``
    directly. Certain components of the Core API may be configured by passing arguments to
    ``core.init()``. The only arg that is required is a ``DistributedContext``, and even that is
    only required for multi-slot tasks.

    All of your training must occur within the scope of the ``with core.init() as core_context``,
    as there are resources necessary for training which start in the ``core.Context``'s
    ``__enter__`` method and must be cleaned up in its ``__exit__()`` method.

    Arguments:
        distributed (``core.DistributedContext``, optional): Passing a ``DistributedContext`` is
            required for multi-slot training, but unnecessary for single-slot training. Defaults
            to ``None``.
        preempt_mode (``core.PreemptMode``, optional): Configure the calling pattern for the
            ``core_context.preempt.should_preempt()`` method. See
            :class:`~determined.core.PreemptMode` for more detail. Defaults to
            ``WorkersAskChief``.
        storage_manager: Internal use only.
        tensorboard_mode (``core.TensorboardMode``, optional): Define how Tensorboard metrics and
            profiling data are retained. See :class:`~determined.core.TensorboardMode` for more
            detail. Defaults to ``AUTO``.
    """
    info = det.get_cluster_info()
    if info is None:
        return _dummy_init(distributed=distributed, storage_manager=storage_manager)

    # We are on the cluster.
    cert = certs.default_load(info.master_url)
    session = Session(info.master_url, None, None, cert, max_retries=get_max_retries_config())

    if distributed is None:
        if len(info.container_addrs) > 1 or len(info.slot_ids) > 1:
            raise ValueError("you must provide a valid DistributedContext for a multi-slot task")

    distributed = distributed or core.DummyDistributedContext()

    preempt = core.PreemptContext(session, info.allocation_id, distributed, preempt_mode)

    # At present, we only support tensorboards in Trial tasks.
    tbd_writer = None

    train = None
    searcher = None

    if info.task_type == "TRIAL":
        # Prepare the tensorboard hooks.
        tensorboard_manager = tensorboard.build(
            info.cluster_id,
            str(info.trial.experiment_id),
            str(info.trial.trial_id),
            info.trial._config["checkpoint_storage"],
            container_path=constants.SHARED_FS_CONTAINER_PATH,
        )
        if tensorboard_mode == core.TensorboardMode.AUTO:
            tbd_writer = tensorboard.get_metric_writer()

        train = core.TrainContext(
            session,
            info.trial.trial_id,
            info.trial._trial_run_id,
            info.trial.experiment_id,
            distributed,
            tensorboard_mode,
            tensorboard_manager,
            tbd_writer,
        )
        units = core._parse_searcher_units(info.trial._config)
        searcher = core.SearcherContext(
            session,
            distributed,
            info.trial.trial_id,
            info.trial._trial_run_id,
            info.allocation_id,
            units,
        )

        if storage_manager is None:
            storage_manager = storage.build(
                info.trial._config["checkpoint_storage"],
                container_path=constants.SHARED_FS_CONTAINER_PATH,
            )

        checkpoint = core.CheckpointContext(
            distributed,
            storage_manager,
            session,
            info.task_id,
            info.allocation_id,
            tensorboard_mode,
            tensorboard_manager,
        )
    else:
        # TODO: support checkpointing for non-trial tasks.
        if storage_manager is None:
            base_path = appdirs.user_data_dir("determined")
            logger.info(f"no storage_manager provided; storing checkpoints in {base_path}")
            storage_manager = storage.SharedFSStorageManager(base_path)
        checkpoint = core.DummyCheckpointContext(distributed, storage_manager)

    _install_stacktrace_on_sigusr1()

    return Context(
        distributed=distributed,
        checkpoint=checkpoint,
        preempt=preempt,
        train=train,
        searcher=searcher,
    )
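A minimal sketch of the calling pattern the docstring describes, assuming the current Core API surface (in particular ``checkpoint.store_path()`` yielding a path and a storage ID); the metadata dict and the file written into the checkpoint directory are placeholders.

# Hedged usage sketch of core.init(): all work happens inside the context manager,
# and a placeholder checkpoint is stored through the checkpoint context.
import determined as det


def main() -> None:
    with det.core.init() as core_context:
        with core_context.checkpoint.store_path({"steps_completed": 1}) as (path, storage_id):
            (path / "state.txt").write_text("hello")  # placeholder artifact
            print(f"stored checkpoint {storage_id}")


if __name__ == "__main__":
    main()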
def test_unknown_type() -> None:
    config = {"type": "unknown"}
    with pytest.raises(TypeError, match="Unknown storage type: unknown"):
        storage.build(config, container_path=None)
def test_illegal_type() -> None:
    config = {"type": 4}
    with pytest.raises(CheckFailedError, match="must be a string"):
        storage.build(config, container_path=None)
def test_missing_type() -> None:
    with pytest.raises(CheckFailedError, match="Missing 'type' parameter"):
        storage.build({}, container_path=None)
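For contrast with the error-path tests above, a hedged sketch of a successful build. Whether a bare ``{"type": "s3", "bucket": ...}`` config is sufficient depends on the storage backend and credentials available, so treat the minimal config and bucket name as assumptions; ``storage.S3StorageManager`` itself is referenced in the download code earlier in this section.

# Hedged counterpart to the failure cases above: assumes a minimal s3 config with only
# a bucket name is accepted; the bucket name is a placeholder.
def test_build_s3() -> None:
    config = {"type": "s3", "bucket": "my-checkpoint-bucket"}
    manager = storage.build(config, container_path=None)
    assert isinstance(manager, storage.S3StorageManager)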