Beispiel #1
0
def test_checkpoint_upload_failure(tmp_path: pathlib.Path) -> None:
    """Expect running the trial to raise a ValueError matching "upload error".

    The workload manager is built around a ``FailOnUploadStorageManager``
    rooted at ``tmp_path`` (defined elsewhere; presumably it fails whenever a
    checkpoint is uploaded). The checkpoint workload's response callback
    raises a ValueError whose message does not contain "upload error", so the
    test cannot pass by way of that callback being invoked.
    """
    env = utils.make_default_env_context({"global_batch_size": 64})
    rendezvous = utils.make_default_rendezvous_info()
    failing_storage = FailOnUploadStorageManager(str(tmp_path))
    tensorboard = NoopTensorboardManager()
    writer = NoopBatchMetricWriter()

    def fail_if_called(metrics: workload.Response) -> None:
        # A response for the checkpoint workload is not expected in this test.
        message = "response_func should not be called if the upload fails"
        raise ValueError(message)

    def workload_stream() -> workload.Stream:
        # Train first, then request the checkpoint.
        train = workload.train_workload(1, num_batches=100)
        yield train, [], workload.ignore_workload_response

        checkpoint = workload.checkpoint_workload()
        yield checkpoint, [], fail_if_called

    manager = layers.build_workload_manager(
        env, workload_stream(), rendezvous, failing_storage, tensorboard, writer
    )
    controller = NoopTrialController(iter(manager))

    # Drive the workload manager's events the way a TrialController would.
    with pytest.raises(ValueError, match="upload error"):
        controller.run()
Beispiel #2
0
def test_reject_nonscalar_searcher_metric() -> None:
    """Check which values are accepted for the searcher's validation metric.

    The experiment config names "validation_error" as the searcher metric.
    For each candidate value a fresh workload manager and
    ``NoopTrialController`` are built, with the controller given that value
    as its validation metric. Plain Python numbers and NumPy scalars must run
    cleanly; a string, a list, and a dict must each raise an AssertionError
    matching "non-scalar".
    """
    searcher_metric = "validation_error"
    hparams = {"global_batch_size": 64}

    exp_config = utils.make_default_exp_config(hparams, 1)
    exp_config["searcher"] = {"metric": searcher_metric}
    env = utils.make_default_env_context(
        hparams=hparams, experiment_config=exp_config
    )

    rendezvous = utils.make_default_rendezvous_info()
    storage = NoopStorageManager(os.devnull)
    tensorboard = NoopTensorboardManager()
    writer = NoopBatchMetricWriter()

    def workload_stream() -> workload.Stream:
        # One training workload followed by one validation workload.
        train = workload.train_workload(1, num_batches=100)
        yield train, [], workload.ignore_workload_response

        validation = workload.validation_workload()
        yield validation, [], workload.ignore_workload_response

    def controller_reporting(value: object) -> NoopTrialController:
        # Every case gets its own manager and workload generator; only the
        # metric value handed to the controller differs between cases.
        manager = layers.build_workload_manager(
            env, workload_stream(), rendezvous, storage, tensorboard, writer
        )
        return NoopTrialController(
            iter(manager), validation_metrics={searcher_metric: value}
        )

    # Normal Python numbers and NumPy scalars are acceptable; other values are not.
    accepted = [17, 0.17, np.float64(0.17), np.float32(0.17)]
    rejected = ["foo", [0.17], {}]

    for value in accepted:
        controller_reporting(value).run()

    for value in rejected:
        # Build outside the raises-block so only run() is under scrutiny.
        controller = controller_reporting(value)
        with pytest.raises(AssertionError, match="non-scalar"):
            controller.run()
Beispiel #3
0
def test_subprocess_launcher_receiver() -> None:
    """Run a ``SubprocessLauncher`` against the fake subprocess receiver.

    Each workload produced by ``fake_subprocess_receiver.fake_workload_gen()``
    is sent through a ``WorkloadResponseInterceptor``. After each send, the
    intercepted metrics must equal ``{"count": i}``, where ``i`` is the
    zero-based position of that workload in the generated sequence.
    """
    env = utils.make_default_env_context(hparams={"global_batch_size": 1})
    rendezvous = utils.make_default_rendezvous_info()
    horovod_config = utils.make_default_hvd_config()

    def intercepted_workloads() -> workload.Stream:
        interceptor = workload.WorkloadResponseInterceptor()
        generated = fake_subprocess_receiver.fake_workload_gen()
        for position, item in enumerate(generated):
            yield from interceptor.send(item, [])
            # Checked only after the send for this workload has finished.
            reported = interceptor.metrics_result()
            assert reported == {"count": position}

    layers.SubprocessLauncher(
        env=env,
        workloads=intercepted_workloads(),
        load_path=None,
        rendezvous_info=rendezvous,
        hvd_config=horovod_config,
        python_subprocess_entrypoint="tests.fixtures.fake_subprocess_receiver",
    ).run()