def test_checkpoint_upload_failure(tmp_path: pathlib.Path) -> None:
    """A failed checkpoint upload must raise out of the manager, not hit response_func."""
    hparam_dict = {"global_batch_size": 64}
    env = utils.make_default_env_context(hparam_dict)
    rendezvous = utils.make_default_rendezvous_info()
    failing_storage = FailOnUploadStorageManager(str(tmp_path))
    tb_manager = NoopTensorboardManager()
    writer = NoopBatchMetricWriter()

    def checkpoint_response_func(metrics: workload.Response) -> None:
        # The upload fails, so the workload manager must never invoke this.
        raise ValueError("response_func should not be called if the upload fails")

    def make_workloads() -> workload.Stream:
        yield workload.train_workload(1, num_batches=100), [], workload.ignore_workload_response
        yield workload.checkpoint_workload(), [], checkpoint_response_func

    manager = layers.build_workload_manager(
        env,
        make_workloads(),
        rendezvous,
        failing_storage,
        tb_manager,
        writer,
    )
    controller = NoopTrialController(iter(manager))

    # Drive the workload stream exactly as a real TrialController would; the
    # storage manager's upload failure should propagate as a ValueError.
    with pytest.raises(ValueError, match="upload error"):
        controller.run()
def build_and_run_training_pipeline(env: det.EnvContext) -> None:
    """Assemble the layered training pipeline for one trial and run it to completion.

    Layers are stacked in order: SocketManager (master connection and rendezvous)
    -> WorkloadManager (workload augmentation, checkpoint/timing hooks) -> either
    a Horovod subprocess launcher or an in-process trial controller.
    """
    # Create the socket manager. The socket manager will connect to the master and read messages
    # until it receives the rendezvous_info.
    #
    # TODO(ryan): Pull profiler hooks out of SocketManager and into their own layer.
    with layers.SocketManager(env) as socket_mgr:

        # Create the storage manager. This is used to download the initial checkpoint here in
        # build_training_pipeline and also used by the workload manager to create and store
        # checkpoints during training.
        storage_mgr = storage.build(env.experiment_config["checkpoint_storage"])

        # Idiom fix: tuple unpacking instead of list-target unpacking.
        tensorboard_mgr, tensorboard_writer = load.prepare_tensorboard(env)

        # Create the workload manager. The workload manager will receive workloads from the
        # socket_mgr, and augment them with some additional arguments. Additionally, the
        # workload manager is responsible for some generic workload hooks for things like timing
        # workloads, preparing checkpoints, and uploading completed checkpoints. Finally, the
        # workload manager does some sanity checks on response messages that originate from the
        # trial.
        #
        # TODO(ryan): Refactor WorkloadManager into separate layers that do each separate task.
        workload_mgr = layers.build_workload_manager(
            env,
            iter(socket_mgr),
            socket_mgr.get_rendezvous_info(),
            storage_mgr,
            tensorboard_mgr,
            tensorboard_writer,
        )

        hvd_config = horovod.HorovodContext.from_configs(
            env.experiment_config, socket_mgr.get_rendezvous_info(), env.hparams
        )
        # Idiom fix: lazy %-style args so the dict is only rendered when INFO is
        # enabled; the emitted message is identical to the old f-string.
        logging.info("Horovod config: %s.", hvd_config.__dict__)

        # Load the checkpoint, if necessary. Any possible sinks to this pipeline will need access
        # to this checkpoint.
        with maybe_load_checkpoint(storage_mgr, env.latest_checkpoint) as load_path:

            if hvd_config.use:
                # Horovod distributed training is done inside subprocesses.
                subproc = layers.SubprocessLauncher(
                    env, iter(workload_mgr), load_path, socket_mgr.get_rendezvous_info(), hvd_config
                )
                subproc.run()
            else:
                if env.experiment_config.debug_enabled():
                    # Dump all thread tracebacks every 30s as a hang-debugging aid.
                    faulthandler.dump_traceback_later(30, repeat=True)

                controller = load.prepare_controller(
                    env,
                    iter(workload_mgr),
                    load_path,
                    socket_mgr.get_rendezvous_info(),
                    hvd_config,
                )
                controller.run()
def test_reject_nonscalar_searcher_metric() -> None:
    """Scalar searcher-metric values run cleanly; non-scalar values must be rejected."""
    metric_name = "validation_error"
    hps = {"global_batch_size": 64}
    exp_config = utils.make_default_exp_config(hps, 1)
    exp_config["searcher"] = {"metric": metric_name}
    env = utils.make_default_env_context(hparams=hps, experiment_config=exp_config)
    rendezvous = utils.make_default_rendezvous_info()
    storage_mgr = NoopStorageManager(os.devnull)
    tb_mgr = NoopTensorboardManager()
    writer = NoopBatchMetricWriter()

    def make_workloads() -> workload.Stream:
        yield workload.train_workload(1, num_batches=100), [], workload.ignore_workload_response
        yield workload.validation_workload(), [], workload.ignore_workload_response

    # Normal Python numbers and NumPy scalars are acceptable; other values are not.
    cases = [
        (True, 17),
        (True, 0.17),
        (True, np.float64(0.17)),
        (True, np.float32(0.17)),
        (False, "foo"),
        (False, [0.17]),
        (False, {}),
    ]

    for should_accept, value in cases:
        # Build a fresh manager per case: the workload stream is consumed each run.
        manager = layers.build_workload_manager(
            env,
            make_workloads(),
            rendezvous,
            storage_mgr,
            tb_mgr,
            writer,
        )
        controller = NoopTrialController(
            iter(manager), validation_metrics={metric_name: value}
        )
        if should_accept:
            controller.run()
        else:
            with pytest.raises(AssertionError, match="non-scalar"):
                controller.run()