def test_one_var_training(self, test_checkpointing, tmp_path):
    checkpoint_dir = tmp_path.joinpath("checkpoint")

    # In the test_checkpointing case, we will call make_workloads() twice but batches
    # and w will persist across both calls.
    batches = enumerate([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]])
    w = 0.0

    trial_class = tf_keras_one_var_model.OneVarTrial

    def make_workloads() -> workload.Stream:
        nonlocal w
        interceptor = workload.WorkloadResponseInterceptor()

        for idx, batch in batches:
            yield from interceptor.send(workload.train_workload(1), [])
            metrics = interceptor.metrics_result()

            # Calculate what the loss should be.
            loss = trial_class.calc_loss(w, batch)

            epsilon = 0.0001
            assert abs(metrics["metrics"]["avg_metrics"]["loss"] - loss) < epsilon

            # Update what the weight should be.
            w = w - hparams["learning_rate"] * trial_class.calc_gradient(w, batch)

            if test_checkpointing and idx == 3:
                # Checkpoint and let the next TrialController finish the work.
                yield workload.checkpoint_workload(), [
                    checkpoint_dir
                ], workload.ignore_workload_response
                break

        yield workload.terminate_workload(), [], workload.ignore_workload_response

    hparams = {"learning_rate": 0.001, "global_batch_size": 3, "dataset_range": 10}
    exp_config = utils.make_default_exp_config(hparams, scheduling_unit=100)
    exp_config["records_per_epoch"] = 100

    # TODO(DET-2436): Add a unit test for native implementation with tf dataset.
    controller = utils.make_trial_controller_from_trial_implementation(
        trial_class,
        hparams,
        make_workloads(),
        exp_config=exp_config,
        trial_seed=self.trial_seed,
    )
    controller.run()

    # In the checkpointing case, we need to create another controller to finish training.
    if test_checkpointing:
        controller = utils.make_trial_controller_from_trial_implementation(
            trial_class,
            hparams,
            make_workloads(),
            exp_config=exp_config,
            load_path=checkpoint_dir,
            trial_seed=self.trial_seed,
        )
        controller.run()
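# A minimal sketch of the closed-form helpers the epsilon comparison above relies on.
# Assumption (mine, not taken from the fixture): a one-weight linear model pred = w * x
# trained with MSE against labels equal to the inputs. The authoritative definitions of
# calc_loss/calc_gradient live in tf_keras_one_var_model and must match the Keras graph
# exactly for the assertion above to hold.
def _sketch_calc_loss(w: float, batch: list) -> float:
    # mean((w*x - x)^2) over the batch
    return sum((w * x - x) ** 2 for x in batch) / len(batch)


def _sketch_calc_gradient(w: float, batch: list) -> float:
    # d/dw of the loss above: mean(2*x*(w*x - x))
    return sum(2 * x * (w * x - x) for x in batch) / len(batch)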
def test_custom_hook(self, tmp_path: Path) -> None:
    def make_workloads(checkpoint_dir: pathlib.Path) -> workload.Stream:
        trainer = utils.TrainAndValidate()

        yield from trainer.send(steps=10, validation_freq=5, batches_per_step=5)
        yield workload.checkpoint_workload(), [
            checkpoint_dir
        ], workload.ignore_workload_response
        yield workload.terminate_workload(), [], workload.ignore_workload_response

    def verify_callback(checkpoint_dir: pathlib.Path, checkpoint_num: int) -> None:
        with open(str(checkpoint_dir.joinpath("custom.log")), "r") as fp:
            assert int(fp.readline()) == checkpoint_num

    checkpoint_dir1 = tmp_path.joinpath("checkpoint1")
    controller = utils.make_trial_controller_from_trial_implementation(
        trial_class=estimator_xor_model.XORTrialWithCustomHook,
        hparams=self.hparams,
        workloads=make_workloads(checkpoint_dir=checkpoint_dir1),
        batches_per_step=5,
    )
    controller.run()
    verify_callback(checkpoint_dir=checkpoint_dir1, checkpoint_num=1)

    checkpoint_dir2 = tmp_path.joinpath("checkpoint2")
    controller = utils.make_trial_controller_from_trial_implementation(
        trial_class=estimator_xor_model.XORTrialWithCustomHook,
        hparams=self.hparams,
        workloads=make_workloads(checkpoint_dir=checkpoint_dir2),
        batches_per_step=5,
        load_path=checkpoint_dir1,
    )
    controller.run()
    verify_callback(checkpoint_dir=checkpoint_dir2, checkpoint_num=2)
def test_custom_eval(self) -> None:
    training_metrics = {}
    validation_metrics = {}

    def make_workloads(tag: str) -> workload.Stream:
        trainer = utils.TrainAndValidate()

        yield from trainer.send(steps=900, validation_freq=100)
        tm, vm = trainer.result()
        training_metrics[tag] = tm
        validation_metrics[tag] = vm

        yield workload.terminate_workload(), [], workload.ignore_workload_response

    controller = utils.make_trial_controller_from_trial_implementation(
        trial_class=pytorch_xor_model.XORTrial,
        hparams=self.hparams,
        workloads=make_workloads("A"),
        trial_seed=self.trial_seed,
    )
    controller.run()

    controller = utils.make_trial_controller_from_trial_implementation(
        trial_class=pytorch_xor_model.XORTrialCustomEval,
        hparams=self.hparams,
        workloads=make_workloads("B"),
        trial_seed=self.trial_seed,
    )
    controller.run()

    for original, custom_eval in zip(training_metrics["A"], training_metrics["B"]):
        assert original["loss"] == custom_eval["loss"]

    for original, custom_eval in zip(validation_metrics["A"], validation_metrics["B"]):
        assert original["loss"] == custom_eval["loss"]
def test_callbacks(self, tmp_path: pathlib.Path) -> None:
    checkpoint_dir = tmp_path.joinpath("checkpoint")

    controller = None  # type: ignore

    def make_workloads1() -> workload.Stream:
        nonlocal controller

        yield workload.train_workload(1, 1, 0), [], workload.ignore_workload_response
        assert controller is not None, "controller was never set!"
        assert controller.trial.counter.__dict__ == {
            "validation_steps_started": 0,
            "validation_steps_ended": 0,
            "checkpoints_ended": 0,
        }

        yield workload.validation_workload(), [], workload.ignore_workload_response
        assert controller.trial.counter.__dict__ == {
            "validation_steps_started": 1,
            "validation_steps_ended": 1,
            "checkpoints_ended": 0,
        }

        yield workload.checkpoint_workload(), [
            checkpoint_dir
        ], workload.ignore_workload_response
        assert controller.trial.counter.__dict__ == {
            "validation_steps_started": 1,
            "validation_steps_ended": 1,
            "checkpoints_ended": 1,
        }

        yield workload.terminate_workload(), [], workload.ignore_workload_response

    controller = utils.make_trial_controller_from_trial_implementation(
        trial_class=pytorch_xor_model.XORTrialCallbacks,
        hparams=self.hparams,
        workloads=make_workloads1(),
    )
    controller.run()

    # Verify the checkpoint loading callback works.
    def make_workloads2() -> workload.Stream:
        yield workload.terminate_workload(), [], workload.ignore_workload_response

    controller = utils.make_trial_controller_from_trial_implementation(
        trial_class=pytorch_xor_model.XORTrialCallbacks,
        hparams=self.hparams,
        workloads=make_workloads2(),
        load_path=checkpoint_dir,
    )
    controller.run()
    assert controller.trial.counter.__dict__ == {
        "validation_steps_started": 1,
        "validation_steps_ended": 1,
        "checkpoints_ended": 0,
    }
def test_grad_clipping(self) -> None:
    training_metrics = {}
    validation_metrics = {}

    def make_workloads(tag: str) -> workload.Stream:
        trainer = utils.TrainAndValidate()

        yield from trainer.send(steps=1000, validation_freq=100)
        tm, vm = trainer.result()
        training_metrics[tag] = tm
        validation_metrics[tag] = vm

        yield workload.terminate_workload(), [], workload.ignore_workload_response

    controller = utils.make_trial_controller_from_trial_implementation(
        trial_class=pytorch_xor_model.XORTrialGradClipping,
        hparams=self.hparams,
        workloads=make_workloads("original"),
        trial_seed=self.trial_seed,
    )
    controller.run()

    updated_hparams = {"gradient_clipping_l2_norm": 0.0001, **self.hparams}
    controller = utils.make_trial_controller_from_trial_implementation(
        trial_class=pytorch_xor_model.XORTrialGradClipping,
        hparams=updated_hparams,
        workloads=make_workloads("clipped_by_norm"),
        trial_seed=self.trial_seed,
    )
    controller.run()

    for idx, (original, clipped) in enumerate(
        zip(training_metrics["original"], training_metrics["clipped_by_norm"])
    ):
        if idx < 10:
            continue
        assert original["loss"] != clipped["loss"]

    updated_hparams = {"gradient_clipping_value": 0.0001, **self.hparams}
    controller = utils.make_trial_controller_from_trial_implementation(
        trial_class=pytorch_xor_model.XORTrialGradClipping,
        hparams=updated_hparams,
        workloads=make_workloads("clipped_by_val"),
        trial_seed=self.trial_seed,
    )
    controller.run()

    for idx, (original, clipped) in enumerate(
        zip(training_metrics["original"], training_metrics["clipped_by_val"])
    ):
        if idx < 10:
            continue
        assert original["loss"] != clipped["loss"]
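# For context on the two hparams exercised above: they correspond to the two standard
# PyTorch clipping primitives. A minimal sketch of the distinction, assuming a plain
# training step (clipped_step and its parameters are illustrative, not part of the
# fixtures; the trial wires clipping up through Determined instead):
def clipped_step(model, optimizer, loss, l2_norm=None, value=None):
    optimizer.zero_grad()
    loss.backward()
    if l2_norm is not None:
        # "gradient_clipping_l2_norm": rescale gradients so their global L2 norm is
        # at most l2_norm.
        torch.nn.utils.clip_grad_norm_(model.parameters(), l2_norm)
    if value is not None:
        # "gradient_clipping_value": clamp each gradient element into [-value, value].
        torch.nn.utils.clip_grad_value_(model.parameters(), value)
    optimizer.step()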
def test_checkpointing(self, tmp_path: pathlib.Path) -> None:
    checkpoint_dir = tmp_path.joinpath("checkpoint")
    old_error = -1

    def make_workloads_1() -> workload.Stream:
        nonlocal old_error

        trainer = utils.TrainAndValidate()
        yield from trainer.send(steps=10, validation_freq=10)
        training_metrics, validation_metrics = trainer.result()
        old_error = validation_metrics[-1]["binary_error"]

        yield workload.checkpoint_workload(), [
            checkpoint_dir
        ], workload.ignore_workload_response

        yield workload.terminate_workload(), [], workload.ignore_workload_response

    controller = utils.make_trial_controller_from_trial_implementation(
        trial_class=pytorch_xor_model.XORTrialMulti,
        hparams=self.hparams,
        workloads=make_workloads_1(),
        trial_seed=self.trial_seed,
    )
    controller.run()

    # Restore the checkpoint on a new trial instance and recompute validation.
    # The validation error should be the same as it was previously.
    def make_workloads_2() -> workload.Stream:
        interceptor = workload.WorkloadResponseInterceptor()

        yield from interceptor.send(workload.validation_workload(), [])
        metrics = interceptor.metrics_result()

        new_error = metrics["metrics"]["validation_metrics"]["binary_error"]
        assert new_error == pytest.approx(old_error)

        yield workload.terminate_workload(), [], workload.ignore_workload_response

    controller = utils.make_trial_controller_from_trial_implementation(
        trial_class=pytorch_xor_model.XORTrialMulti,
        hparams=self.hparams,
        workloads=make_workloads_2(),
        load_path=checkpoint_dir,
        trial_seed=self.trial_seed,
    )
    controller.run()
def test_restore_invalid_checkpoint(self, tmp_path: pathlib.Path) -> None:
    # Build, train, and save a checkpoint with the normal hyperparameters.
    checkpoint_dir = str(tmp_path.joinpath("checkpoint"))
    latest_checkpoint = None
    steps_completed = 0

    def make_workloads_1() -> workload.Stream:
        trainer = utils.TrainAndValidate()

        yield from trainer.send(
            steps=1,
            validation_freq=1,
            train_batch_calls=self.data_parallel_only_auto_train_batch_calls,
        )

        interceptor = workload.WorkloadResponseInterceptor()
        yield from interceptor.send(workload.checkpoint_workload())
        nonlocal latest_checkpoint, steps_completed
        latest_checkpoint = interceptor.metrics_result()["uuid"]
        steps_completed = trainer.get_steps_completed()

    controller1 = utils.make_trial_controller_from_trial_implementation(
        trial_class=deepspeed_linear_model.LinearDeepSpeedTrial,
        hparams=self.hparams,
        workloads=make_workloads_1(),
        trial_seed=self.trial_seed,
        checkpoint_dir=checkpoint_dir,
        expose_gpus=True,
    )
    controller1.run()

    # Verify that an invalid architecture fails to load from the checkpoint.
    def make_workloads_2() -> workload.Stream:
        trainer = utils.TrainAndValidate()

        yield from trainer.send(
            steps=1,
            validation_freq=1,
            train_batch_calls=self.data_parallel_only_auto_train_batch_calls,
        )

    with pytest.raises(AssertionError, match="Failed to load deepspeed checkpoint."):
        controller2 = utils.make_trial_controller_from_trial_implementation(
            trial_class=deepspeed_linear_model.LinearTwoEngineTrial,
            hparams=self.hparams,
            workloads=make_workloads_2(),
            trial_seed=self.trial_seed,
            checkpoint_dir=checkpoint_dir,
            latest_checkpoint=latest_checkpoint,
            steps_completed=steps_completed,
            expose_gpus=True,
        )
        controller2.run()
def test_callbacks(self, tmp_path: pathlib.Path) -> None:
    checkpoint_dir = tmp_path.joinpath("checkpoint")

    controller = utils.make_trial_controller_from_trial_implementation(
        trial_class=pytorch_xor_model.XORTrialCallbacks,
        hparams=self.hparams,
        workloads=[],
    )
    controller._train_for_step(1, 1)
    assert controller.trial.counter.__dict__ == {
        "train_steps_started": 1,
        "train_steps_ended": 1,
        "validation_steps_started": 0,
        "validation_steps_ended": 0,
        "checkpoints_ended": 0,
    }

    controller._compute_validation_metrics()
    assert controller.trial.counter.__dict__ == {
        "train_steps_started": 1,
        "train_steps_ended": 1,
        "validation_steps_started": 1,
        "validation_steps_ended": 1,
        "checkpoints_ended": 0,
    }

    controller._save(checkpoint_dir)
    assert controller.trial.counter.__dict__ == {
        "train_steps_started": 1,
        "train_steps_ended": 1,
        "validation_steps_started": 1,
        "validation_steps_ended": 1,
        "checkpoints_ended": 1,
    }

    del controller
    controller = utils.make_trial_controller_from_trial_implementation(
        trial_class=pytorch_xor_model.XORTrialCallbacks,
        hparams=self.hparams,
        workloads=[],
        load_path=checkpoint_dir,
    )
    controller._load()
    assert controller.trial.counter.__dict__ == {
        "train_steps_started": 1,
        "train_steps_ended": 1,
        "validation_steps_started": 1,
        "validation_steps_ended": 1,
        "checkpoints_ended": 0,
    }
def test_reject_named_dict_metric(self) -> None:
    # If at some point in the future the webui is able to render scalar metrics inside
    # of nested dictionary metrics, this test could go away.

    def make_workloads() -> workload.Stream:
        trainer = utils.TrainAndValidate()
        yield from trainer.send(steps=1, validation_freq=1, scheduling_unit=1)
        yield workload.terminate_workload(), [], workload.ignore_workload_response

    controller = utils.make_trial_controller_from_trial_implementation(
        trial_class=pytorch_onevar_model.OneVarTrial,
        hparams=self.hparams,
        workloads=make_workloads(),
        trial_seed=self.trial_seed,
    )

    def reducer_fn(_):
        return {"my_metric": 1.0}

    # Inject a named metric which returns a dict (which is not allowed).
    controller.context.wrap_reducer(reducer_fn, name="my_metric")

    with pytest.raises(AssertionError, match="with name set but it returned a dict anyway"):
        controller.run()
def test_fail_dataset_repro_check(self) -> None:
    updated_hparams = copy.deepcopy(self.hparams)
    updated_hparams["test_fail_dataset_repro_check"] = True

    def make_workloads() -> workload.Stream:
        trainer = utils.TrainAndValidate()

        yield from trainer.send(
            steps=10,
            validation_freq=10,
            train_batch_calls=self.data_parallel_only_auto_train_batch_calls,
        )
        training_metrics, validation_metrics = trainer.result()

        for metrics in validation_metrics:
            assert "loss" in metrics

    with pytest.raises(RuntimeError, match=r".* reproducibility .* disable this check .*"):
        controller = utils.make_trial_controller_from_trial_implementation(
            trial_class=deepspeed_linear_model.LinearDeepSpeedTrial,
            hparams=updated_hparams,
            workloads=make_workloads(),
            trial_seed=self.trial_seed,
            expose_gpus=True,
        )
        controller.run()
def test_custom_dataloader(self) -> None:
    def make_workloads() -> workload.Stream:
        trainer = utils.TrainAndValidate()

        yield from trainer.send(steps=100, validation_freq=10)
        training_metrics, validation_metrics = trainer.result()

        # Check the gradient update at every step.
        for idx, batch_metrics in enumerate(training_metrics):
            pytorch_onevar_model.OneVarTrial.check_batch_metrics(batch_metrics, idx)

        # We expect the training loss to be monotonically decreasing.
        for older, newer in zip(training_metrics, training_metrics[1:]):
            assert newer["loss"] <= older["loss"]

    hparams = dict(self.hparams)
    hparams["dataloader_type"] = "torch"
    hparams["disable_dataset_reproducibility_checks"] = True

    controller = utils.make_trial_controller_from_trial_implementation(
        trial_class=pytorch_onevar_model.OneVarTrial,
        hparams=hparams,
        workloads=make_workloads(),
        trial_seed=self.trial_seed,
    )
    controller.run()
def test_linear_non_scalar_metrics(self) -> None:
    updated_hparams = copy.deepcopy(self.hparams)
    updated_hparams["return_non_scalar_metrics"] = True

    def make_workloads() -> workload.Stream:
        trainer = utils.TrainAndValidate()

        yield from trainer.send(
            steps=10,
            validation_freq=10,
            train_batch_calls=self.data_parallel_only_auto_train_batch_calls,
        )
        training_metrics, validation_metrics = trainer.result()

        for metrics in validation_metrics:
            assert "loss" in metrics

    controller = utils.make_trial_controller_from_trial_implementation(
        trial_class=deepspeed_linear_model.LinearDeepSpeedTrial,
        hparams=updated_hparams,
        workloads=make_workloads(),
        trial_seed=self.trial_seed,
        expose_gpus=True,
    )
    controller.run()
def test_end_of_training_hook(self):
    with tempfile.TemporaryDirectory() as temp_directory:

        def make_workloads() -> workload.Stream:
            trainer = utils.TrainAndValidate()

            yield from trainer.send(steps=2, validation_freq=2, batches_per_step=5)
            yield workload.terminate_workload(), [], workload.ignore_workload_response

        hparams = self.hparams.copy()
        hparams["training_end"] = os.path.join(temp_directory, "training_end.log")

        controller = utils.make_trial_controller_from_trial_implementation(
            trial_class=estimator_xor_model.XORTrialEndOfTrainingHook,
            hparams=hparams,
            workloads=make_workloads(),
            batches_per_step=5,
        )
        controller.run()

        with open(hparams["training_end"], "r") as fp:
            assert fp.readline() == "success"
def test_custom_reducer(self) -> None:
    def make_workloads() -> workload.Stream:
        trainer = utils.TrainAndValidate()

        # Test >1 validation to ensure that resetting the allgather_op list is working.
        yield from trainer.send(steps=2, validation_freq=1, scheduling_unit=1)
        training_metrics, validation_metrics = trainer.result()

        label_sum = estimator_linear_model.validation_label_sum()
        for metrics in validation_metrics:
            assert metrics["label_sum_tensor_fn"] == label_sum
            assert metrics["label_sum_tensor_cls"] == label_sum
            assert metrics["label_sum_list_fn"] == 2 * label_sum
            assert metrics["label_sum_list_cls"] == 2 * label_sum
            assert metrics["label_sum_dict_fn"] == 2 * label_sum
            assert metrics["label_sum_dict_cls"] == 2 * label_sum

        yield workload.terminate_workload(), [], workload.ignore_workload_response

    controller = utils.make_trial_controller_from_trial_implementation(
        trial_class=estimator_linear_model.LinearEstimator,
        hparams=self.hparams,
        workloads=make_workloads(),
        trial_seed=0,
    )
    controller.run()
def test_callbacks(self):
    def make_workloads() -> workload.Stream:
        trainer = utils.TrainAndValidate()
        yield from trainer.send(steps=15, validation_freq=4, scheduling_unit=5)
        training_metrics, validation_metrics = trainer.result()

    hparams = {
        "learning_rate": 0.001,
        "global_batch_size": 3,
        "dataset_range": 10,
        # 15 steps * 5 batches per step * 3 records per batch // 12 records per epoch
        "epochs": 15 * 5 * 3 // 12,
        # steps // validation_freq
        "validations": 3,
    }
    exp_config = utils.make_default_exp_config(
        hparams, scheduling_unit=100, searcher_metric="val_loss"
    )
    exp_config["records_per_epoch"] = 12

    controller = utils.make_trial_controller_from_trial_implementation(
        tf_keras_one_var_model.OneVarTrial,
        hparams,
        make_workloads(),
        exp_config=exp_config,
    )
    controller.run()
def test_manual_init_distributed(self, manual_init_distributed: None):
    updated_hparams = copy.deepcopy(self.hparams)
    updated_hparams["test_manual_init_distributed"] = True

    def make_workloads() -> workload.Stream:
        trainer = utils.TrainAndValidate()

        yield from trainer.send(
            steps=10,
            validation_freq=10,
            train_batch_calls=self.data_parallel_only_auto_train_batch_calls,
        )
        training_metrics, validation_metrics = trainer.result()

        for metrics in validation_metrics:
            assert "loss" in metrics

    _ = utils.make_trial_controller_from_trial_implementation(
        trial_class=deepspeed_linear_model.LinearDeepSpeedTrial,
        hparams=updated_hparams,
        workloads=make_workloads(),
        trial_seed=self.trial_seed,
        expose_gpus=True,
    )
    assert torch.distributed.is_initialized()
def test_ancient_checkpoints(self, ckpt_ver):
    checkpoint_dir = Path(utils.fixtures_path("ancient-checkpoints"))
    latest_checkpoint = f"{ckpt_ver}-keras"

    def make_workloads() -> workload.Stream:
        trainer = utils.TrainAndValidate()
        yield from trainer.send(steps=1, validation_freq=1, scheduling_unit=1)

    hparams = {"learning_rate": 0.001, "global_batch_size": 3, "dataset_range": 10}
    controller = utils.make_trial_controller_from_trial_implementation(
        ancient_keras_ckpt.AncientTrial,
        hparams,
        make_workloads(),
        trial_seed=self.trial_seed,
        checkpoint_dir=str(checkpoint_dir),
        latest_checkpoint=latest_checkpoint,
        steps_completed=1,
    )
    controller.run()
def test_xor_multi(self) -> None:
    def make_workloads() -> workload.Stream:
        trainer = utils.TrainAndValidate()

        yield from trainer.send(steps=1000, validation_freq=100)
        training_metrics, validation_metrics = trainer.result()

        # We expect the validation error and training loss to be
        # monotonically decreasing.
        for older, newer in zip(training_metrics, training_metrics[1:]):
            assert newer["loss"] <= older["loss"]

        for older, newer in zip(validation_metrics, validation_metrics[1:]):
            assert newer["binary_error"] <= older["binary_error"]

        assert validation_metrics[-1]["binary_error"] == pytest.approx(0.0)

        yield workload.terminate_workload(), [], workload.ignore_workload_response

    controller = utils.make_trial_controller_from_trial_implementation(
        trial_class=pytorch_xor_model.XORTrialMulti,
        workloads=make_workloads(),
        hparams=self.hparams,
        trial_seed=self.trial_seed,
    )
    controller.run()
def test_hooks(self) -> None:
    with tempfile.TemporaryDirectory() as temp_directory:
        batches_per_step = 5
        steps = 10
        validation_freq = 5

        def make_workloads() -> workload.Stream:
            trainer = utils.TrainAndValidate()

            yield from trainer.send(
                steps=steps,
                validation_freq=validation_freq,
                batches_per_step=batches_per_step,
            )
            yield workload.terminate_workload(), [], workload.ignore_workload_response

        hparams = self.hparams.copy()
        hparams["training_log_path"] = os.path.join(temp_directory, "training.log")
        hparams["val_log_path"] = os.path.join(temp_directory, "val.log")

        controller = utils.make_trial_controller_from_trial_implementation(
            trial_class=estimator_xor_model.XORTrialWithHooks,
            hparams=hparams,
            workloads=make_workloads(),
            batches_per_step=batches_per_step,
        )
        controller.run()

        with open(hparams["training_log_path"], "r") as fp:
            assert int(fp.readline()) == batches_per_step * steps

        with open(hparams["val_log_path"], "r") as fp:
            assert int(fp.readline()) == steps / validation_freq
def test_reject_unnamed_nondict_metric(self) -> None:
    def make_workloads() -> workload.Stream:
        trainer = utils.TrainAndValidate()
        yield from trainer.send(steps=1, validation_freq=1, scheduling_unit=1)
        yield workload.terminate_workload(), [], workload.ignore_workload_response

    controller = utils.make_trial_controller_from_trial_implementation(
        trial_class=pytorch_onevar_model.OneVarTrial,
        hparams=self.hparams,
        workloads=make_workloads(),
        trial_seed=self.trial_seed,
    )

    def reducer_fn(_):
        return 1.0

    # Inject an unnamed metric which returns a non-dict (which is not allowed).
    controller.context.wrap_reducer(reducer_fn)

    with pytest.raises(AssertionError, match="name=None but it did not return a dict"):
        controller.run()
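# Together with test_reject_named_dict_metric above, this pins down the wrap_reducer
# contract: a named reducer must return a scalar, and an unnamed reducer must return a
# dict of scalars. A minimal sketch of the two accepted shapes (both reducers are
# hypothetical, not part of the fixtures):
def scalar_reducer(values):
    # Valid with wrap_reducer(scalar_reducer, name="mean_value"): a bare scalar.
    return sum(values) / len(values)


def dict_reducer(values):
    # Valid with wrap_reducer(dict_reducer): a dict naming each scalar metric.
    return {"mean_value": sum(values) / len(values), "max_value": max(values)}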
def controller_fn(workloads: workload.Stream) -> det.TrialController:
    return utils.make_trial_controller_from_trial_implementation(
        trial_class=pytorch_xor_model.XORTrial,
        hparams=self.hparams,
        workloads=workloads,
        trial_seed=self.trial_seed,
    )
def test_fail_multiple_set_mpu(self):
    def make_workloads() -> workload.Stream:
        trainer = utils.TrainAndValidate()
        yield from trainer.send(
            steps=1,
            validation_freq=1,
            train_batch_calls=self.data_parallel_only_auto_train_batch_calls,
        )

    with pytest.raises(
        determined.errors.InvalidExperimentException, match=r"Only one MPU can be passed .*"
    ):
        controller = utils.make_trial_controller_from_trial_implementation(
            trial_class=deepspeed_linear_model.LinearDeepSpeedTrial,
            hparams=self.hparams,
            workloads=make_workloads(),
            trial_seed=self.trial_seed,
            expose_gpus=True,
        )
        controller.context.set_mpu(
            det_deepspeed.make_data_parallel_mpu(controller.context.distributed)
        )
        controller.context.set_mpu(
            det_deepspeed.make_data_parallel_mpu(controller.context.distributed)
        )
def test_variable_workload_size(self) -> None:
    def make_workloads() -> workload.Stream:
        training_metrics = []
        interceptor = workload.WorkloadResponseInterceptor()

        total_steps, total_batches_processed = 10, 0
        for step_id in range(1, total_steps):
            num_batches = step_id
            yield from interceptor.send(
                workload.train_workload(
                    step_id,
                    num_batches=num_batches,
                    total_batches_processed=total_batches_processed,
                ),
                [],
            )
            metrics = interceptor.metrics_result()
            batch_metrics = metrics["metrics"]["batch_metrics"]
            assert len(batch_metrics) == num_batches, "did not run for expected num_batches"
            training_metrics.extend(batch_metrics)
            total_batches_processed += num_batches

        yield workload.terminate_workload(), [], workload.ignore_workload_response

    controller = utils.make_trial_controller_from_trial_implementation(
        trial_class=pytorch_xor_model.XORTrial,
        hparams=self.hparams,
        workloads=make_workloads(),
        trial_seed=self.trial_seed,
    )
    controller.run()
def test_onevar_single(self) -> None:
    def make_workloads() -> workload.Stream:
        trainer = utils.TrainAndValidate()

        yield from trainer.send(steps=100, validation_freq=10)
        training_metrics, validation_metrics = trainer.result()

        # Check the gradient update at every step.
        for idx, batch_metrics in enumerate(training_metrics):
            pytorch_onevar_model.OneVarTrial.check_batch_metrics(batch_metrics, idx)

        # We expect the training loss to be monotonically decreasing.
        for older, newer in zip(training_metrics, training_metrics[1:]):
            assert newer["loss"] <= older["loss"]

        yield workload.terminate_workload(), [], workload.ignore_workload_response

    controller = utils.make_trial_controller_from_trial_implementation(
        trial_class=pytorch_onevar_model.OneVarTrial,
        hparams=self.hparams,
        workloads=make_workloads(),
        trial_seed=self.trial_seed,
    )
    controller.run()
def test_fail_restore_invalid_checkpoint(self, tmp_path: pathlib.Path) -> None:
    # Build, train, and save a checkpoint with the normal hyperparameters.
    checkpoint_dir = tmp_path.joinpath("checkpoint")

    def make_workloads_1() -> workload.Stream:
        trainer = utils.TrainAndValidate()

        yield from trainer.send(steps=1, validation_freq=1)
        yield workload.checkpoint_workload(), [
            checkpoint_dir
        ], workload.ignore_workload_response
        yield workload.terminate_workload(), [], workload.ignore_workload_response

    controller1 = utils.make_trial_controller_from_trial_implementation(
        trial_class=pytorch_xor_model.XORTrialMulti,
        hparams=self.hparams,
        workloads=make_workloads_1(),
        trial_seed=self.trial_seed,
    )
    controller1.run()

    # Verify that an invalid architecture fails to load from the checkpoint.
    def make_workloads_2() -> workload.Stream:
        trainer = utils.TrainAndValidate()

        yield from trainer.send(steps=1, validation_freq=1)
        yield workload.checkpoint_workload(), [
            checkpoint_dir
        ], workload.ignore_workload_response
        yield workload.terminate_workload(), [], workload.ignore_workload_response

    hparams2 = {"hidden_size": 3, "learning_rate": 0.5, "global_batch_size": 4}

    with pytest.raises(RuntimeError):
        controller2 = utils.make_trial_controller_from_trial_implementation(
            trial_class=pytorch_xor_model.XORTrialMulti,
            hparams=hparams2,
            workloads=make_workloads_2(),
            load_path=checkpoint_dir,
            trial_seed=self.trial_seed,
        )
        controller2.run()
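# Why hparams2 fails above: hidden_size=3 builds layers whose weight shapes differ from
# the checkpointed model's, and torch's load_state_dict raises RuntimeError on any size
# mismatch. A minimal standalone reproduction of that failure mode (this helper is
# illustrative, not part of the test suite):
def _sketch_state_dict_size_mismatch() -> None:
    saved = torch.nn.Linear(2, 2).state_dict()  # stands in for the checkpointed net
    smaller = torch.nn.Linear(2, 3)  # stands in for the hidden_size=3 architecture
    with pytest.raises(RuntimeError):
        smaller.load_state_dict(saved)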
def controller_fn(workloads: workload.Stream) -> determined.TrialController:
    return utils.make_trial_controller_from_trial_implementation(
        trial_class=deepspeed_linear_model.LinearPipelineEngineTrial,
        hparams=self.hparams,
        workloads=workloads,
        trial_seed=self.trial_seed,
        expose_gpus=True,
    )
def test_restore_invalid_checkpoint(self, tmp_path: pathlib.Path) -> None:
    # Build, train, and save a checkpoint with the normal hyperparameters.
    checkpoint_dir = str(tmp_path.joinpath("checkpoint"))
    latest_checkpoint = None
    steps_completed = 0

    def make_workloads_1() -> workload.Stream:
        trainer = utils.TrainAndValidate()

        yield from trainer.send(steps=1, validation_freq=1)

        interceptor = workload.WorkloadResponseInterceptor()
        yield from interceptor.send(workload.checkpoint_workload())
        nonlocal latest_checkpoint, steps_completed
        latest_checkpoint = interceptor.metrics_result()["uuid"]
        steps_completed = trainer.get_steps_completed()

    controller1 = utils.make_trial_controller_from_trial_implementation(
        trial_class=pytorch_xor_model.XORTrialMulti,
        hparams=self.hparams,
        workloads=make_workloads_1(),
        trial_seed=self.trial_seed,
        checkpoint_dir=checkpoint_dir,
    )
    controller1.run()

    # Verify that an invalid architecture fails to load from the checkpoint.
    def make_workloads_2() -> workload.Stream:
        trainer = utils.TrainAndValidate()
        yield from trainer.send(steps=1, validation_freq=1)

    hparams2 = {"hidden_size": 3, "learning_rate": 0.5, "global_batch_size": 4}

    with pytest.raises(RuntimeError):
        controller2 = utils.make_trial_controller_from_trial_implementation(
            trial_class=pytorch_xor_model.XORTrialMulti,
            hparams=hparams2,
            workloads=make_workloads_2(),
            trial_seed=self.trial_seed,
            checkpoint_dir=checkpoint_dir,
            latest_checkpoint=latest_checkpoint,
            steps_completed=steps_completed,
        )
        controller2.run()
def test_custom_hook(self, tmp_path: Path) -> None:
    checkpoint_dir = str(tmp_path.joinpath("checkpoint"))
    latest_checkpoint = None
    steps_completed = 0

    def make_workloads() -> workload.Stream:
        trainer = utils.TrainAndValidate()

        yield from trainer.send(steps=10, validation_freq=5, scheduling_unit=5)

        interceptor = workload.WorkloadResponseInterceptor()
        yield from interceptor.send(workload.checkpoint_workload())
        nonlocal latest_checkpoint, steps_completed
        latest_checkpoint = interceptor.metrics_result()["uuid"]
        steps_completed = trainer.get_steps_completed()

    def verify_callback(checkpoint_dir: str, checkpoint_num: int) -> None:
        with open(os.path.join(checkpoint_dir, "custom.log"), "r") as fp:
            assert int(fp.readline()) == checkpoint_num

    controller = utils.make_trial_controller_from_trial_implementation(
        trial_class=estimator_xor_model.XORTrialWithCustomHook,
        hparams=self.hparams,
        workloads=make_workloads(),
        scheduling_unit=5,
        checkpoint_dir=checkpoint_dir,
    )
    controller.run()
    verify_callback(os.path.join(checkpoint_dir, latest_checkpoint), checkpoint_num=1)

    controller = utils.make_trial_controller_from_trial_implementation(
        trial_class=estimator_xor_model.XORTrialWithCustomHook,
        hparams=self.hparams,
        workloads=make_workloads(),
        scheduling_unit=5,
        checkpoint_dir=checkpoint_dir,
        latest_checkpoint=latest_checkpoint,
        steps_completed=steps_completed,
    )
    controller.run()
    verify_callback(os.path.join(checkpoint_dir, latest_checkpoint), checkpoint_num=2)
def test_lr_schedule_and_lr_checkpoint(self, tmp_path: pathlib.Path) -> None:
    checkpoint_dir = tmp_path.joinpath("checkpoint")
    training_metrics = []

    def make_workloads(checkpoint_dir: str = "") -> workload.Stream:
        nonlocal training_metrics

        trainer = utils.TrainAndValidate()

        yield from trainer.send(steps=10, validation_freq=10, batches_per_step=1)
        tm, _ = trainer.result()
        training_metrics += tm

        if checkpoint_dir:
            yield workload.checkpoint_workload(), [
                checkpoint_dir
            ], workload.ignore_workload_response

        yield workload.terminate_workload(), [], workload.ignore_workload_response

    controller = utils.make_trial_controller_from_trial_implementation(
        trial_class=pytorch_xor_model.XORTrialRestoreLR,
        hparams=self.hparams,
        workloads=make_workloads(checkpoint_dir),
        trial_seed=self.trial_seed,
    )
    controller.run()

    controller = utils.make_trial_controller_from_trial_implementation(
        trial_class=pytorch_xor_model.XORTrialRestoreLR,
        hparams=self.hparams,
        workloads=make_workloads(),
        load_path=checkpoint_dir,
        trial_seed=self.trial_seed,
    )
    controller.run()

    lrs = [metric["lr"] for metric in training_metrics]
    for i in range(1, len(lrs)):
        assert lrs[i] == lrs[i - 1] + 1
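# The final assertion implies XORTrialRestoreLR's scheduler raises the LR by exactly 1
# every batch, and that the schedule position survives the checkpoint/restore round
# trip. A minimal sketch of such a schedule in plain PyTorch (the fixture itself goes
# through Determined's LRScheduler wrapper; this helper is illustrative only):
def _make_incrementing_scheduler(optimizer, base_lr):
    # LambdaLR sets lr = base_lr * lr_lambda(step); choosing (base_lr + step) / base_lr
    # yields lr = base_lr + step, i.e. +1 per scheduler.step() (base_lr must be nonzero).
    return torch.optim.lr_scheduler.LambdaLR(
        optimizer, lr_lambda=lambda step: (base_lr + step) / base_lr
    )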
def make_trial_controller_fn(
    workloads: workload.Stream, load_path: typing.Optional[str] = None
) -> det.TrialController:
    return utils.make_trial_controller_from_trial_implementation(
        trial_class=la_model.OneVarTrial,
        hparams=self.hparams,
        workloads=workloads,
        load_path=load_path,
        trial_seed=self.trial_seed,
    )