    def test_one_var_training(self, test_checkpointing, tmp_path):
        checkpoint_dir = tmp_path.joinpath("checkpoint")

        # In the test_checkpointing case, we will call make_workloads() twice but batches and w
        # will persist across both calls.
        batches = enumerate([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]])
        w = 0.0

        trial_class = tf_keras_one_var_model.OneVarTrial

        def make_workloads() -> workload.Stream:
            nonlocal w
            interceptor = workload.WorkloadResponseInterceptor()

            for idx, batch in batches:
                yield from interceptor.send(workload.train_workload(1), [])
                metrics = interceptor.metrics_result()

                # Calculate what the loss should be.
                loss = trial_class.calc_loss(w, batch)

                epsilon = 0.0001
                assert abs(metrics["metrics"]["avg_metrics"]["loss"] - loss) < epsilon

                # Update what the weight should be.
                w = w - hparams["learning_rate"] * trial_class.calc_gradient(w, batch)

                if test_checkpointing and idx == 3:
                    # Checkpoint and let the next TrialController finish the work.
                    yield workload.checkpoint_workload(), [
                        checkpoint_dir
                    ], workload.ignore_workload_response
                    break

            yield workload.terminate_workload(), [], workload.ignore_workload_response

        hparams = {"learning_rate": 0.001, "global_batch_size": 3, "dataset_range": 10}
        exp_config = utils.make_default_exp_config(hparams, scheduling_unit=100)
        exp_config["records_per_epoch"] = 100
        # TODO(DET-2436): Add a unit test for native implementation with tf dataset.
        controller = utils.make_trial_controller_from_trial_implementation(
            trial_class,
            hparams,
            make_workloads(),
            exp_config=exp_config,
            trial_seed=self.trial_seed,
        )
        controller.run()

        # In the checkpointing case, we need to create another controller to finish training.
        if test_checkpointing:
            controller = utils.make_trial_controller_from_trial_implementation(
                trial_class,
                hparams,
                make_workloads(),
                exp_config=exp_config,
                load_path=checkpoint_dir,
                trial_seed=self.trial_seed,
            )
            controller.run()
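
    # For reference, a minimal sketch of the helpers this test leans on,
    # assuming the one-variable model is least squares on y_hat = w * x with
    # targets y = x. The real implementations live in
    # tf_keras_one_var_model.OneVarTrial; these stand-ins are hypothetical.
    @staticmethod
    def _sketch_calc_loss(w: float, batch: list) -> float:
        # MSE over the batch: mean((w*x - x)^2)
        return sum((w * x - x) ** 2 for x in batch) / len(batch)

    @staticmethod
    def _sketch_calc_gradient(w: float, batch: list) -> float:
        # d/dw of the MSE: mean(2 * x * (w*x - x))
        return sum(2 * x * (w * x - x) for x in batch) / len(batch)
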
    def test_custom_hook(self, tmp_path: Path) -> None:
        def make_workloads(checkpoint_dir: pathlib.Path) -> workload.Stream:
            trainer = utils.TrainAndValidate()

            yield from trainer.send(steps=10, validation_freq=5, batches_per_step=5)
            yield workload.checkpoint_workload(), [
                checkpoint_dir
            ], workload.ignore_workload_response
            yield workload.terminate_workload(), [], workload.ignore_workload_response

        def verify_callback(checkpoint_dir: pathlib.Path, checkpoint_num: int) -> None:
            with open(str(checkpoint_dir.joinpath("custom.log")), "r") as fp:
                assert int(fp.readline()) == checkpoint_num

        checkpoint_dir1 = tmp_path.joinpath("checkpoint1")
        controller = utils.make_trial_controller_from_trial_implementation(
            trial_class=estimator_xor_model.XORTrialWithCustomHook,
            hparams=self.hparams,
            workloads=make_workloads(checkpoint_dir=checkpoint_dir1),
            batches_per_step=5,
        )
        controller.run()
        verify_callback(checkpoint_dir=checkpoint_dir1, checkpoint_num=1)

        checkpoint_dir2 = tmp_path.joinpath("checkpoint2")
        controller = utils.make_trial_controller_from_trial_implementation(
            trial_class=estimator_xor_model.XORTrialWithCustomHook,
            hparams=self.hparams,
            workloads=make_workloads(checkpoint_dir=checkpoint_dir2),
            batches_per_step=5,
            load_path=checkpoint_dir1,
        )
        controller.run()
        verify_callback(checkpoint_dir=checkpoint_dir2, checkpoint_num=2)
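
    # One plausible shape for the fixture's custom hook (hypothetical sketch;
    # assumes `import tensorflow as tf`). CheckpointSaverListener.after_save()
    # fires once per checkpoint save, and for the count to read 2 after the
    # restored run, the counter must survive the restore, e.g. by living in a
    # TF variable that is saved and loaded with the model checkpoint.
    class _SketchCountingListener(tf.estimator.CheckpointSaverListener):
        def __init__(self, log_path, counter_var):
            self._log_path = log_path
            self._counter = counter_var  # restored along with the checkpoint

        def after_save(self, session, global_step_value):
            count = session.run(self._counter.assign_add(1))
            with open(self._log_path, "w") as fp:
                fp.write(str(count))
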
    def test_custom_eval(self) -> None:
        training_metrics = {}
        validation_metrics = {}

        def make_workloads(tag: str) -> workload.Stream:
            trainer = utils.TrainAndValidate()

            yield from trainer.send(steps=900, validation_freq=100)
            tm, vm = trainer.result()
            training_metrics[tag] = tm
            validation_metrics[tag] = vm

            yield workload.terminate_workload(), [], workload.ignore_workload_response

        controller = utils.make_trial_controller_from_trial_implementation(
            trial_class=pytorch_xor_model.XORTrial,
            hparams=self.hparams,
            workloads=make_workloads("A"),
            trial_seed=self.trial_seed,
        )
        controller.run()

        controller = utils.make_trial_controller_from_trial_implementation(
            trial_class=pytorch_xor_model.XORTrialCustomEval,
            hparams=self.hparams,
            workloads=make_workloads("B"),
            trial_seed=self.trial_seed,
        )
        controller.run()

        for original, custom_eval in zip(training_metrics["A"], training_metrics["B"]):
            assert original["loss"] == custom_eval["loss"]

        for original, custom_eval in zip(validation_metrics["A"], validation_metrics["B"]):
            assert original["loss"] == custom_eval["loss"]
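
    # The custom-eval variant plausibly overrides evaluate_full_dataset(), a
    # real determined.pytorch.PyTorchTrial hook, in place of the default
    # evaluate_batch() path; both compute the same mean loss here, hence the
    # exact metric match asserted above. Hypothetical sketch (`self.model` and
    # `self.loss_fn` are assumed attribute names):
    class _SketchCustomEval(pytorch_xor_model.XORTrial):
        def evaluate_full_dataset(self, data_loader):
            # Reduce over the whole validation set manually.
            losses = [
                self.loss_fn(self.model(data), labels).item()
                for data, labels in data_loader
            ]
            return {"loss": sum(losses) / len(losses)}
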

    # Example 4

    def test_callbacks(self, tmp_path: pathlib.Path) -> None:
        checkpoint_dir = tmp_path.joinpath("checkpoint")

        controller = None  # type: ignore

        def make_workloads1() -> workload.Stream:
            nonlocal controller

            yield workload.train_workload(1, 1, 0), [], workload.ignore_workload_response
            assert controller is not None, "controller was never set!"
            assert controller.trial.counter.__dict__ == {
                "validation_steps_started": 0,
                "validation_steps_ended": 0,
                "checkpoints_ended": 0,
            }

            yield workload.validation_workload(), [], workload.ignore_workload_response
            assert controller.trial.counter.__dict__ == {
                "validation_steps_started": 1,
                "validation_steps_ended": 1,
                "checkpoints_ended": 0,
            }

            yield workload.checkpoint_workload(), [
                checkpoint_dir
            ], workload.ignore_workload_response
            assert controller.trial.counter.__dict__ == {
                "validation_steps_started": 1,
                "validation_steps_ended": 1,
                "checkpoints_ended": 1,
            }

            yield workload.terminate_workload(), [], workload.ignore_workload_response

        controller = utils.make_trial_controller_from_trial_implementation(
            trial_class=pytorch_xor_model.XORTrialCallbacks,
            hparams=self.hparams,
            workloads=make_workloads1(),
        )
        controller.run()

        # Verify the checkpoint loading callback works.

        def make_workloads2() -> workload.Stream:
            yield workload.terminate_workload(), [], workload.ignore_workload_response

        controller = utils.make_trial_controller_from_trial_implementation(
            trial_class=pytorch_xor_model.XORTrialCallbacks,
            hparams=self.hparams,
            workloads=make_workloads2(),
            load_path=checkpoint_dir,
        )
        controller.run()
        assert controller.trial.counter.__dict__ == {
            "validation_steps_started": 1,
            "validation_steps_ended": 1,
            "checkpoints_ended": 0,
        }
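
    # The counter asserted on above is plausibly a determined PyTorchCallback.
    # A hypothetical sketch consistent with those assertions: the validation
    # counts round-trip through state_dict()/load_state_dict() and so survive
    # the restore, while checkpoints_ended is deliberately omitted and reads 0
    # after loading. (Assumes `from determined import pytorch as det_pytorch`.)
    class _SketchCounter(det_pytorch.PyTorchCallback):
        def __init__(self):
            self.validation_steps_started = 0
            self.validation_steps_ended = 0
            self.checkpoints_ended = 0

        def on_validation_start(self):
            self.validation_steps_started += 1

        def on_validation_end(self, metrics):
            self.validation_steps_ended += 1

        def on_checkpoint_end(self, checkpoint_dir):
            self.checkpoints_ended += 1

        def state_dict(self):
            return {
                "validation_steps_started": self.validation_steps_started,
                "validation_steps_ended": self.validation_steps_ended,
            }

        def load_state_dict(self, state_dict):
            self.__dict__.update(state_dict)
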

    # Example 5

    def test_grad_clipping(self) -> None:
        training_metrics = {}
        validation_metrics = {}

        def make_workloads(tag: str) -> workload.Stream:
            trainer = utils.TrainAndValidate()

            yield from trainer.send(steps=1000, validation_freq=100)
            tm, vm = trainer.result()
            training_metrics[tag] = tm
            validation_metrics[tag] = vm

            yield workload.terminate_workload(), [], workload.ignore_workload_response

        controller = utils.make_trial_controller_from_trial_implementation(
            trial_class=pytorch_xor_model.XORTrialGradClipping,
            hparams=self.hparams,
            workloads=make_workloads("original"),
            trial_seed=self.trial_seed,
        )
        controller.run()

        updated_hparams = {"gradient_clipping_l2_norm": 0.0001, **self.hparams}
        controller = utils.make_trial_controller_from_trial_implementation(
            trial_class=pytorch_xor_model.XORTrialGradClipping,
            hparams=updated_hparams,
            workloads=make_workloads("clipped_by_norm"),
            trial_seed=self.trial_seed,
        )
        controller.run()

        for idx, (original, clipped) in enumerate(
                zip(training_metrics["original"],
                    training_metrics["clipped_by_norm"])):
            if idx < 10:
                continue
            assert original["loss"] != clipped["loss"]

        updated_hparams = {"gradient_clipping_value": 0.0001, **self.hparams}
        controller = utils.make_trial_controller_from_trial_implementation(
            trial_class=pytorch_xor_model.XORTrialGradClipping,
            hparams=updated_hparams,
            workloads=make_workloads("clipped_by_val"),
            trial_seed=self.trial_seed,
        )
        controller.run()

        for idx, (original, clipped) in enumerate(
                zip(training_metrics["original"],
                    training_metrics["clipped_by_val"])):
            if idx < 10:
                continue
            assert original["loss"] != clipped["loss"]
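
    # Inside the trial, the two hparams plausibly map onto PyTorch's standard
    # clipping utilities, applied between backward() and the optimizer step.
    # Hypothetical sketch (assumes `import torch`):
    @staticmethod
    def _sketch_apply_clipping(parameters, hparams):
        if "gradient_clipping_l2_norm" in hparams:
            torch.nn.utils.clip_grad_norm_(parameters, hparams["gradient_clipping_l2_norm"])
        elif "gradient_clipping_value" in hparams:
            torch.nn.utils.clip_grad_value_(parameters, hparams["gradient_clipping_value"])
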
    def test_checkpointing(self, tmp_path: pathlib.Path) -> None:
        checkpoint_dir = tmp_path.joinpath("checkpoint")

        old_error = -1

        def make_workloads_1() -> workload.Stream:
            nonlocal old_error

            trainer = utils.TrainAndValidate()

            yield from trainer.send(steps=10, validation_freq=10)
            training_metrics, validation_metrics = trainer.result()
            old_error = validation_metrics[-1]["binary_error"]

            yield workload.checkpoint_workload(), [
                checkpoint_dir
            ], workload.ignore_workload_response

            yield workload.terminate_workload(), [], workload.ignore_workload_response

        controller = utils.make_trial_controller_from_trial_implementation(
            trial_class=pytorch_xor_model.XORTrialMulti,
            hparams=self.hparams,
            workloads=make_workloads_1(),
            trial_seed=self.trial_seed,
        )
        controller.run()

        # Restore the checkpoint on a new trial instance and recompute
        # validation. The validation error should be the same as it was
        # previously.
        def make_workloads_2() -> workload.Stream:
            interceptor = workload.WorkloadResponseInterceptor()

            yield from interceptor.send(workload.validation_workload(), [])
            metrics = interceptor.metrics_result()

            new_error = metrics["metrics"]["validation_metrics"]["binary_error"]
            assert new_error == pytest.approx(old_error)

            yield workload.terminate_workload(), [], workload.ignore_workload_response

        controller = utils.make_trial_controller_from_trial_implementation(
            trial_class=pytorch_xor_model.XORTrialMulti,
            hparams=self.hparams,
            workloads=make_workloads_2(),
            load_path=checkpoint_dir,
            trial_seed=self.trial_seed,
        )
        controller.run()

    # Example 7

    def test_restore_invalid_checkpoint(self, tmp_path: pathlib.Path) -> None:
        # Build, train, and save a checkpoint with the normal hyperparameters.
        checkpoint_dir = str(tmp_path.joinpath("checkpoint"))
        latest_checkpoint = None
        steps_completed = 0

        def make_workloads_1() -> workload.Stream:
            trainer = utils.TrainAndValidate()
            yield from trainer.send(
                steps=1,
                validation_freq=1,
                train_batch_calls=self.data_parallel_only_auto_train_batch_calls,
            )
            interceptor = workload.WorkloadResponseInterceptor()
            yield from interceptor.send(workload.checkpoint_workload())
            nonlocal latest_checkpoint, steps_completed
            latest_checkpoint = interceptor.metrics_result()["uuid"]
            steps_completed = trainer.get_steps_completed()

        controller1 = utils.make_trial_controller_from_trial_implementation(
            trial_class=deepspeed_linear_model.LinearDeepSpeedTrial,
            hparams=self.hparams,
            workloads=make_workloads_1(),
            trial_seed=self.trial_seed,
            checkpoint_dir=checkpoint_dir,
            expose_gpus=True,
        )
        controller1.run()

        # Verify that an invalid architecture fails to load from the checkpoint.
        def make_workloads_2() -> workload.Stream:
            trainer = utils.TrainAndValidate()
            yield from trainer.send(
                steps=1,
                validation_freq=1,
                train_batch_calls=self.data_parallel_only_auto_train_batch_calls,
            )

        with pytest.raises(AssertionError, match="Failed to load deepspeed checkpoint."):
            controller2 = utils.make_trial_controller_from_trial_implementation(
                trial_class=deepspeed_linear_model.LinearTwoEngineTrial,
                hparams=self.hparams,
                workloads=make_workloads_2(),
                trial_seed=self.trial_seed,
                checkpoint_dir=checkpoint_dir,
                latest_checkpoint=latest_checkpoint,
                steps_completed=steps_completed,
                expose_gpus=True,
            )
            controller2.run()

    def test_callbacks(self, tmp_path: pathlib.Path) -> None:
        checkpoint_dir = tmp_path.joinpath("checkpoint")
        controller = utils.make_trial_controller_from_trial_implementation(
            trial_class=pytorch_xor_model.XORTrialCallbacks,
            hparams=self.hparams,
            workloads=[])
        controller._train_for_step(1, 1)
        assert controller.trial.counter.__dict__ == {
            "train_steps_started": 1,
            "train_steps_ended": 1,
            "validation_steps_started": 0,
            "validation_steps_ended": 0,
            "checkpoints_ended": 0,
        }

        controller._compute_validation_metrics()
        assert controller.trial.counter.__dict__ == {
            "train_steps_started": 1,
            "train_steps_ended": 1,
            "validation_steps_started": 1,
            "validation_steps_ended": 1,
            "checkpoints_ended": 0,
        }

        controller._save(checkpoint_dir)
        assert controller.trial.counter.__dict__ == {
            "train_steps_started": 1,
            "train_steps_ended": 1,
            "validation_steps_started": 1,
            "validation_steps_ended": 1,
            "checkpoints_ended": 1,
        }

        del controller

        controller = utils.make_trial_controller_from_trial_implementation(
            trial_class=pytorch_xor_model.XORTrialCallbacks,
            hparams=self.hparams,
            workloads=[],
            load_path=checkpoint_dir,
        )
        controller._load()
        assert controller.trial.counter.__dict__ == {
            "train_steps_started": 1,
            "train_steps_ended": 1,
            "validation_steps_started": 1,
            "validation_steps_ended": 1,
            "checkpoints_ended": 0,
        }

    # Example 9

    def test_reject_named_dict_metric(self) -> None:
        # If at some point in the future the webui is able to render scalar metrics inside of
        # nested dictionary metrics, this test could go away.

        def make_workloads() -> workload.Stream:
            trainer = utils.TrainAndValidate()
            yield from trainer.send(steps=1,
                                    validation_freq=1,
                                    scheduling_unit=1)
            yield workload.terminate_workload(), [], workload.ignore_workload_response

        controller = utils.make_trial_controller_from_trial_implementation(
            trial_class=pytorch_onevar_model.OneVarTrial,
            hparams=self.hparams,
            workloads=make_workloads(),
            trial_seed=self.trial_seed,
        )

        def reducer_fn(_):
            return {"my_metric": 1.0}

        # Inject a named metric which returns a dict (which is not allowed).
        controller.context.wrap_reducer(reducer_fn, name="my_metric")

        with pytest.raises(
                AssertionError,
                match="with name set but it returned a dict anyway"):
            controller.run()

    # Example 10

    def test_fail_dataset_repro_check(self) -> None:
        updated_hparams = copy.deepcopy(self.hparams)
        updated_hparams["test_fail_dataset_repro_check"] = True

        def make_workloads() -> workload.Stream:
            trainer = utils.TrainAndValidate()

            yield from trainer.send(
                steps=10,
                validation_freq=10,
                train_batch_calls=self.data_parallel_only_auto_train_batch_calls,
            )
            training_metrics, validation_metrics = trainer.result()

            for metrics in validation_metrics:
                assert "loss" in metrics

        with pytest.raises(RuntimeError, match=r".* reproducibility .* disable this check .*"):
            controller = utils.make_trial_controller_from_trial_implementation(
                trial_class=deepspeed_linear_model.LinearDeepSpeedTrial,
                hparams=updated_hparams,
                workloads=make_workloads(),
                trial_seed=self.trial_seed,
                expose_gpus=True,
            )
            controller.run()

    def test_custom_dataloader(self) -> None:
        def make_workloads() -> workload.Stream:
            trainer = utils.TrainAndValidate()

            yield from trainer.send(steps=100, validation_freq=10)
            training_metrics, validation_metrics = trainer.result()

            # Check the gradient update at every step.
            for idx, batch_metrics in enumerate(training_metrics):
                pytorch_onevar_model.OneVarTrial.check_batch_metrics(
                    batch_metrics, idx)

            # We expect the training loss to be monotonically decreasing.
            for older, newer in zip(training_metrics, training_metrics[1:]):
                assert newer["loss"] <= older["loss"]

        hparams = dict(self.hparams)
        hparams["dataloader_type"] = "torch"
        hparams["disable_dataset_reproducibility_checks"] = True

        controller = utils.make_trial_controller_from_trial_implementation(
            trial_class=pytorch_onevar_model.OneVarTrial,
            hparams=hparams,
            workloads=make_workloads(),
            trial_seed=self.trial_seed,
        )
        controller.run()
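
    # With dataloader_type == "torch", the trial presumably returns a plain
    # torch DataLoader rather than determined.pytorch.DataLoader, which the
    # harness only accepts when dataset reproducibility checks are disabled,
    # hence the second hparam above. Hypothetical sketch (assumes `import
    # torch` and a `self.dataset` attribute):
    #
    #     def build_training_data_loader(self):
    #         batch_size = self.context.get_per_slot_batch_size()
    #         if self.context.get_hparam("dataloader_type") == "torch":
    #             return torch.utils.data.DataLoader(self.dataset, batch_size=batch_size)
    #         return det.pytorch.DataLoader(self.dataset, batch_size=batch_size)
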

    # Example 12

    def test_linear_non_scalar_metrics(self) -> None:
        updated_hparams = copy.deepcopy(self.hparams)
        updated_hparams["return_non_scalar_metrics"] = True

        def make_workloads() -> workload.Stream:
            trainer = utils.TrainAndValidate()

            yield from trainer.send(
                steps=10,
                validation_freq=10,
                train_batch_calls=self.data_parallel_only_auto_train_batch_calls,
            )
            training_metrics, validation_metrics = trainer.result()

            for metrics in validation_metrics:
                assert "loss" in metrics

        controller = utils.make_trial_controller_from_trial_implementation(
            trial_class=deepspeed_linear_model.LinearDeepSpeedTrial,
            hparams=updated_hparams,
            workloads=make_workloads(),
            trial_seed=self.trial_seed,
            expose_gpus=True,
        )
        controller.run()

    def test_end_of_training_hook(self):
        with tempfile.TemporaryDirectory() as temp_directory:

            def make_workloads() -> workload.Stream:
                trainer = utils.TrainAndValidate()

                yield from trainer.send(steps=2,
                                        validation_freq=2,
                                        batches_per_step=5)
                yield workload.terminate_workload(), [], workload.ignore_workload_response

            hparams = self.hparams.copy()
            hparams["training_end"] = os.path.join(temp_directory,
                                                   "training_end.log")

            controller = utils.make_trial_controller_from_trial_implementation(
                trial_class=estimator_xor_model.XORTrialEndOfTrainingHook,
                hparams=hparams,
                workloads=make_workloads(),
                batches_per_step=5,
            )
            controller.run()

            with open(hparams["training_end"], "r") as fp:
                assert fp.readline() == "success"
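
    # XORTrialEndOfTrainingHook plausibly attaches a hook like this one:
    # tf.estimator.SessionRunHook.end() runs once when training finishes
    # cleanly, which is what the "success" line above verifies. Hypothetical
    # sketch; assumes `import tensorflow as tf`.
    class _SketchTrainingEndHook(tf.estimator.SessionRunHook):
        def __init__(self, log_path: str) -> None:
            self._log_path = log_path

        def end(self, session) -> None:
            # Called exactly once at the end of training.
            with open(self._log_path, "w") as fp:
                fp.write("success")
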
    def test_custom_reducer(self) -> None:
        def make_workloads() -> workload.Stream:
            trainer = utils.TrainAndValidate()

            # Test >1 validation to ensure that resetting the allgather_op list is working.
            yield from trainer.send(steps=2,
                                    validation_freq=1,
                                    scheduling_unit=1)
            training_metrics, validation_metrics = trainer.result()

            label_sum = estimator_linear_model.validation_label_sum()
            for metrics in validation_metrics:
                assert metrics["label_sum_tensor_fn"] == label_sum
                assert metrics["label_sum_tensor_cls"] == label_sum
                assert metrics["label_sum_list_fn"] == 2 * label_sum
                assert metrics["label_sum_list_cls"] == 2 * label_sum
                assert metrics["label_sum_dict_fn"] == 2 * label_sum
                assert metrics["label_sum_dict_cls"] == 2 * label_sum

            yield workload.terminate_workload(), [], workload.ignore_workload_response

        controller = utils.make_trial_controller_from_trial_implementation(
            trial_class=estimator_linear_model.LinearEstimator,
            hparams=self.hparams,
            workloads=make_workloads(),
            trial_seed=0,
        )
        controller.run()

    # Example 15

    def test_callbacks(self):
        def make_workloads() -> workload.Stream:
            trainer = utils.TrainAndValidate()

            yield from trainer.send(steps=15,
                                    validation_freq=4,
                                    scheduling_unit=5)
            training_metrics, validation_metrics = trainer.result()

        hparams = {
            "learning_rate": 0.001,
            "global_batch_size": 3,
            "dataset_range": 10,
            # 15 steps * 5 batches per step * 3 records per batch // 12 records per epoch = 18
            "epochs": 15 * 5 * 3 // 12,
            # steps // validation_freq = 15 // 4 = 3
            "validations": 3,
        }
        exp_config = utils.make_default_exp_config(hparams,
                                                   scheduling_unit=100,
                                                   searcher_metric="val_loss")
        exp_config["records_per_epoch"] = 12

        controller = utils.make_trial_controller_from_trial_implementation(
            tf_keras_one_var_model.OneVarTrial,
            hparams,
            make_workloads(),
            exp_config=exp_config,
        )
        controller.run()

    # Example 16

    def test_manual_init_distributed(self, manual_init_distributed: None):
        updated_hparams = copy.deepcopy(self.hparams)
        updated_hparams["test_manual_init_distributed"] = True

        def make_workloads() -> workload.Stream:
            trainer = utils.TrainAndValidate()

            yield from trainer.send(
                steps=10,
                validation_freq=10,
                train_batch_calls=self.data_parallel_only_auto_train_batch_calls,
            )
            training_metrics, validation_metrics = trainer.result()

            for metrics in validation_metrics:
                assert "loss" in metrics

        _ = utils.make_trial_controller_from_trial_implementation(
            trial_class=deepspeed_linear_model.LinearDeepSpeedTrial,
            hparams=updated_hparams,
            workloads=make_workloads(),
            trial_seed=self.trial_seed,
            expose_gpus=True,
        )
        assert torch.distributed.is_initialized()

    # Example 17

    def test_ancient_checkpoints(self, ckpt_ver):
        checkpoint_dir = Path(utils.fixtures_path("ancient-checkpoints"))
        latest_checkpoint = f"{ckpt_ver}-keras"

        def make_workloads() -> workload.Stream:
            trainer = utils.TrainAndValidate()
            yield from trainer.send(steps=1,
                                    validation_freq=1,
                                    scheduling_unit=1)

        hparams = {
            "learning_rate": 0.001,
            "global_batch_size": 3,
            "dataset_range": 10
        }
        controller = utils.make_trial_controller_from_trial_implementation(
            ancient_keras_ckpt.AncientTrial,
            hparams,
            make_workloads(),
            trial_seed=self.trial_seed,
            checkpoint_dir=str(checkpoint_dir),
            latest_checkpoint=latest_checkpoint,
            steps_completed=1,
        )
        controller.run()

    # Example 18

    def test_xor_multi(self) -> None:
        def make_workloads() -> workload.Stream:
            trainer = utils.TrainAndValidate()

            yield from trainer.send(steps=1000, validation_freq=100)
            training_metrics, validation_metrics = trainer.result()

            # We expect the validation error and training loss to be
            # monotonically decreasing.
            for older, newer in zip(training_metrics, training_metrics[1:]):
                assert newer["loss"] <= older["loss"]

            for older, newer in zip(validation_metrics,
                                    validation_metrics[1:]):
                assert newer["binary_error"] <= older["binary_error"]

            assert validation_metrics[-1]["binary_error"] == pytest.approx(0.0)

            yield workload.terminate_workload(), [], workload.ignore_workload_response

        controller = utils.make_trial_controller_from_trial_implementation(
            trial_class=pytorch_xor_model.XORTrialMulti,
            workloads=make_workloads(),
            hparams=self.hparams,
            trial_seed=self.trial_seed,
        )
        controller.run()

    def test_hooks(self) -> None:
        with tempfile.TemporaryDirectory() as temp_directory:
            batches_per_step = 5
            steps = 10
            validation_freq = 5

            def make_workloads() -> workload.Stream:
                trainer = utils.TrainAndValidate()

                yield from trainer.send(steps=steps,
                                        validation_freq=validation_freq,
                                        batches_per_step=batches_per_step)
                yield workload.terminate_workload(), [], workload.ignore_workload_response

            hparams = self.hparams.copy()
            hparams["training_log_path"] = os.path.join(
                temp_directory, "training.log")
            hparams["val_log_path"] = os.path.join(temp_directory, "val.log")

            controller = utils.make_trial_controller_from_trial_implementation(
                trial_class=estimator_xor_model.XORTrialWithHooks,
                hparams=hparams,
                workloads=make_workloads(),
                batches_per_step=batches_per_step,
            )
            controller.run()

            with open(hparams["training_log_path"], "r") as fp:
                assert int(fp.readline()) == batches_per_step * steps

            with open(hparams["val_log_path"], "r") as fp:
                assert int(fp.readline()) == steps / validation_freq

    # Example 20

    def test_reject_unnamed_nondict_metric(self) -> None:
        def make_workloads() -> workload.Stream:
            trainer = utils.TrainAndValidate()
            yield from trainer.send(steps=1,
                                    validation_freq=1,
                                    scheduling_unit=1)
            yield workload.terminate_workload(), [], workload.ignore_workload_response

        controller = utils.make_trial_controller_from_trial_implementation(
            trial_class=pytorch_onevar_model.OneVarTrial,
            hparams=self.hparams,
            workloads=make_workloads(),
            trial_seed=self.trial_seed,
        )

        def reducer_fn(_):
            return 1.0

        # Inject an unnamed metric which returns a non-dict (which is not allowed).
        controller.context.wrap_reducer(reducer_fn)

        with pytest.raises(AssertionError,
                           match="name=None but it did not return a dict"):
            controller.run()
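
    # For contrast with the two rejection tests above, the combinations
    # wrap_reducer accepts are (sketch):
    #
    #     controller.context.wrap_reducer(lambda _: 1.0, name="my_metric")  # named -> scalar
    #     controller.context.wrap_reducer(lambda _: {"my_metric": 1.0})     # unnamed -> dict
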

    # Example 21

    def controller_fn(workloads: workload.Stream) -> det.TrialController:
        return utils.make_trial_controller_from_trial_implementation(
            trial_class=pytorch_xor_model.XORTrial,
            hparams=self.hparams,
            workloads=workloads,
            trial_seed=self.trial_seed,
        )

    # Example 22

    def test_fail_multiple_set_mpu(self):
        def make_workloads() -> workload.Stream:
            trainer = utils.TrainAndValidate()

            yield from trainer.send(
                steps=1,
                validation_freq=1,
                train_batch_calls=self.data_parallel_only_auto_train_batch_calls,
            )

        with pytest.raises(
            determined.errors.InvalidExperimentException, match=r"Only one MPU can be passed .*"
        ):
            controller = utils.make_trial_controller_from_trial_implementation(
                trial_class=deepspeed_linear_model.LinearDeepSpeedTrial,
                hparams=self.hparams,
                workloads=make_workloads(),
                trial_seed=self.trial_seed,
                expose_gpus=True,
            )
            controller.context.set_mpu(
                det_deepspeed.make_data_parallel_mpu(controller.context.distributed)
            )
            controller.context.set_mpu(
                det_deepspeed.make_data_parallel_mpu(controller.context.distributed)
            )

    # Example 23

    def test_variable_workload_size(self) -> None:
        def make_workloads() -> workload.Stream:
            training_metrics = []
            interceptor = workload.WorkloadResponseInterceptor()

            total_steps, total_batches_processed = 10, 0
            for step_id in range(1, total_steps):
                num_batches = step_id
                yield from interceptor.send(
                    workload.train_workload(
                        step_id,
                        num_batches=num_batches,
                        total_batches_processed=total_batches_processed,
                    ),
                    [],
                )
                metrics = interceptor.metrics_result()
                batch_metrics = metrics["metrics"]["batch_metrics"]
                assert len(batch_metrics) == num_batches, "did not run for expected num_batches"
                training_metrics.extend(batch_metrics)
                total_batches_processed += num_batches

            yield workload.terminate_workload(), [], workload.ignore_workload_response

        controller = utils.make_trial_controller_from_trial_implementation(
            trial_class=pytorch_xor_model.XORTrial,
            hparams=self.hparams,
            workloads=make_workloads(),
            trial_seed=self.trial_seed,
        )
        controller.run()

    # Example 24

    def test_onevar_single(self) -> None:
        def make_workloads() -> workload.Stream:
            trainer = utils.TrainAndValidate()

            yield from trainer.send(steps=100, validation_freq=10)
            training_metrics, validation_metrics = trainer.result()

            # Check the gradient update at every step.
            for idx, batch_metrics in enumerate(training_metrics):
                pytorch_onevar_model.OneVarTrial.check_batch_metrics(
                    batch_metrics, idx)

            # We expect the training loss to be monotonically decreasing.
            for older, newer in zip(training_metrics, training_metrics[1:]):
                assert newer["loss"] <= older["loss"]

            yield workload.terminate_workload(), [], workload.ignore_workload_response

        controller = utils.make_trial_controller_from_trial_implementation(
            trial_class=pytorch_onevar_model.OneVarTrial,
            hparams=self.hparams,
            workloads=make_workloads(),
            trial_seed=self.trial_seed,
        )
        controller.run()

    def test_fail_restore_invalid_checkpoint(self, tmp_path: pathlib.Path) -> None:
        # Build, train, and save a checkpoint with the normal hyperparameters.
        checkpoint_dir = tmp_path.joinpath("checkpoint")

        def make_workloads_1() -> workload.Stream:
            trainer = utils.TrainAndValidate()
            yield from trainer.send(steps=1, validation_freq=1)
            yield workload.checkpoint_workload(), [
                checkpoint_dir
            ], workload.ignore_workload_response
            yield workload.terminate_workload(), [], workload.ignore_workload_response

        controller1 = utils.make_trial_controller_from_trial_implementation(
            trial_class=pytorch_xor_model.XORTrialMulti,
            hparams=self.hparams,
            workloads=make_workloads_1(),
            trial_seed=self.trial_seed,
        )
        controller1.run()

        # Verify that an invalid architecture fails to load from the checkpoint.
        def make_workloads_2() -> workload.Stream:
            trainer = utils.TrainAndValidate()
            yield from trainer.send(steps=1, validation_freq=1)
            yield workload.checkpoint_workload(), [
                checkpoint_dir
            ], workload.ignore_workload_response
            yield workload.terminate_workload(), [], workload.ignore_workload_response

        hparams2 = {
            "hidden_size": 3,
            "learning_rate": 0.5,
            "global_batch_size": 4
        }

        with pytest.raises(RuntimeError):
            controller2 = utils.make_trial_controller_from_trial_implementation(
                trial_class=pytorch_xor_model.XORTrialMulti,
                hparams=hparams2,
                workloads=make_workloads_2(),
                load_path=checkpoint_dir,
                trial_seed=self.trial_seed,
            )
            controller2.run()

    # Example 26

    def controller_fn(workloads: workload.Stream) -> determined.TrialController:
        return utils.make_trial_controller_from_trial_implementation(
            trial_class=deepspeed_linear_model.LinearPipelineEngineTrial,
            hparams=self.hparams,
            workloads=workloads,
            trial_seed=self.trial_seed,
            expose_gpus=True,
        )

    def test_restore_invalid_checkpoint(self, tmp_path: pathlib.Path) -> None:
        # Build, train, and save a checkpoint with the normal hyperparameters.
        checkpoint_dir = str(tmp_path.joinpath("checkpoint"))
        latest_checkpoint = None
        steps_completed = 0

        def make_workloads_1() -> workload.Stream:
            trainer = utils.TrainAndValidate()
            yield from trainer.send(steps=1, validation_freq=1)
            interceptor = workload.WorkloadResponseInterceptor()
            yield from interceptor.send(workload.checkpoint_workload())
            nonlocal latest_checkpoint, steps_completed
            latest_checkpoint = interceptor.metrics_result()["uuid"]
            steps_completed = trainer.get_steps_completed()

        controller1 = utils.make_trial_controller_from_trial_implementation(
            trial_class=pytorch_xor_model.XORTrialMulti,
            hparams=self.hparams,
            workloads=make_workloads_1(),
            trial_seed=self.trial_seed,
            checkpoint_dir=checkpoint_dir,
        )
        controller1.run()

        # Verify that an invalid architecture fails to load from the checkpoint.
        def make_workloads_2() -> workload.Stream:
            trainer = utils.TrainAndValidate()
            yield from trainer.send(steps=1, validation_freq=1)

        hparams2 = {
            "hidden_size": 3,
            "learning_rate": 0.5,
            "global_batch_size": 4
        }

        with pytest.raises(RuntimeError):
            controller2 = utils.make_trial_controller_from_trial_implementation(
                trial_class=pytorch_xor_model.XORTrialMulti,
                hparams=hparams2,
                workloads=make_workloads_2(),
                trial_seed=self.trial_seed,
                checkpoint_dir=checkpoint_dir,
                latest_checkpoint=latest_checkpoint,
                steps_completed=steps_completed,
            )
            controller2.run()

    def test_custom_hook(self, tmp_path: Path) -> None:
        checkpoint_dir = str(tmp_path.joinpath("checkpoint"))
        latest_checkpoint = None
        steps_completed = 0

        def make_workloads() -> workload.Stream:
            trainer = utils.TrainAndValidate()

            yield from trainer.send(steps=10,
                                    validation_freq=5,
                                    scheduling_unit=5)

            interceptor = workload.WorkloadResponseInterceptor()
            yield from interceptor.send(workload.checkpoint_workload())
            nonlocal latest_checkpoint, steps_completed
            latest_checkpoint = interceptor.metrics_result()["uuid"]
            steps_completed = trainer.get_steps_completed()

        def verify_callback(checkpoint_dir: str, checkpoint_num: int) -> None:
            with open(os.path.join(checkpoint_dir, "custom.log"), "r") as fp:
                assert int(fp.readline()) == checkpoint_num

        controller = utils.make_trial_controller_from_trial_implementation(
            trial_class=estimator_xor_model.XORTrialWithCustomHook,
            hparams=self.hparams,
            workloads=make_workloads(),
            scheduling_unit=5,
            checkpoint_dir=checkpoint_dir,
        )
        controller.run()
        verify_callback(os.path.join(checkpoint_dir, latest_checkpoint),
                        checkpoint_num=1)

        controller = utils.make_trial_controller_from_trial_implementation(
            trial_class=estimator_xor_model.XORTrialWithCustomHook,
            hparams=self.hparams,
            workloads=make_workloads(),
            scheduling_unit=5,
            checkpoint_dir=checkpoint_dir,
            latest_checkpoint=latest_checkpoint,
            steps_completed=steps_completed,
        )
        controller.run()
        verify_callback(os.path.join(checkpoint_dir, latest_checkpoint),
                        checkpoint_num=2)

    def test_lr_schedule_and_lr_checkpoint(self, tmp_path: pathlib.Path) -> None:
        checkpoint_dir = tmp_path.joinpath("checkpoint")
        training_metrics = []

        def make_workloads(checkpoint_dir: str = "") -> workload.Stream:
            nonlocal training_metrics

            trainer = utils.TrainAndValidate()

            yield from trainer.send(steps=10,
                                    validation_freq=10,
                                    batches_per_step=1)
            tm, _ = trainer.result()
            training_metrics += tm

            if checkpoint_dir:
                yield workload.checkpoint_workload(), [
                    checkpoint_dir
                ], workload.ignore_workload_response

            yield workload.terminate_workload(), [], workload.ignore_workload_response

        controller = utils.make_trial_controller_from_trial_implementation(
            trial_class=pytorch_xor_model.XORTrialRestoreLR,
            hparams=self.hparams,
            workloads=make_workloads(checkpoint_dir),
            trial_seed=self.trial_seed,
        )
        controller.run()

        controller = utils.make_trial_controller_from_trial_implementation(
            trial_class=pytorch_xor_model.XORTrialRestoreLR,
            hparams=self.hparams,
            workloads=make_workloads(),
            load_path=checkpoint_dir,
            trial_seed=self.trial_seed,
        )
        controller.run()

        lrs = [metric["lr"] for metric in training_metrics]
        for i in range(1, len(lrs)):
            assert lrs[i] == lrs[i - 1] + 1
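
    # The +1-per-batch pattern asserted above suggests a scheduler along these
    # lines (hypothetical; the real one lives in XORTrialRestoreLR). Restoring
    # the scheduler's state from the checkpoint keeps last_epoch counting from
    # where the first controller stopped, so the sequence continues unbroken
    # across the restore. (Assumes `import torch`.)
    class _SketchIncrementLR(torch.optim.lr_scheduler._LRScheduler):
        def get_lr(self):
            # The lr grows by exactly 1 on every scheduler.step() call.
            return [base_lr + self.last_epoch for base_lr in self.base_lrs]
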
    def make_trial_controller_fn(
        workloads: workload.Stream, load_path: typing.Optional[str] = None
    ) -> det.TrialController:
        return utils.make_trial_controller_from_trial_implementation(
            trial_class=la_model.OneVarTrial,
            hparams=self.hparams,
            workloads=workloads,
            load_path=load_path,
            trial_seed=self.trial_seed,
        )