Exemple #1
0
        def make_workloads() -> workload.Stream:
            trainer = utils.TrainAndValidate()

            yield from trainer.send(steps=3,
                                    validation_freq=3,
                                    scheduling_unit=10)
            training_metrics = trainer.get_avg_training_metrics()
            _, validation_metrics = trainer.result()

            batch_size = self.hparams["global_batch_size"]

            for i, metrics in enumerate(training_metrics):
                expect = pytorch_onevar_model.TriangleLabelSum.expect(
                    batch_size, 10 * i, 10 * (i + 1))
                assert "cls_reducer" in metrics
                assert metrics["cls_reducer"] == expect
                assert "fn_reducer" in metrics
                assert metrics["fn_reducer"] == expect

            for metrics in validation_metrics:
                num_batches = len(
                    pytorch_onevar_model.OnesDataset()) // batch_size
                expect = pytorch_onevar_model.TriangleLabelSum.expect(
                    batch_size, 0, num_batches)
                assert "cls_reducer" in metrics
                assert metrics["cls_reducer"] == expect
                assert "fn_reducer" in metrics
                assert metrics["fn_reducer"] == expect

            yield workload.terminate_workload(
            ), [], workload.ignore_workload_response
Exemple #2
0
 def make_workloads() -> workload.Stream:
     trainer = utils.TrainAndValidate()
     yield from trainer.send(steps=2,
                             validation_freq=1,
                             scheduling_unit=1)
     yield workload.terminate_workload(
     ), [], workload.ignore_workload_response
Exemple #3
0
        def make_workloads() -> workload.Stream:
            trainer = utils.TrainAndValidate()

            yield from trainer.send(steps=10,
                                    validation_freq=1,
                                    batches_per_step=100)
            training_metrics, validation_metrics = trainer.result()

            # We expect the validation error and training loss to be
            # monotonically decreasing.

            # TODO(DET-1597): actually use a model and optimizer where the losses
            # are monotonically decreasing.
            for older, newer in zip(training_metrics[::100],
                                    training_metrics[::100][1:]):
                assert newer["loss"] <= older["loss"]

            for older, newer in zip(validation_metrics,
                                    validation_metrics[1:]):
                assert newer["val_categorical_error"] <= older[
                    "val_categorical_error"]

            assert validation_metrics[-1][
                "val_categorical_error"] == pytest.approx(0.0)

            yield workload.terminate_workload(
            ), [], workload.ignore_workload_response
Exemple #4
0
def make_test_workloads(
    checkpoint_dir: pathlib.Path, config: det.ExperimentConfig
) -> workload.Stream:
    print("Start training a test experiment.")
    interceptor = workload.WorkloadResponseInterceptor()

    print("Training 1 step.")
    yield from interceptor.send(workload.train_workload(1), [config.batches_per_step()])
    metrics = interceptor.metrics_result()
    batch_metrics = metrics["batch_metrics"]
    check.eq(len(batch_metrics), config.batches_per_step())
    print(f"Finished training. Metrics: {batch_metrics}")

    print("Validating.")
    yield from interceptor.send(workload.validation_workload(1), [])
    validation = interceptor.metrics_result()
    v_metrics = validation["validation_metrics"]
    print(f"Finished validating. Validation metrics: {v_metrics}")

    print(f"Saving a checkpoint to {checkpoint_dir}")
    yield workload.checkpoint_workload(), [checkpoint_dir], workload.ignore_workload_response
    print(f"Finished saving a checkpoint to {checkpoint_dir}.")

    yield workload.terminate_workload(), [], workload.ignore_workload_response
    print("The test experiment passed.")
Exemple #5
0
        def make_workloads() -> workload.Stream:
            nonlocal w
            interceptor = workload.WorkloadResponseInterceptor()

            for idx, batch in batches:
                yield from interceptor.send(workload.train_workload(1), [])
                metrics = interceptor.metrics_result()

                # Calculate what the loss should be.
                loss = trial_class.calc_loss(w, batch)

                assert metrics["metrics"]["avg_metrics"][
                    "loss"] == pytest.approx(loss)

                # Update what the weight should be.
                w = w - hparams["learning_rate"] * trial_class.calc_gradient(
                    w, batch)

                if test_checkpointing and idx == 3:
                    # Checkpoint and let the next TrialController finish the work.l
                    yield workload.checkpoint_workload(), [
                        checkpoint_dir
                    ], workload.ignore_workload_response
                    break

            yield workload.terminate_workload(
            ), [], workload.ignore_workload_response
Exemple #6
0
 def make_workloads() -> workload.Stream:
     trainer = utils.TrainAndValidate()
     yield from trainer.send(steps=1,
                             validation_freq=1,
                             batches_per_step=1)
     yield workload.terminate_workload(
     ), [], workload.ignore_workload_response
Exemple #7
0
        def make_workloads1() -> workload.Stream:
            nonlocal controller

            yield workload.train_workload(1, 1, 0), [], workload.ignore_workload_response
            assert controller is not None, "controller was never set!"
            assert controller.trial.counter.__dict__ == {
                "validation_steps_started": 0,
                "validation_steps_ended": 0,
                "checkpoints_ended": 0,
            }

            yield workload.validation_workload(), [], workload.ignore_workload_response
            assert controller.trial.counter.__dict__ == {
                "validation_steps_started": 1,
                "validation_steps_ended": 1,
                "checkpoints_ended": 0,
            }

            yield workload.checkpoint_workload(), [
                checkpoint_dir
            ], workload.ignore_workload_response
            assert controller.trial.counter.__dict__ == {
                "validation_steps_started": 1,
                "validation_steps_ended": 1,
                "checkpoints_ended": 1,
            }

            yield workload.terminate_workload(), [], workload.ignore_workload_response
Exemple #8
0
        def make_workloads() -> workload.Stream:
            training_metrics = []
            interceptor = workload.WorkloadResponseInterceptor()

            total_steps, total_batches_processed = 10, 0
            for step_id in range(1, total_steps):
                num_batches = step_id
                yield from interceptor.send(
                    workload.train_workload(
                        step_id,
                        num_batches=num_batches,
                        total_batches_processed=total_batches_processed,
                    ),
                    [],
                )
                metrics = interceptor.metrics_result()
                batch_metrics = metrics["metrics"]["batch_metrics"]
                assert len(
                    batch_metrics
                ) == num_batches, "did not run for expected num_batches"
                training_metrics.extend(batch_metrics)
                total_batches_processed += num_batches

            yield workload.terminate_workload(
            ), [], workload.ignore_workload_response
 def make_workloads_2() -> workload.Stream:
     trainer = utils.TrainAndValidate()
     yield from trainer.send(steps=1, validation_freq=1)
     yield workload.checkpoint_workload(), [
         checkpoint_dir
     ], workload.ignore_workload_response
     yield workload.terminate_workload(), [], workload.ignore_workload_response
        def make_workloads(checkpoint_dir: pathlib.Path) -> workload.Stream:
            trainer = utils.TrainAndValidate()

            yield from trainer.send(steps=10, validation_freq=5, batches_per_step=5)
            yield workload.checkpoint_workload(), [
                checkpoint_dir
            ], workload.ignore_workload_response
            yield workload.terminate_workload(), [], workload.ignore_workload_response
 def make_workloads() -> workload.Stream:
     trainer = utils.TrainAndValidate(request_stop_step_id=1)
     yield from trainer.send(steps=100,
                             validation_freq=2,
                             scheduling_unit=5)
     tm, vm = trainer.result()
     yield workload.terminate_workload(
     ), [], workload.ignore_workload_response
 def make_workloads() -> workload.Stream:
     trainer = utils.TrainAndValidate(
         request_stop_step_id=request_stop_step_id)
     yield from trainer.send(steps=2,
                             validation_freq=2,
                             batches_per_step=5)
     tm, vm = trainer.result()
     yield workload.terminate_workload(
     ), [], workload.ignore_workload_response
        def make_workloads(tag: str) -> workload.Stream:
            trainer = utils.TrainAndValidate()

            yield from trainer.send(steps=1000, validation_freq=100)
            tm, vm = trainer.result()
            training_metrics[tag] = tm
            validation_metrics[tag] = vm

            yield workload.terminate_workload(), [], workload.ignore_workload_response
        def make_workloads() -> workload.Stream:
            trainer = utils.TrainAndValidate()

            yield from trainer.send(steps=15,
                                    validation_freq=4,
                                    scheduling_unit=5)
            training_metrics, validation_metrics = trainer.result()

            yield workload.terminate_workload(
            ), [], workload.ignore_workload_response
        def make_workloads() -> workload.Stream:
            trainer = utils.TrainAndValidate()

            yield from trainer.send(steps=10, validation_freq=10)
            training_metrics, validation_metrics = trainer.result()

            for metrics in training_metrics:
                assert "accuracy" in metrics

            yield workload.terminate_workload(), [], workload.ignore_workload_response
Exemple #16
0
    def make_workloads(steps: int) -> workload.Stream:
        trainer = TrainAndValidate()

        yield from trainer.send(steps, validation_freq=1, scheduling_unit=10)
        tm, vm = trainer.result()
        metrics["training"] += tm
        metrics["validation"] += vm

        yield workload.terminate_workload(
        ), [], workload.ignore_workload_response
        def make_workloads_2() -> workload.Stream:
            interceptor = workload.WorkloadResponseInterceptor()

            yield from interceptor.send(workload.validation_workload(), [])
            metrics = interceptor.metrics_result()

            new_loss = metrics["metrics"]["validation_metrics"]["val_loss"]
            assert new_loss == pytest.approx(old_loss)

            yield workload.terminate_workload(), [], workload.ignore_workload_response
        def make_workloads() -> workload.Stream:
            trainer = utils.TrainAndValidate()

            yield from trainer.send(steps=1000, validation_freq=100)
            training_metrics, validation_metrics = trainer.result()

            # We expect the validation error and training loss to be
            # monotonically decreasing.
            for older, newer in zip(training_metrics, training_metrics[1:]):
                assert newer["loss"] <= older["loss"]

            yield workload.terminate_workload(), [], workload.ignore_workload_response
Exemple #19
0
    def make_workloads(tag: str) -> workload.Stream:
        nonlocal training_metrics
        nonlocal validation_metrics

        trainer = TrainAndValidate()

        yield from trainer.send(steps, validation_freq, batches_per_step=batches_per_step)
        tm, vm = trainer.result()

        training_metrics[tag] = tm
        validation_metrics[tag] = vm

        yield workload.terminate_workload(), [], workload.ignore_workload_response
        def make_workloads_1() -> workload.Stream:
            nonlocal old_loss

            trainer = utils.TrainAndValidate()

            yield from trainer.send(steps=10, validation_freq=10)
            training_metrics, validation_metrics = trainer.result()
            old_loss = validation_metrics[-1]["val_loss"]

            yield workload.checkpoint_workload(), [
                checkpoint_dir
            ], workload.ignore_workload_response

            yield workload.terminate_workload(), [], workload.ignore_workload_response
        def make_workloads(checkpoint_dir: str = "") -> workload.Stream:
            nonlocal training_metrics

            trainer = utils.TrainAndValidate()

            yield from trainer.send(steps=10, validation_freq=10, batches_per_step=1)
            tm, _ = trainer.result()
            training_metrics += tm

            if checkpoint_dir:
                yield workload.checkpoint_workload(), [
                    checkpoint_dir
                ], workload.ignore_workload_response

            yield workload.terminate_workload(), [], workload.ignore_workload_response
Exemple #22
0
        def make_workloads() -> workload.Stream:
            trainer = utils.TrainAndValidate()

            yield from trainer.send(steps=100, validation_freq=10)
            training_metrics, validation_metrics = trainer.result()

            # Check the gradient update at every step.
            for idx, batch_metrics in enumerate(training_metrics):
                pytorch_onevar_model.OneVarTrial.check_batch_metrics(batch_metrics, idx)

            # We expect the validation error and training loss to be
            # monotonically decreasing.
            for older, newer in zip(training_metrics, training_metrics[1:]):
                assert newer["loss"] <= older["loss"]

            yield workload.terminate_workload(), [], workload.ignore_workload_response
Exemple #23
0
    def make_workloads(
        steps: int, tag: str, checkpoint_dir: Optional[pathlib.Path] = None
    ) -> workload.Stream:
        trainer = TrainAndValidate()

        yield from trainer.send(steps, validation_freq=1, batches_per_step=100)
        tm, vm = trainer.result()
        training_metrics[tag] += tm
        validation_metrics[tag] += vm

        if checkpoint_dir is not None:
            yield workload.checkpoint_workload(), [
                checkpoint_dir
            ], workload.ignore_workload_response

        yield workload.terminate_workload(), [], workload.ignore_workload_response
Exemple #24
0
        def make_workloads() -> workload.Stream:
            trainer = utils.TrainAndValidate()

            yield from trainer.send(steps=10, validation_freq=5, batches_per_step=1000)
            training_metrics, validation_metrics = trainer.result()

            # We expect the training loss to be monotonically decreasing and the
            # accuracy to be monotonically increasing.
            for older, newer in zip(training_metrics, training_metrics[1:]):
                assert newer["loss"] < older["loss"]

            for older, newer in zip(validation_metrics, validation_metrics[1:]):
                assert newer["accuracy"] >= older["accuracy"]

            # The final accuracy should be 100%.
            assert validation_metrics[-1]["accuracy"] == pytest.approx(1.0)

            yield workload.terminate_workload(), [], workload.ignore_workload_response
        def make_workloads() -> workload.Stream:
            trainer = utils.TrainAndValidate()

            # Test >1 validation to ensure that resetting the allgather_op list is working.
            yield from trainer.send(steps=2,
                                    validation_freq=1,
                                    batches_per_step=1)
            training_metrics, validation_metrics = trainer.result()

            for metrics in validation_metrics:
                assert metrics[
                    "label_sum_fn"] == estimator_linear_model.validation_label_sum(
                    )
                assert metrics[
                    "label_sum_cls"] == estimator_linear_model.validation_label_sum(
                    )

            yield workload.terminate_workload(
            ), [], workload.ignore_workload_response
Exemple #26
0
def _make_test_workloads(checkpoint_dir: pathlib.Path,
                         config: det.ExperimentConfig) -> workload.Stream:
    interceptor = workload.WorkloadResponseInterceptor()

    logging.info("Training one batch")
    yield from interceptor.send(workload.train_workload(1), [])
    metrics = interceptor.metrics_result()
    batch_metrics = metrics["metrics"]["batch_metrics"]
    check.eq(len(batch_metrics), config.scheduling_unit())
    logging.info(f"Finished training, metrics: {batch_metrics}")

    logging.info("Validating one batch")
    yield from interceptor.send(workload.validation_workload(1), [])
    validation = interceptor.metrics_result()
    v_metrics = validation["metrics"]["validation_metrics"]
    logging.info(f"Finished validating, validation metrics: {v_metrics}")

    logging.info(f"Saving a checkpoint to {checkpoint_dir}.")
    yield workload.checkpoint_workload(), [checkpoint_dir
                                           ], workload.ignore_workload_response
    logging.info(f"Finished saving a checkpoint to {checkpoint_dir}.")

    yield workload.terminate_workload(), [], workload.ignore_workload_response
    logging.info("The test experiment passed.")
Exemple #27
0
 def make_workloads2() -> workload.Stream:
     yield workload.terminate_workload(
     ), [], workload.ignore_workload_response