コード例 #1
0
        def make_workloads() -> workload.Stream:
            """Yield a sequence of growing train workloads, then terminate.

            Step k trains for k batches; each step's batch metrics are
            accumulated into ``training_metrics``.
            """
            training_metrics = []
            interceptor = workload.WorkloadResponseInterceptor()

            total_steps = 10
            batches_seen = 0
            # NOTE: range(1, total_steps) runs steps 1..9, not 10.
            for step_id in range(1, total_steps):
                batches_this_step = step_id
                wkld = workload.train_workload(
                    step_id,
                    num_batches=batches_this_step,
                    total_batches_processed=batches_seen,
                )
                yield from interceptor.send(wkld, [])
                step_result = interceptor.metrics_result()
                per_batch = step_result["metrics"]["batch_metrics"]
                assert (
                    len(per_batch) == batches_this_step
                ), "did not run for expected num_batches"
                training_metrics.extend(per_batch)
                batches_seen += batches_this_step

            yield workload.terminate_workload(), [], workload.ignore_workload_response
コード例 #2
0
        def make_workloads1() -> workload.Stream:
            nonlocal controller

            yield workload.train_workload(
                1, 1, 0), [], workload.ignore_workload_response
            assert controller is not None, "controller was never set!"
            assert controller.trial.counter.__dict__ == {
                "validation_steps_started": 0,
                "validation_steps_ended": 0,
                "checkpoints_ended": 0,
            }

            yield workload.validation_workload(
            ), [], workload.ignore_workload_response
            assert controller.trial.counter.__dict__ == {
                "validation_steps_started": 1,
                "validation_steps_ended": 1,
                "checkpoints_ended": 0,
            }

            yield workload.checkpoint_workload(), [
                checkpoint_dir
            ], workload.ignore_workload_response
            assert controller.trial.counter.__dict__ == {
                "validation_steps_started": 1,
                "validation_steps_ended": 1,
                "checkpoints_ended": 1,
            }

            yield workload.terminate_workload(
            ), [], workload.ignore_workload_response
コード例 #3
0
def fake_workload_gen() -> Generator[workload.Workload, None, None]:
    """Yield NUM_FAKE_WORKLOADS - 1 fake train workloads followed by one
    validation workload.

    The original implementation read the ``for`` loop variable after the
    loop ended, which raises NameError when NUM_FAKE_WORKLOADS < 2; track
    the last index explicitly instead.
    """
    last_index = 0
    for i in range(NUM_FAKE_WORKLOADS - 1):
        yield workload.train_workload(i + 1, num_batches=1, total_batches_processed=i)
        last_index = i
    # Matches the original behavior for NUM_FAKE_WORKLOADS >= 2: the
    # validation workload reuses the final train index.
    yield workload.validation_workload(last_index)
コード例 #4
0
def make_test_workloads(
    checkpoint_dir: pathlib.Path, config: det.ExperimentConfig
) -> workload.Stream:
    """Yield a short train/validate/checkpoint/terminate sequence for a
    smoke-test experiment, printing progress along the way."""
    print("Start training a test experiment.")
    responses = workload.WorkloadResponseInterceptor()

    print("Training 1 step.")
    yield from responses.send(workload.train_workload(1), [config.batches_per_step()])
    train_result = responses.metrics_result()
    batch_metrics = train_result["batch_metrics"]
    # One metrics dict per batch is expected.
    check.eq(len(batch_metrics), config.batches_per_step())
    print(f"Finished training. Metrics: {batch_metrics}")

    print("Validating.")
    yield from responses.send(workload.validation_workload(1), [])
    val_result = responses.metrics_result()
    v_metrics = val_result["validation_metrics"]
    print(f"Finished validating. Validation metrics: {v_metrics}")

    print(f"Saving a checkpoint to {checkpoint_dir}")
    yield workload.checkpoint_workload(), [checkpoint_dir], workload.ignore_workload_response
    print(f"Finished saving a checkpoint to {checkpoint_dir}.")

    yield workload.terminate_workload(), [], workload.ignore_workload_response
    print("The test experiment passed.")
コード例 #5
0
        def make_workloads() -> workload.Stream:
            nonlocal w
            interceptor = workload.WorkloadResponseInterceptor()

            for idx, batch in batches:
                yield from interceptor.send(workload.train_workload(1), [1])
                metrics = interceptor.metrics_result()

                # Calculate what the loss should be.
                loss = trial_class.calc_loss(w, batch)

                assert metrics["avg_metrics"]["loss"] == pytest.approx(loss)

                # Update what the weight should be.
                w = w - hparams["learning_rate"] * trial_class.calc_gradient(
                    w, batch)

                if test_checkpointing and idx == 3:
                    # Checkpoint and let the next TrialController finish the work.l
                    yield workload.checkpoint_workload(), [
                        checkpoint_dir
                    ], workload.ignore_workload_response
                    break

            yield workload.terminate_workload(
            ), [], workload.ignore_workload_response
コード例 #6
0
        def make_workloads() -> workload.Stream:
            nonlocal w
            interceptor = workload.WorkloadResponseInterceptor()

            for idx, batch in batches:
                yield from interceptor.send(workload.train_workload(1))
                metrics = interceptor.metrics_result()

                # Calculate what the loss should be.
                loss = trial_class.calc_loss(w, batch)

                epsilon = 0.0001
                assert abs(metrics["metrics"]["avg_metrics"]["loss"] -
                           loss) < epsilon

                # Update what the weight should be.
                w = w - hparams["learning_rate"] * trial_class.calc_gradient(
                    w, batch)

                if test_checkpointing and idx == 3:
                    # Checkpoint and let the next TrialController finish the work.
                    interceptor = workload.WorkloadResponseInterceptor()
                    yield from interceptor.send(workload.checkpoint_workload())
                    nonlocal latest_checkpoint, steps_completed
                    latest_checkpoint = interceptor.metrics_result()["uuid"]
                    # steps_completed is unused, but can't be 0.
                    steps_completed = 1
                    break
コード例 #7
0
    def send(
        self,
        steps: int,
        validation_freq: int,
        initial_step_id: int = 1,
        batches_per_step: int = 1,
    ) -> workload.Stream:
        """Yield ``steps`` train workloads starting at ``initial_step_id``,
        validating every ``validation_freq`` steps, and accumulate the
        resulting metrics on ``self``."""
        self._training_metrics = []
        self._validation_metrics = []
        interceptor = workload.WorkloadResponseInterceptor()

        final_step = initial_step_id + steps
        for step_id in range(initial_step_id, final_step):
            yield from interceptor.send(workload.train_workload(step_id),
                                        [batches_per_step])
            train_result = interceptor.metrics_result()
            per_batch = train_result["batch_metrics"]
            # One metrics dict per batch is expected.
            assert len(per_batch) == batches_per_step
            self._training_metrics.extend(per_batch)

            if step_id % validation_freq == 0:
                yield from interceptor.send(
                    workload.validation_workload(step_id), [])
                val_result = interceptor.metrics_result()
                self._validation_metrics.append(val_result["validation_metrics"])
コード例 #8
0
def _make_local_execution_env(
    managed_training: bool,
    test_mode: bool,
    config: Optional[Dict[str, Any]],
    hparams: Optional[Dict[str, Any]] = None,
    limit_gpus: Optional[int] = None,
) -> Tuple[det.EnvContext, det.RendezvousInfo, horovod.HorovodContext]:
    """Build the EnvContext, RendezvousInfo, and HorovodContext needed to
    run an experiment locally (outside a cluster)."""
    exp_config = det.ExperimentConfig(
        _make_local_execution_exp_config(
            config, managed_training=managed_training, test_mode=test_mode
        )
    )
    # Fall back to randomly sampled hyperparameters when none are supplied.
    hparams = hparams or api.generate_random_hparam_values(
        exp_config.get("hyperparameters", {})
    )
    use_gpu, container_gpus, slot_ids = _get_gpus(limit_gpus)
    # Two consecutive local ports, comma-separated, as EnvContext expects.
    ports = f"{constants.LOCAL_RENDEZVOUS_PORT},{constants.LOCAL_RENDEZVOUS_PORT + 1}"

    env = det.EnvContext(
        master_addr="",
        master_port=0,
        use_tls=False,
        master_cert_file=None,
        master_cert_name=None,
        container_id="",
        experiment_config=exp_config,
        hparams=hparams,
        initial_workload=workload.train_workload(1, 1, 1, exp_config.scheduling_unit()),
        latest_checkpoint=None,
        use_gpu=use_gpu,
        container_gpus=container_gpus,
        slot_ids=slot_ids,
        debug=exp_config.debug_enabled(),
        workload_manager_type="",
        det_rendezvous_ports=ports,
        det_trial_unique_port_offset=0,
        det_trial_runner_network_interface=constants.AUTO_DETECT_TRIAL_RUNNER_NETWORK_INTERFACE,
        det_trial_id="",
        det_experiment_id="",
        det_cluster_id="",
        trial_seed=exp_config.experiment_seed(),
        managed_training=managed_training,
        test_mode=test_mode,
        on_cluster=False,
    )
    rendezvous_ports = env.rendezvous_ports()
    rendezvous_info = det.RendezvousInfo(
        addrs=[f"0.0.0.0:{rendezvous_ports[0]}"],
        addrs2=[f"0.0.0.0:{rendezvous_ports[1]}"],
        rank=0,
    )
    hvd_config = horovod.HorovodContext.from_configs(
        env.experiment_config, rendezvous_info, env.hparams
    )

    return env, rendezvous_info, hvd_config
コード例 #9
0
        def make_workloads1() -> workload.Stream:
            nonlocal controller
            assert controller.trial.counter.trial_startups == 1

            yield workload.train_workload(1, 1, 0,
                                          4), workload.ignore_workload_response
            assert controller is not None, "controller was never set!"
            assert controller.trial.counter.__dict__ == {
                "trial_startups": 1,
                "validation_steps_started": 0,
                "validation_steps_ended": 0,
                "checkpoints_ended": 0,
                "training_started_times": 1,
                "training_epochs_started": 2,
                "training_epochs_ended": 2,
                "trial_shutdowns": 0,
            }
            assert controller.trial.legacy_counter.__dict__ == {
                "legacy_on_training_epochs_start_calls": 2
            }

            yield workload.validation_workload(
            ), workload.ignore_workload_response
            assert controller.trial.counter.__dict__ == {
                "trial_startups": 1,
                "validation_steps_started": 1,
                "validation_steps_ended": 1,
                "checkpoints_ended": 0,
                "training_started_times": 1,
                "training_epochs_started": 2,
                "training_epochs_ended": 2,
                "trial_shutdowns": 0,
            }
            assert controller.trial.legacy_counter.__dict__ == {
                "legacy_on_training_epochs_start_calls": 2
            }

            interceptor = workload.WorkloadResponseInterceptor()
            yield from interceptor.send(workload.checkpoint_workload())
            nonlocal latest_checkpoint, steps_completed
            latest_checkpoint = interceptor.metrics_result()["uuid"]
            steps_completed = 1
            assert controller.trial.counter.__dict__ == {
                "trial_startups": 1,
                "validation_steps_started": 1,
                "validation_steps_ended": 1,
                "checkpoints_ended": 1,
                "training_started_times": 1,
                "training_epochs_started": 2,
                "training_epochs_ended": 2,
                "trial_shutdowns": 0,
            }
            assert controller.trial.legacy_counter.__dict__ == {
                "legacy_on_training_epochs_start_calls": 2
            }
コード例 #10
0
    def send(
        self,
        steps: int,
        validation_freq: int,
        initial_step_id: int = 1,
        scheduling_unit: int = 1,
    ) -> workload.Stream:
        """Yield train workloads (with periodic validation workloads),
        collecting metrics on ``self`` and honoring stop requests.

        Stops the stream early if either the train or validation response
        sets ``stop_requested``; asserts that this only ever happens at
        ``self.request_stop_step_id``.
        """
        self._training_metrics = []
        self._avg_training_metrics = []
        self._validation_metrics = []
        batches_done = 0
        interceptor = workload.WorkloadResponseInterceptor()

        for step_id in range(initial_step_id, initial_step_id + steps):
            stop_requested = False
            train_wkld = workload.train_workload(
                step_id,
                num_batches=scheduling_unit,
                total_batches_processed=batches_done,
            )
            yield from interceptor.send(train_wkld, [])
            train_result = interceptor.metrics_result()
            per_batch = train_result["metrics"]["batch_metrics"]
            # One metrics dict per batch is expected.
            assert len(per_batch) == scheduling_unit
            self._training_metrics.extend(per_batch)
            self._avg_training_metrics.append(train_result["metrics"]["avg_metrics"])
            batches_done += scheduling_unit
            if train_result["stop_requested"]:
                assert step_id == self.request_stop_step_id
                stop_requested = True

            if step_id % validation_freq == 0:
                val_wkld = workload.validation_workload(
                    step_id, total_batches_processed=batches_done
                )
                yield from interceptor.send(val_wkld, [])
                val_result = interceptor.metrics_result()
                self._validation_metrics.append(
                    val_result["metrics"]["validation_metrics"]
                )
                if val_result["stop_requested"]:
                    assert step_id == self.request_stop_step_id
                    stop_requested = True

            if stop_requested:
                break
            # Steps other than request_stop_step_id must not request a stop.
            assert step_id != self.request_stop_step_id
コード例 #11
0
ファイル: _native.py プロジェクト: AustinInBoston/determined
def _make_local_test_experiment_env(
    checkpoint_dir: pathlib.Path,
    config: Optional[Dict[str, Any]],
    hparams: Optional[Dict[str, Any]] = None,
) -> Tuple[det.EnvContext, workload.Stream, det.RendezvousInfo,
           horovod.HorovodContext]:
    """Assemble the env context, workload stream, rendezvous info, and
    horovod context needed to run a local test-mode experiment."""
    exp_config = det.ExperimentConfig(_make_local_test_experiment_config(config))
    # Generate deterministic test hyperparameters when none are supplied.
    hparams = hparams or _generate_test_hparam_values(exp_config)
    use_gpu, container_gpus, slot_ids = _get_gpus()
    # Two consecutive local ports, comma-separated, as EnvContext expects.
    ports = f"{constants.LOCAL_RENDEZVOUS_PORT},{constants.LOCAL_RENDEZVOUS_PORT + 1}"

    env = det.EnvContext(
        master_addr="",
        master_port=1,
        container_id="test_mode",
        experiment_config=exp_config,
        hparams=hparams,
        initial_workload=workload.train_workload(1, 1, 1, exp_config.batches_per_step()),
        latest_checkpoint=None,
        use_gpu=use_gpu,
        container_gpus=container_gpus,
        slot_ids=slot_ids,
        debug=exp_config.debug_enabled(),
        workload_manager_type="",
        det_rendezvous_ports=ports,
        det_trial_runner_network_interface=constants.AUTO_DETECT_TRIAL_RUNNER_NETWORK_INTERFACE,
        det_trial_id="1",
        det_experiment_id="1",
        det_cluster_id="test_mode",
        trial_seed=exp_config.experiment_seed(),
    )
    workloads = _make_test_workloads(checkpoint_dir.joinpath("checkpoint"), exp_config)
    rendezvous_ports = env.rendezvous_ports()
    rendezvous_info = det.RendezvousInfo(
        addrs=[f"0.0.0.0:{rendezvous_ports[0]}"],
        addrs2=[f"0.0.0.0:{rendezvous_ports[1]}"],
        rank=0,
    )
    hvd_config = horovod.HorovodContext.from_configs(
        env.experiment_config, rendezvous_info, env.hparams
    )

    return env, workloads, rendezvous_info, hvd_config
コード例 #12
0
def _make_test_workloads(config: det.ExperimentConfig) -> workload.Stream:
    """Yield a minimal train → validate → checkpoint workload sequence for a
    test-mode experiment.

    Checks that training ran for exactly ``config.scheduling_unit()``
    batches. Uses lazy %-style logging arguments instead of eager f-strings
    so messages are only formatted when the log level is enabled.
    """
    interceptor = workload.WorkloadResponseInterceptor()

    logging.info("Training one batch")
    yield from interceptor.send(workload.train_workload(1))
    metrics = interceptor.metrics_result()
    batch_metrics = metrics["metrics"]["batch_metrics"]
    check.eq(len(batch_metrics), config.scheduling_unit())
    logging.info("Finished training, metrics: %s", batch_metrics)

    logging.info("Validating one batch")
    yield from interceptor.send(workload.validation_workload(1))
    validation = interceptor.metrics_result()
    v_metrics = validation["metrics"]["validation_metrics"]
    logging.info("Finished validating, validation metrics: %s", v_metrics)

    logging.info("Saving a checkpoint.")
    yield workload.checkpoint_workload(), workload.ignore_workload_response
    logging.info("Finished saving a checkpoint.")
コード例 #13
0
ファイル: utils.py プロジェクト: brainhart/determined
    def send(self,
             steps: int,
             validation_freq: int,
             initial_step_id: int = 1,
             batches_per_step: int = 1) -> workload.Stream:
        """Yield ``steps`` train workloads (validating every
        ``validation_freq`` steps), collecting metrics on ``self``.

        Honors stop requests: if a train or validation response sets
        ``stop_requested``, asserts it happened at
        ``self.request_stop_step_id`` and ends the stream early.

        Fix: removed a leftover debug ``print(validation)`` that dumped the
        raw validation response to stdout.
        """
        self._training_metrics = []
        self._validation_metrics = []
        interceptor = workload.WorkloadResponseInterceptor()

        for step_id in range(initial_step_id, initial_step_id + steps):
            stop_requested = False
            yield from interceptor.send(workload.train_workload(step_id),
                                        [batches_per_step])
            metrics = interceptor.metrics_result()
            batch_metrics = metrics["metrics"]["batch_metrics"]
            # One metrics dict per batch is expected.
            assert len(batch_metrics) == batches_per_step
            self._training_metrics.extend(batch_metrics)
            if metrics["stop_requested"]:
                assert step_id == self.request_stop_step_id
                stop_requested = True

            if step_id % validation_freq == 0:
                yield from interceptor.send(
                    workload.validation_workload(step_id), [])
                validation = interceptor.metrics_result()
                v_metrics = validation["metrics"]["validation_metrics"]
                self._validation_metrics.append(v_metrics)
                if validation["stop_requested"]:
                    assert step_id == self.request_stop_step_id
                    stop_requested = True

            if stop_requested:
                break
            else:
                # Steps other than request_stop_step_id must not stop.
                assert step_id != self.request_stop_step_id
コード例 #14
0
def _make_test_workloads(checkpoint_dir: pathlib.Path,
                         config: det.ExperimentConfig) -> workload.Stream:
    """Yield a train → validate → checkpoint → terminate sequence for a
    test-mode experiment, logging progress.

    Checks that training ran for exactly ``config.scheduling_unit()``
    batches. Uses lazy %-style logging arguments instead of eager f-strings
    so messages are only formatted when the log level is enabled.
    """
    interceptor = workload.WorkloadResponseInterceptor()

    logging.info("Training one batch")
    yield from interceptor.send(workload.train_workload(1), [])
    metrics = interceptor.metrics_result()
    batch_metrics = metrics["metrics"]["batch_metrics"]
    check.eq(len(batch_metrics), config.scheduling_unit())
    logging.debug("Finished training, metrics: %s", batch_metrics)

    logging.info("Validating one step")
    yield from interceptor.send(workload.validation_workload(1), [])
    validation = interceptor.metrics_result()
    v_metrics = validation["metrics"]["validation_metrics"]
    logging.debug("Finished validating, validation metrics: %s", v_metrics)

    logging.info("Saving a checkpoint to %s.", checkpoint_dir)
    yield workload.checkpoint_workload(), [checkpoint_dir], workload.ignore_workload_response
    logging.info("Finished saving a checkpoint to %s.", checkpoint_dir)

    yield workload.terminate_workload(), [], workload.ignore_workload_response
    logging.info("The test experiment passed.")
コード例 #15
0
 def make_workloads() -> workload.Stream:
     """Yield one 100-batch train workload, then a checkpoint workload whose
     response is handled by ``checkpoint_response_func``."""
     train = workload.train_workload(1, num_batches=100)
     yield train, [], workload.ignore_workload_response
     yield workload.checkpoint_workload(), [], checkpoint_response_func
コード例 #16
0
 def make_workloads() -> workload.Stream:
     """Yield one 100-batch train workload followed by a validation
     workload; both responses are ignored."""
     ignore = workload.ignore_workload_response
     yield workload.train_workload(1, num_batches=100), [], ignore
     yield workload.validation_workload(), [], ignore
コード例 #17
0
def fake_workload_gen() -> Generator[workload.Workload, None, None]:
    """Yield NUM_FAKE_WORKLOADS - 1 fake train workloads followed by one
    validation workload.

    The original implementation read the ``for`` loop variable after the
    loop ended, which raises NameError when NUM_FAKE_WORKLOADS < 2; track
    the last index explicitly instead.
    """
    last_index = 0
    for i in range(NUM_FAKE_WORKLOADS - 1):
        yield workload.train_workload(i + 1)
        last_index = i
    # Same value the original passed for NUM_FAKE_WORKLOADS >= 2.
    yield workload.validation_workload(last_index)
コード例 #18
0
 def make_workloads2() -> workload.Stream:
     """Yield a single train workload whose response is ignored."""
     yield (workload.train_workload(1, 1, 0),
            workload.ignore_workload_response)