    def checkpoint(self, already_exiting: bool) -> WorkloadGenerator:
        self.core_context.train.set_status("checkpointing")

        # Update the last_ckpt now so it can be captured by get_state() after we yield.
        self.state.last_ckpt = self.state.steps_completed

        wkld = workload.Workload(
            kind=workload.Workload.Kind.CHECKPOINT_MODEL,
            e_id=self._exp_id,
            t_id=self._trial_id,
            s_id=self.state.step_id,
            num_batches=0,
            total_batches_processed=self.state.steps_completed,
        )
        response = yield from yield_and_await_response(wkld)

        if isinstance(response, workload.InvalidHP):
            self.core_context.train.report_early_exit(
                core.EarlyExitReason.INVALID_HP)
            if not already_exiting:
                raise ShouldExit(skip_exit_checkpoint=True)
            return

        if already_exiting:
            return

        if response.get("stop_requested"):
            raise ShouldExit()

        self.check_for_preemption()
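
# The methods in this example are generators: they yield a Workload and expect the
# driver to send back either a metrics dict or workload.InvalidHP. A minimal sketch
# of how yield_and_await_response could be written under that assumption (the real
# helper may differ):
def yield_and_await_response(wkld: workload.Workload) -> WorkloadGenerator:
    # Yield the workload together with a one-shot response callback; the driver is
    # expected to call it exactly once before resuming the generator.
    responses = []
    yield wkld, responses.append
    assert len(responses) == 1, "driver must respond exactly once per workload"
    return responses[0]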
Example 2
def make_default_env_context(
    hparams: Dict[str, Any], experiment_config: Optional[Dict] = None, trial_seed: int = 0
) -> det.EnvContext:
    if experiment_config is None:
        experiment_config = make_default_exp_config(hparams, 1)

    # TODO(ryan): Fix the parameter passing so that this doesn't read from environment variables,
    # and we can get rid of the @expose_gpus fixture.
    use_gpu = distutils.util.strtobool(os.environ.get("DET_USE_GPU", "false"))
    gpu_uuids = gpu.get_gpu_uuids_and_validate(use_gpu)

    return det.EnvContext(
        experiment_config=experiment_config,
        initial_workload=workload.Workload(
            workload.Workload.Kind.RUN_STEP, ExperimentID(1), TrialID(1), StepID(1)
        ),
        master_addr="",
        master_port=0,
        container_id="",
        hparams=hparams,
        latest_checkpoint=None,
        use_gpu=use_gpu,
        container_gpus=gpu_uuids,
        slot_ids=[],
        debug=False,
        workload_manager_type="",
        det_rendezvous_ports="",
        det_trial_runner_network_interface=constants.AUTO_DETECT_TRIAL_RUNNER_NETWORK_INTERFACE,
        det_trial_id="1",
        det_experiment_id="1",
        det_cluster_id="uuid-123",
        trial_seed=trial_seed,
    )
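
# Hypothetical usage in a unit test, assuming EnvContext exposes its constructor
# arguments as attributes:
env = make_default_env_context({"global_batch_size": 32})
assert env.hparams["global_batch_size"] == 32
assert env.trial_seed == 0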
Example 3
def create_default_env_context(
        experiment_config: Dict[str, Any]) -> det.EnvContext:
    det_trial_runner_network_interface = constants.AUTO_DETECT_TRIAL_RUNNER_NETWORK_INTERFACE
    return det.EnvContext(
        experiment_config=experiment_config,
        initial_workload=workload.Workload(
            workload.Workload.Kind.RUN_STEP,
            ExperimentID(1),
            TrialID(1),
            StepID(1),
            det.ExperimentConfig(experiment_config).scheduling_unit(),
            0,
        ),
        master_addr="",
        master_port=0,
        use_tls=False,
        master_cert_file=None,
        master_cert_name=None,
        container_id="",
        hparams={"global_batch_size": 32},
        latest_checkpoint=None,
        use_gpu=False,
        container_gpus=[],
        slot_ids=[],
        debug=False,
        workload_manager_type="",
        det_rendezvous_ports="",
        det_trial_unique_port_offset=0,
        det_trial_runner_network_interface=det_trial_runner_network_interface,
        det_trial_id="1",
        det_experiment_id="1",
        det_cluster_id="uuid-123",
        trial_seed=0,
    )
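
# Unlike the previous helper, this one also passes the config's scheduling unit and
# an initial batch count into the Workload. Hypothetical usage, assuming Workload
# exposes num_batches and that scheduling_unit is read from the top level of the
# experiment config:
env = create_default_env_context({"scheduling_unit": 5})
assert env.initial_workload.num_batches == 5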
Example 4
def get_dummy_env() -> det.EnvContext:
    return det.EnvContext(
        master_addr="",
        master_port=0,
        container_id="",
        experiment_config={
            "resources": {
                "slots_per_trial": 1,
                "native_parallel": False
            }
        },
        initial_workload=workload.Workload(
            workload.Workload.Kind.RUN_STEP,
            determined_common.types.ExperimentID(1),
            determined_common.types.TrialID(1),
            determined_common.types.StepID(1),
        ),
        latest_checkpoint=None,
        use_gpu=False,
        container_gpus=[],
        slot_ids=[],
        debug=False,
        workload_manager_type="",
        hparams={"global_batch_size": 1},
        det_rendezvous_ports="",
        det_trial_runner_network_interface=constants.AUTO_DETECT_TRIAL_RUNNER_NETWORK_INTERFACE,
        det_trial_id="1",
        det_experiment_id="1",
        det_cluster_id="uuid-123",
        trial_seed=0,
    )
    def train(self, num_batches: int,
              op: core.SearcherOperation) -> WorkloadGenerator:
        # Report that a train step is starting.
        self.core_context.train.set_status("training")

        wkld = workload.Workload(
            kind=workload.Workload.Kind.RUN_STEP,
            e_id=self._exp_id,
            t_id=self._trial_id,
            s_id=self.state.step_id + 1,
            num_batches=num_batches,
            total_batches_processed=self.state.steps_completed,
        )

        response = yield from yield_and_await_response(wkld)

        # Train step is complete, process the result.

        if isinstance(response, workload.InvalidHP):
            # Exit before reporting metrics (which would be empty anyway).
            self.core_context.train.report_early_exit(
                core.EarlyExitReason.INVALID_HP)
            raise ShouldExit()

        metrics = response.get("metrics", {}).get("avg_metrics", {})
        batch_metrics = response.get("metrics", {}).get("batch_metrics", [])

        self.state.steps_completed += num_batches
        self.state.step_id += 1
        self.core_context.train.report_training_metrics(
            steps_completed=self.state.steps_completed,
            metrics=metrics,
            batch_metrics=batch_metrics,
        )

        # Report progress to the searcher.  For historical reasons we only deal in batches.
        if self._unit == core.Unit.BATCHES:
            op.report_progress(self.state.steps_completed)
        elif self._unit == core.Unit.RECORDS:
            op.report_progress(self.global_batch_size *
                               self.state.steps_completed)
        elif self._unit == core.Unit.EPOCHS:
            op.report_progress(self.state.steps_completed /
                               self.as_batches(epochs=1))
        else:
            raise ValueError(f"unrecognized searcher op unit: {self._unit}")

        if response.get("stop_requested"):
            # Exit after reporting metrics.
            raise ShouldExit()

        self.check_for_preemption()
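
    # The epoch branch above relies on as_batches(); a minimal sketch, assuming
    # records_per_epoch comes from the experiment config and global_batch_size from
    # the hyperparameters (the real helper may differ):
    def as_batches(self, *, epochs: int) -> int:
        records_per_epoch = self.env.experiment_config["records_per_epoch"]
        return (epochs * records_per_epoch) // self.global_batch_size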
Example 6
def get_dummy_env() -> det.EnvContext:
    return det.EnvContext(
        master_addr="",
        master_port=0,
        use_tls=False,
        master_cert_file=None,
        master_cert_name=None,
        container_id="",
        experiment_config={
            "resources": {
                "slots_per_trial": 1,
                "native_parallel": False
            }
        },
        initial_workload=workload.Workload(
            workload.Workload.Kind.RUN_STEP,
            determined.common.types.ExperimentID(1),
            determined.common.types.TrialID(1),
            determined.common.types.StepID(1),
            constants.DEFAULT_SCHEDULING_UNIT,
            0,
        ),
        latest_checkpoint=None,
        use_gpu=False,
        container_gpus=[],
        slot_ids=[],
        debug=False,
        workload_manager_type="",
        hparams={"global_batch_size": 1},
        det_rendezvous_port="",
        det_trial_unique_port_offset=0,
        det_trial_runner_network_interface=constants.AUTO_DETECT_TRIAL_RUNNER_NETWORK_INTERFACE,
        det_trial_id="1",
        det_agent_id="1",
        det_experiment_id="1",
        det_task_token="",
        det_cluster_id="uuid-123",
        trial_seed=0,
        managed_training=True,
        test_mode=False,
        on_cluster=False,
    )
    def validate(self,
                 op: Optional[core.SearcherOperation]) -> WorkloadGenerator:
        # Report that a validation step is starting.
        self.core_context.train.set_status("validating")

        wkld = workload.Workload(
            kind=workload.Workload.Kind.COMPUTE_VALIDATION_METRICS,
            e_id=self._exp_id,
            t_id=self._trial_id,
            s_id=self.state.step_id,
            num_batches=0,
            total_batches_processed=self.state.steps_completed,
        )

        response = yield from yield_and_await_response(wkld)

        # Validation step is complete, process the result.

        if isinstance(response, workload.InvalidHP):
            self.core_context.train.report_early_exit(
                core.EarlyExitReason.INVALID_HP)
            raise ShouldExit()

        metrics = response["metrics"]["validation_metrics"]

        # Check that the validation metrics computed by the model code
        # include the metric used by the search method.
        searcher_metric_name = self.env.experiment_config["searcher"]["metric"]
        if searcher_metric_name not in metrics:
            raise RuntimeError(
                f"Search method is configured to use metric '{searcher_metric_name}' but model "
                f"definition returned validation metrics {list(metrics.keys())}. The metric "
                "used by the search method must be one of the validation "
                "metrics returned by the model definition.")

        # Check that the searcher metric has a scalar value so that it can be compared for
        # search purposes. Other metrics don't have to be scalars.
        searcher_metric = metrics[searcher_metric_name]
        if not tensorboard.metric_writers.util.is_numerical_scalar(
                searcher_metric):
            raise RuntimeError(
                f"Searcher validation metric '{searcher_metric_name}' returned "
                f"a non-scalar value: {searcher_metric}")

        # Report to the searcher API first, so we don't end up in a situation where we die right
        # after reporting to the metrics API and then, on restart, refuse to repeat the validation
        # while having no validation metrics to report to the searcher API.
        #
        # A simpler solution here would be to execute in the following order (which would be
        # suitable for most customers to implement on their own):
        #   - validation
        #   - report to metrics API
        #   - report to searcher API
        #   - checkpoint
        #
        # But we can't do that without breaking behavior.
        if op is not None and self.batches_until_op_complete(op) < 1:
            op.report_completed(searcher_metric)

        if self.ckpt_policy == "best" and not self.checkpoint_is_current():
            # Before reporting our own validation metric, check what the best known validation is
            # without it.
            best_validation_before = (
                self.core_context.train.get_experiment_best_validation()
            )

        self.state.last_val = self.state.steps_completed
        self.core_context.train.report_validation_metrics(
            steps_completed=self.state.steps_completed,
            metrics=metrics,
        )

        if response.get("stop_requested"):
            raise ShouldExit()

        if not self.checkpoint_is_current():
            if self.ckpt_policy == "all" or (
                    self.ckpt_policy == "best" and self.is_best_validation(
                        now=searcher_metric, before=best_validation_before)):
                yield from self.checkpoint(already_exiting=False)

        self.check_for_preemption()
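
    # Minimal sketches of two helpers that validate() relies on, under the
    # assumptions noted below (the real implementations may differ):
    def checkpoint_is_current(self) -> bool:
        # A checkpoint is current if nothing has been trained since the last one.
        return self.state.last_ckpt == self.state.steps_completed

    def is_best_validation(self, now: float, before: Optional[float]) -> bool:
        # Assumes smaller_is_better is read from the searcher config and defaults
        # to True, as in the standard experiment config schema.
        smaller_is_better = self.env.experiment_config["searcher"].get(
            "smaller_is_better", True)
        if before is None:
            return True
        return now < before if smaller_is_better else now > before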