Example 1
    def _init_run_config(
            self, config: tf.estimator.RunConfig) -> tf.estimator.RunConfig:
        logging.debug(f"Initializing RunConfig. Got RunConfig: {config} .")

        session_config = config.session_config
        train_distribute = None
        eval_distribute = None

        # The default session should already be defined; here we also set the session
        # for the estimator itself.
        self._init_session_config(session_config, self.env, self.hvd_config)

        if not self.hvd_config.use and len(self.env.container_gpus) > 1:
            check.true(len(self.rendezvous_info.get_addrs()) == 1)
            train_distribute = tf.distribute.MirroredStrategy()
            eval_distribute = tf.distribute.MirroredStrategy()

        config = config.replace(
            model_dir=str(self.estimator_dir),
            tf_random_seed=self.env.trial_seed,
            save_checkpoints_steps=None,
            # `train_and_evaluate()` requires that either
            # `save_checkpoints_steps` or `save_checkpoints_secs` is
            # set to greater than 0.
            save_checkpoints_secs=VERY_LARGE_NUMBER,
            session_config=session_config,
            train_distribute=train_distribute,
            eval_distribute=eval_distribute,
            experimental_distribute=None,
        )
        logging.debug(f"Initialized RunConfig with args: {config}.")
        return config
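
The method above relies on tf.estimator.RunConfig.replace() returning a new, updated config rather than mutating the original. Below is a minimal, self-contained sketch of that behavior, assuming TensorFlow with the tf.estimator API is installed; the directory, seed, and the 10 ** 9 stand-in for VERY_LARGE_NUMBER are illustrative only.

import tensorflow as tf

base = tf.estimator.RunConfig(save_checkpoints_steps=100)
updated = base.replace(
    model_dir="/tmp/estimator_dir",   # hypothetical checkpoint directory
    tf_random_seed=42,                # hypothetical trial seed
    save_checkpoints_steps=None,
    save_checkpoints_secs=10 ** 9,    # stands in for VERY_LARGE_NUMBER
)
assert base.save_checkpoints_steps == 100        # the original config is untouched
assert updated.save_checkpoints_secs == 10 ** 9  # the copy carries the new values
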
Example 2
 def average_metrics(self, metrics: Dict[str,
                                         Any]) -> Optional[Dict[str, Any]]:
     check.true(self.hvd_config.use)
     if self.is_chief:
         self.train_process_comm_chief = cast(ipc.ZMQBroadcastServer,
                                              self.train_process_comm_chief)
         logging.debug(
             f"Chief {hvd.rank()} beginning receiving validation metrics.")
         worker_metrics, _ = self.train_process_comm_chief.gather_with_polling(
             lambda: None)
         self.train_process_comm_chief.broadcast(None)
         logging.debug(
             f"Chief {hvd.rank()} done receiving validation metrics.")
         for metric_name in metrics:
             if isinstance(metrics[metric_name], numbers.Number):
                 metrics[metric_name] /= hvd.size()
             else:
                 logging.warning(
                     f"Skipping averaging metric: {metric_name}.")
         for metric_name in metrics.keys():
             for worker_metric in worker_metrics:
                 if isinstance(worker_metric[metric_name], numbers.Number):
                     metrics[metric_name] += worker_metric[
                         metric_name] / hvd.size()
         return metrics
     else:
         self.train_process_comm_worker = cast(
             ipc.ZMQBroadcastClient, self.train_process_comm_worker)
         logging.debug(f"Worker {hvd.rank()} sending metrics.")
         self.train_process_comm_worker.send(metrics)
         # Synchronize with the chief so that there is no risk of accidentally calling send()
         # for a future gather before all workers have called send() on this gather.
         _ = self.train_process_comm_worker.recv()
         return None
 def set_runpy_trial_result(
     cls, trial_cls: Type[det.Trial], controller_cls: Type[det.TrialController]
 ) -> None:
     check.true(cls.get_instance().controller_cls is None, "Please don't load twice.")
     cls.get_instance().trial_cls = trial_cls
     cls.get_instance().controller_cls = controller_cls
     raise det.errors.StopLoadingImplementation()
Example 4
    def wrap_scaler(self, scaler: Any) -> Any:
        """
        Prepares to use automatic mixed precision through PyTorch’s native AMP API. The returned
        scaler should be passed to ``step_optimizer``, but usage does not otherwise differ from
        vanilla PyTorch APIs. Loss should be scaled before calling ``backward``, ``unscale_`` should
        be called before clipping gradients, ``update`` should be called after stepping all
        optimizers, etc.

        PyTorch 1.6 or greater is required for this feature.

        Arguments:
            scaler (``torch.cuda.amp.GradScaler``):  Scaler to wrap and track.

        Returns:
            The scaler. It may be wrapped to add additional functionality for use in Determined.
        """

        check.false(amp_import_error, "Failed to import torch.cuda.amp. PyTorch >= 1.6 required.")

        check.false(self._use_apex, "Do not mix APEX with PyTorch AMP.")

        check.is_none(self._scaler, "Please only call wrap_scaler or use_amp once.")

        check.true(len(self.models) == 0, "Please call wrap_scaler before wrap_model.")

        check.true(
            torch.cuda.is_available(),
            "Mixed precision training (AMP) is supported only on GPU slots.",
        )

        self._scaler = scaler

        return scaler
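
A hedged usage sketch of the pattern the docstring describes: wrap the GradScaler before any model, scale the loss before backward, pass the scaler to step_optimizer, and call update() afterwards. The trial class, model, and optimizer names below are illustrative assumptions, and required methods such as build_training_data_loader are omitted.

import torch
from determined import pytorch as det_pytorch

class AmpTrialSketch(det_pytorch.PyTorchTrial):  # abbreviated, hypothetical trial
    def __init__(self, context: det_pytorch.PyTorchTrialContext) -> None:
        self.context = context
        # wrap_scaler must come before wrap_model (see the check above).
        self.scaler = context.wrap_scaler(torch.cuda.amp.GradScaler())
        self.model = context.wrap_model(torch.nn.Linear(10, 1))
        self.optimizer = context.wrap_optimizer(
            torch.optim.SGD(self.model.parameters(), lr=0.1)
        )

    def train_batch(self, batch, epoch_idx, batch_idx):
        data, labels = batch
        with torch.cuda.amp.autocast():
            loss = torch.nn.functional.mse_loss(self.model(data), labels)
        # Scale the loss before backward; step via step_optimizer; then update.
        self.context.backward(self.scaler.scale(loss))
        self.context.step_optimizer(self.optimizer, scaler=self.scaler)
        self.scaler.update()
        return {"loss": loss}
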
    def _initialize_train_process_comm(self) -> None:
        check.true(self.hvd_config.use)

        srv_pub_port = (constants.INTER_TRAIN_PROCESS_COMM_PORT_1 +
                        self.env.det_trial_unique_port_offset)
        srv_pull_port = (constants.INTER_TRAIN_PROCESS_COMM_PORT_2 +
                         self.env.det_trial_unique_port_offset)

        if self.is_chief:
            logging.debug(
                f"Chief setting up server with ports {srv_pub_port}/{srv_pull_port}."
            )
            self.train_process_comm_chief = ipc.ZMQBroadcastServer(
                num_connections=self.env.experiment_config.slots_per_trial() -
                1,
                pub_port=srv_pub_port,
                pull_port=srv_pull_port,
            )
        else:
            chief_ip_address = self.rendezvous_info.get_ip_addresses()[0]
            logging.debug(f"Non-Chief {hvd.rank()} setting up comm to "
                          f"{chief_ip_address} w/ ports "
                          f"{srv_pub_port}/{srv_pull_port}.")
            self.train_process_comm_worker = ipc.ZMQBroadcastClient(
                srv_pub_url=f"tcp://{chief_ip_address}:{srv_pub_port}",
                srv_pull_url=f"tcp://{chief_ip_address}:{srv_pull_port}",
            )
 def set_runpy_native_result(
         cls, context: det.NativeContext,
         controller_cls: Type[det.TrialController]) -> None:
     check.true(cls.get_instance().controller_cls is None,
                "Please don't load twice.")
     cls.get_instance().context = context
     cls.get_instance().controller_cls = controller_cls
Example 7
    def after_run(self, run_context: tf.estimator.SessionRunContext,
                  run_values: tf.estimator.SessionRunValues) -> None:
        # Check for optimizer creation here because when model_fn is passed in as a closure,
        # the optimizer is not initialized until the first training step.
        check.true(
            self.estimator_trial_controller.context.optimizer_initialized,
            "Please pass your optimizer into "
            "`det.estimator.wrap_optimizer(optimizer)` "
            "right after creating it.",
        )
        self._session = run_context.session
        self._current_global_step = int(run_values.results["global_step"])

        self.num_batches = cast(int, self.num_batches)
        self._collect_batch_metrics(run_values)
        self.batches_processed_in_step += 1
        if self.batches_processed_in_step < self.num_batches:
            return

        # TODO: Average training results across GPUs. This might
        # degrade performance due to an increase in communication.

        # Loss training metric is sometimes called `loss_1` instead of `loss`.
        for step_metrics in self.step_metrics:
            if "loss" not in step_metrics and "loss_1" in step_metrics:
                step_metrics["loss"] = step_metrics["loss_1"]

        # Send the result of the training step back to the main process.
        check.is_not_none(self.train_response_func,
                          "no response_func at end of train_for_step")
        self.train_response_func = cast(workload.ResponseFunc,
                                        self.train_response_func)
        if self.estimator_trial_controller.is_chief:
            response = {
                "metrics":
                det.util.make_metrics(self.batches_processed_in_step,
                                      self.step_metrics),
                "stop_requested":
                self.estimator_trial_controller.context.get_stop_requested(),
                "invalid_hp":
                False,
                "init_invalid_hp":
                False,
            }
            self.train_response_func(response)
        else:
            self.train_response_func(workload.Skipped())

        # Reset step counter and clear the step metrics from memory.
        self.train_response_func = None
        self.batches_processed_in_step = 0
        self.step_metrics = []

        estimator._cleanup_after_train_step(
            self.estimator_trial_controller.estimator_dir)

        # Re-enter the control loop (block on receiving the next instruction)
        self.control_loop()
Example 8
    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)

        check_startup_hook_ran = self.env.hparams.get("check_startup_hook_ran",
                                                      False)
        if check_startup_hook_ran:
            check.true(os.path.isfile("startup-hook-ran"),
                       "File should exists.")

        self.chaos = random.SystemRandom()
        self._batch_size = self.context.get_per_slot_batch_size()
        self.chaos_probability = self.env.hparams.get("chaos_probability", 0)
        self.chaos_probability_train = self.env.hparams.get(
            "chaos_probability_train")
        self.chaos_probability_validate = self.env.hparams.get(
            "chaos_probability_validate")
        self.chaos_probability_checkpoint = self.env.hparams.get(
            "chaos_probability_checkpoint")
        self.fail_on_first_validation = self.env.hparams.get(
            "fail_on_first_validation", "")
        self.fail_on_chechpoint_save = self.env.hparams.get(
            "fail_on_chechpoint_save", "")
        self.validation_set_size = self.env.hparams.get(
            "validation_set_size", 32 * 32)
        self.train_batch_secs = self.env.hparams.get("training_batch_seconds",
                                                     0)
        self.validation_secs = self.env.hparams.get(
            "validation_seconds",
            self.validation_set_size * self.train_batch_secs /
            self._batch_size,
        )
        self.num_training_metrics = self.env.hparams.get(
            "num_training_metrics", 1)
        assert self.num_training_metrics > 0
        self.num_validation_metrics = self.env.hparams.get(
            "num_validation_metrics", 1)
        assert self.num_validation_metrics > 0
        self.save_secs = self.env.hparams.get("save_checkpoint_seconds", 0)
        self.load_secs = self.env.hparams.get("load_checkpoint_secs", 0)
        self.metrics_progression = self.env.hparams.get(
            "metrics_progression", "decreasing")
        assert self.metrics_progression in ("increasing", "decreasing",
                                            "constant")
        self.metrics_base = self.env.hparams.get("metrics_base", 0.9)
        assert 0 < self.metrics_base < 1
        self.metrics_sigma = self.env.hparams.get("metrics_sigma", 0.0)
        assert 0 <= self.metrics_sigma
        self.write_null = self.env.hparams.get("write_null", False)

        self.request_stop = self.env.hparams.get("request_stop", False)

        if self.load_path is None:
            self.trained_steps = collections.Counter()
        else:
            self.load(self.load_path)
Example 9
    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)

        check_startup_hook_ran = self.env.hparams.get("check_startup_hook_ran", False)
        if check_startup_hook_ran:
            check.true(os.path.isfile("startup-hook-ran"), "File should exists.")

        self.chaos = random.SystemRandom()
        self._batch_size = self.context.get_per_slot_batch_size()
        self.chaos_probability = self.env.hparams.get("chaos_probability", 0)
        self.chaos_probability_train = self.env.hparams.get("chaos_probability_train")
        self.chaos_probability_validate = self.env.hparams.get("chaos_probability_validate")
        self.chaos_probability_checkpoint = self.env.hparams.get("chaos_probability_checkpoint")
        self.nan_probability_validate = self.env.hparams.get("nan_probability_validate", 0)
        self.fail_on_first_validation = self.env.hparams.get("fail_on_first_validation", "")
        self.fail_on_chechpoint_save = self.env.hparams.get("fail_on_chechpoint_save", "")
        self.validation_set_size = self.env.hparams.get("validation_set_size", 32 * 32)
        self.train_batch_secs = self.env.hparams.get("training_batch_seconds", 0)
        self.validation_secs = self.env.hparams.get(
            "validation_seconds",
            self.validation_set_size * self.train_batch_secs / self._batch_size,
        )
        self.num_training_metrics = self.env.hparams.get("num_training_metrics", 1)
        assert self.num_training_metrics > 0
        self.num_validation_metrics = self.env.hparams.get("num_validation_metrics", 1)
        assert self.num_validation_metrics > 0
        self.save_secs = self.env.hparams.get("save_checkpoint_seconds", 0)
        self.load_secs = self.env.hparams.get("load_checkpoint_secs", 0)
        self.metrics_progression = self.env.hparams.get("metrics_progression", "decreasing")
        assert self.metrics_progression in ("increasing", "decreasing", "constant")
        self.metrics_base = self.env.hparams.get("metrics_base", 0.9)
        assert 0 < self.metrics_base < 1
        self.metrics_sigma = self.env.hparams.get("metrics_sigma", 0.0)
        assert 0 <= self.metrics_sigma
        self.write_null = self.env.hparams.get("write_null", False)

        self.request_stop = self.env.hparams.get("request_stop", False)

        self.non_chief_exit_immediately = self.env.hparams.get("non_chief_exit_immediately", False)

        self.wlsq = None
        if self.workloads is None:
            self.workloads, self.wlsq = layers.make_compatibility_workloads(
                self.context._core, self.env, self.context.get_global_batch_size()
            )

        self.steps_completed = self.env.steps_completed

        if self.env.latest_checkpoint is not None:
            with self.context._core.checkpoint.restore_path(
                self.env.latest_checkpoint
            ) as load_path:
                self.load(pathlib.Path(load_path))
        else:
            self.trained_steps = collections.Counter()
 def get_runpy_result(
     cls,
 ) -> Tuple[Optional[det.NativeContext], Optional[Type[det.Trial]], Type[det.TrialController]]:
     check.true(
         cls.get_instance().controller_cls is not None, "Please load native implementation."
     )
     return (
         cls.get_instance().context,
         cls.get_instance().trial_cls,
         cast(Type[det.TrialController], cls.get_instance().controller_cls),
     )
Example 11
    def average_metrics(self, metrics: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        check.true(self.hvd_config.use)
        all_metrics = self.context.distributed._zmq_gather(metrics)
        if not self.is_chief:
            return None
        assert all_metrics is not None, "chief did not get metrics from _zmq_gather()"

        for key in metrics:
            if isinstance(metrics[key], numbers.Number):
                metrics[key] = sum(m[key] for m in all_metrics) / hvd.size()
            else:
                logging.warning(f"Skipping averaging metric: {key}.")
        return metrics
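
A self-contained numeric sketch of the averaging above: the chief gathers one metrics dict per rank and replaces each numeric metric with its mean across ranks, leaving non-numeric values alone. The ranks and values are made up; no Horovod is needed for the arithmetic.

import numbers

all_metrics = [                        # stand-in for distributed._zmq_gather(metrics)
    {"loss": 0.25, "tag": "rank0"},    # chief's own metrics
    {"loss": 0.75, "tag": "rank1"},    # gathered from a worker
]
world_size = len(all_metrics)          # stands in for hvd.size()

averaged = dict(all_metrics[0])
for key, value in averaged.items():
    if isinstance(value, numbers.Number):
        averaged[key] = sum(m[key] for m in all_metrics) / world_size
    # non-numeric metrics (like "tag") are skipped, mirroring the warning above

assert averaged["loss"] == 0.5
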
def load_native_implementation_controller(
    env: det.EnvContext,
    workloads: workload.Stream,
    load_path: Optional[pathlib.Path],
    rendezvous_info: det.RendezvousInfo,
    hvd_config: horovod.HorovodContext,
) -> det.TrialController:
    check.true(
        env.experiment_config.native_enabled(),
        "Experiment configuration does not have an internal.native "
        f"configuration: {env.experiment_config}",
    )

    context, trial_class, controller_class = load.load_native_implementation(
        env, hvd_config)

    if trial_class is not None:
        return load_controller_from_trial(
            trial_class=trial_class,
            env=env,
            workloads=workloads,
            load_path=load_path,
            rendezvous_info=rendezvous_info,
            hvd_config=hvd_config,
        )

    else:
        # Framework-specific native implementation.
        check.is_not_none(
            controller_class,
            "The class attribute `trial_controller_class` is "
            "None; please set it the correct subclass of `det.TrialController`",
        )
        check.is_subclass(
            controller_class,
            det.TrialController,
            "The class attribute `trial_controller_class` is "
            "not a valid subclass of `det.TrialController`",
        )
        logging.info(
            f"Creating {controller_class.__name__} with {type(context).__name__}."
        )
        return cast(det.TrialController, controller_class).from_native(
            context=cast(det.NativeContext, context),
            env=env,
            workloads=workloads,
            load_path=load_path,
            rendezvous_info=rendezvous_info,
            hvd_config=hvd_config,
        )
Example 13
 def restore_path(self, src: str) -> Iterator[pathlib.Path]:
     """
     Prepare a local directory exposing the checkpoint. Do some simple checks to make sure the
     configuration seems reasonable.
     """
     check.true(
         os.path.exists(self._base_path),
         f"Storage directory does not exist: {self._base_path}. Please verify that you are "
         "using the correct configuration value for checkpoint_storage.host_path",
     )
     storage_dir = os.path.join(self._base_path, src)
     if not os.path.exists(storage_dir):
         raise errors.CheckpointNotFound(
             f"Did not find checkpoint {src} in shared_fs storage")
     yield pathlib.Path(storage_dir)
def convert_notebook_to_python_script(notebook_path: str) -> str:
    check.check_true(
        notebook_path.endswith(".ipynb"), f"Notebook file {notebook_path} must has a suffix .ipynb"
    )
    processed_cells_path = f"{notebook_path[:-6]}__det__.py"

    with open(notebook_path, "r") as f1, open(processed_cells_path, "w") as f2:
        obj = json.load(f1)
        check.true("cells" in obj, f"Invalid notebook file {notebook_path}")
        for cell in obj["cells"]:
            if cell["cell_type"] == "code":
                lines = [line for line in cell["source"] if not line.lstrip().startswith("!")]
                f2.writelines(lines)
                f2.write("\n")
    return processed_cells_path
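
A minimal round-trip sketch of the converter above: write a tiny notebook JSON to a temporary directory, convert it, and read the generated script back. It assumes convert_notebook_to_python_script (and its check dependency) are importable in the current scope; the file name is illustrative.

import json
import os
import tempfile

with tempfile.TemporaryDirectory() as tmp:
    nb_path = os.path.join(tmp, "demo.ipynb")
    notebook = {
        "cells": [
            {"cell_type": "markdown", "source": ["# prose, not code\n"]},
            {"cell_type": "code", "source": ["!pip install foo\n", "x = 1\n"]},
        ]
    }
    with open(nb_path, "w") as f:
        json.dump(notebook, f)

    script_path = convert_notebook_to_python_script(nb_path)
    with open(script_path) as f:
        print(f.read())  # only "x = 1" survives; the "!pip ..." shell line is dropped
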
Example 15
 def restore_path(self, metadata: StorageMetadata) -> Iterator[str]:
     """
     Prepare a local directory exposing the checkpoint. Do some simple checks to make sure the
     configuration seems reasonable.
     """
     storage_dir = os.path.join(self._base_path, metadata.storage_id)
     check.true(
         os.path.exists(storage_dir),
         "Storage directory does not exist: {}. Please verify "
         "that you are using the correct configuration value for "
         "checkpoint_storage.host_path".format(storage_dir),
     )
     check.true(
         os.path.isdir(storage_dir),
         "Checkpoint path is not a directory: {}".format(storage_dir))
     yield storage_dir
Example 16
    def _average_training_metrics(
            self, per_batch_metrics: List[Dict[str,
                                               Any]]) -> List[Dict[str, Any]]:
        """Average training metrics across GPUs"""
        check.true(self.hvd_config.use,
                   "Can only average training metrics in multi-GPU training.")
        metrics_timeseries = util._list_to_dict(per_batch_metrics)

        # combined_timeseries is: dict[metric_name] -> 2d-array.
        # A measurement is accessed via combined_timeseries[metric_name][process_idx][batch_idx].
        combined_timeseries, _ = self._combine_metrics_across_processes(
            metrics_timeseries, num_batches=len(per_batch_metrics))

        # If the value for a metric is a single-element array, the averaging process will
        # change that into just the element. We record what metrics are single-element arrays
        # so we can wrap them in an array later (for perfect compatibility with non-averaging
        # codepath).
        array_metrics = []
        for metric_name in per_batch_metrics[0].keys():
            if isinstance(per_batch_metrics[0][metric_name], np.ndarray):
                array_metrics.append(metric_name)

        if self.is_chief:
            combined_timeseries_type = Dict[str, List[List[Any]]]
            combined_timeseries = cast(combined_timeseries_type,
                                       combined_timeseries)
            num_batches = len(per_batch_metrics)
            num_processes = hvd.size()
            averaged_metrics_timeseries = {}  # type: Dict[str, List]

            for metric_name in combined_timeseries.keys():
                averaged_metrics_timeseries[metric_name] = []
                for batch_idx in range(num_batches):
                    batch = [
                        combined_timeseries[metric_name][process_idx]
                        [batch_idx] for process_idx in range(num_processes)
                    ]

                    np_batch = np.array(batch)
                    batch_avg = np.mean(
                        np_batch[np_batch != None])  # noqa: E711
                    if metric_name in array_metrics:
                        batch_avg = np.array(batch_avg)
                    averaged_metrics_timeseries[metric_name].append(batch_avg)
            per_batch_metrics = util._dict_to_list(averaged_metrics_timeseries)
        return per_batch_metrics
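
A standalone numpy sketch of the chief-side averaging above: combined_timeseries maps a metric name to a [process][batch] table, and each batch column is averaged across processes while None entries (from processes that reported nothing for that batch) are ignored. The numbers are illustrative and chosen to be exactly representable.

import numpy as np

combined_timeseries = {"loss": [[0.5, 0.25], [0.75, None]]}  # 2 processes, 2 batches
num_processes = 2
num_batches = 2

averaged = {}
for name, per_process in combined_timeseries.items():
    averaged[name] = []
    for batch_idx in range(num_batches):
        batch = [per_process[p][batch_idx] for p in range(num_processes)]
        np_batch = np.array(batch)
        batch_avg = np.mean(np_batch[np_batch != None])  # noqa: E711
        averaged[name].append(float(batch_avg))

assert averaged == {"loss": [0.625, 0.25]}
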
Example 17
    def _combine_metrics_across_processes(
        self, metrics: Dict[str, Any], num_batches: int
    ) -> Tuple[Optional[Dict[str, Any]], Optional[List[int]]]:
        # The chief receives the metric from every other training process.
        check.true(self.hvd_config.use)

        # all_args is a list of [(metrics, num_batches), ...] for each worker.
        all_args = self.context.distributed._zmq_gather((metrics, num_batches))

        if not self.is_chief:
            return None, None

        # Reshape so e.g. all_metrics = [metrics, metrics, ...].
        all_metrics, all_num_batches = zip(*all_args)

        # convert all_metrics from List[Dict[str, Any]] to Dict[str, List[Any]].
        metrics_lists = {key: [m[key] for m in all_metrics] for key in metrics}

        return metrics_lists, all_num_batches
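
A standalone sketch of the reshaping above: every rank contributes a (metrics, num_batches) tuple, and the chief turns the gathered list into one list per metric plus a list of batch counts. The values are purely illustrative.

all_args = [
    ({"loss": 0.2, "acc": 0.9}, 10),  # rank 0 (chief)
    ({"loss": 0.4, "acc": 0.8}, 12),  # rank 1
]

all_metrics, all_num_batches = zip(*all_args)
metrics_lists = {key: [m[key] for m in all_metrics] for key in all_metrics[0]}

assert metrics_lists == {"loss": [0.2, 0.4], "acc": [0.9, 0.8]}
assert list(all_num_batches) == [10, 12]
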
Example 18
    def _launch_horovodrun(self) -> subprocess.Popen:
        check.true(self.hvd_config.use)
        logging.debug(f"Starting training process on: {self.rendezvous_info.get_rank()}.")

        horovod_process_cmd = horovod.create_run_command(
            num_proc_per_machine=self.num_proc,
            ip_addresses=self.rendezvous_info.get_ip_addresses(),
            env=self.env,
            debug=self.env.experiment_config.debug_enabled(),
            optional_args=self.env.experiment_config.horovod_optional_args(),
            worker_process_env_path=self._worker_process_env_path,
        )
        subprocess_env = {
            **os.environ,
            "NCCL_DEBUG": "INFO",
            "DET_HOROVOD_GLOO_RENDEZVOUS_PORT": str(
                constants.HOROVOD_GLOO_RENDEZVOUS_PORT + self.env.det_trial_unique_port_offset
            ),
        }
        return subprocess.Popen(horovod_process_cmd, env=subprocess_env)
Example 19
def test_noop_pause() -> None:
    """
    Walk through starting, pausing, and resuming a single no-op experiment.
    """
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    exp.wait_for_experiment_state(experiment_id, bindings.determinedexperimentv1State.STATE_ACTIVE)

    # Wait for the only trial to get scheduled.
    exp.wait_for_experiment_active_workload(experiment_id)

    # Wait for the only trial to show progress, indicating the image is built and running.
    exp.wait_for_experiment_workload_progress(experiment_id)

    # Pause the experiment. Note that Determined does not currently differentiate
    # between a "stopping paused" and a "paused" state, so we follow this check
    # up by ensuring the experiment cleared all scheduled workloads.
    exp.pause_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, bindings.determinedexperimentv1State.STATE_PAUSED)

    # Wait at most 20 seconds for the experiment to clear all workloads (each
    # train step should take 5 seconds).
    for _ in range(20):
        workload_active = exp.experiment_has_active_workload(experiment_id)
        if not workload_active:
            break
        else:
            time.sleep(1)
    check.true(
        not workload_active,
        "The experiment cannot be paused within 20 seconds.",
    )

    # Resume the experiment and wait for completion.
    exp.activate_experiment(experiment_id)
    exp.wait_for_experiment_state(
        experiment_id, bindings.determinedexperimentv1State.STATE_COMPLETED
    )
Example 20
def load_native(
    env: det.EnvContext,
    workloads: workload.Stream,
    load_path: Optional[pathlib.Path],
    rendezvous_info: det.RendezvousInfo,
    hvd_config: horovod.HorovodContext,
) -> det.TrialController:
    check.true(
        env.experiment_config.native_enabled(),
        "Experiment configuration does not have an internal.native "
        f"configuration: {env.experiment_config}",
    )

    trial_class = get_trial_from_native(env, hvd_config, rendezvous_info)
    return load.load_trial(
        trial_class=trial_class,
        env=env,
        workloads=workloads,
        load_path=load_path,
        rendezvous_info=rendezvous_info,
        hvd_config=hvd_config,
    )
Example 21
        def wrapper(*args: Any, **kwargs: Any) -> tf.data.Dataset:
            ds = f(*args, **kwargs)

            if self.context.experimental.get_train_cacheable().is_decorator_used():
                check.false(
                    self.context.dataset_initialized,
                    "Please do not use: `context.wrap_dataset(dataset)` if using "
                    "`@context.experimental.cache_train_dataset(dataset_name, dataset_version)` "
                    "and `@context.experimental.cache_validation_dataset(dataset_name, "
                    "dataset_version)`.",
                )
            else:
                check.true(
                    self.context.dataset_initialized,
                    "Please pass your datasets (train and test) into "
                    "`context.wrap_dataset(dataset)` right after creating them.",
                )

            if isinstance(ds, tf.data.Dataset):
                ds = ds.repeat()

            return ds
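
A hedged sketch of the usage the checks above enforce: pass the dataset through context.wrap_dataset() right after constructing it, unless the experimental cache_train_dataset decorator is handling caching instead. The context argument is assumed to be a Determined TF trial context that provides wrap_dataset(); the tensors are illustrative.

import tensorflow as tf

def build_train_dataset(context):
    ds = tf.data.Dataset.from_tensor_slices(([1.0, 2.0, 3.0], [1, 0, 1]))
    ds = context.wrap_dataset(ds)  # wrap immediately after creating the dataset
    ds = ds.shuffle(3).batch(2)
    return ds
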
Example 22
    def _prepare_metrics_reducers(self, keys: Any) -> Dict[str, pytorch.Reducer]:
        metrics_reducers = {}  # type: Dict[str, pytorch.Reducer]
        reducer = self.trial.evaluation_reducer()
        if isinstance(reducer, Dict):
            metrics_reducers = reducer
            check.eq(
                metrics_reducers.keys(),
                keys,
                "Please provide a single evaluation reducer or "
                "provide a reducer for every validation metric. "
                f"Expected keys: {keys}, provided keys: {metrics_reducers.keys()}.",
            )
        elif isinstance(reducer, pytorch.Reducer):
            for key in keys:
                metrics_reducers[key] = reducer

        for key in keys:
            check.true(
                isinstance(metrics_reducers[key], pytorch.Reducer),
                "Please select `determined.pytorch.Reducer` for reducing validation metrics.",
            )

        return metrics_reducers
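
A sketch of the two evaluation_reducer() shapes the method above accepts: a single determined.pytorch.Reducer applied to every validation metric, or a dict with one reducer per metric name. The trial classes and metric names are illustrative.

from determined import pytorch

class SingleReducerTrialSketch:          # stands in for a PyTorchTrial subclass
    def evaluation_reducer(self) -> pytorch.Reducer:
        return pytorch.Reducer.AVG       # average every validation metric

class PerMetricReducerTrialSketch:
    def evaluation_reducer(self):
        return {
            "val_loss": pytorch.Reducer.AVG,
            "num_errors": pytorch.Reducer.SUM,
        }
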
Example 23
    def _combine_metrics_across_processes(
        self, metrics: Dict[str, Any], num_batches: int
    ) -> Tuple[Optional[Dict[str, Any]], Optional[List[int]]]:
        # The chief receives the metric from every other training process.
        check.true(self.hvd_config.use)

        metrics_lists = {}  # type: Dict[str, Any]
        batches_per_process = []  # type: List[int]
        if self.is_chief:
            self.train_process_comm_chief = cast(
                ipc.ZMQBroadcastServer, self.train_process_comm_chief
            )
            worker_metrics, _ = self.train_process_comm_chief.gather_with_polling(lambda: None)
            self.train_process_comm_chief.broadcast(None)
            worker_metrics = cast(List[ipc.MetricsInfo], worker_metrics)

            for metric_name in metrics.keys():
                metrics_lists[metric_name] = [metrics[metric_name]]
                for worker_metric in worker_metrics:
                    metrics_lists[metric_name].append(worker_metric.metrics[metric_name])

            batches_per_process.append(num_batches)
            for worker_metric in worker_metrics:
                batches_per_process.append(worker_metric.num_batches)

            return metrics_lists, batches_per_process
        else:
            self.train_process_comm_worker = cast(
                ipc.ZMQBroadcastClient, self.train_process_comm_worker
            )
            self.train_process_comm_worker.send(
                ipc.MetricsInfo(metrics=metrics, num_batches=num_batches)
            )
            # Synchronize with the chief so that there is no risk of accidentally calling send()
            # for a future gather before all workers have called send() on this gather.
            _ = self.train_process_comm_worker.recv()
            return None, None
Example 24
def _full_storage_path(
    host_path: str,
    storage_path: Optional[str] = None,
    container_path: Optional[str] = None,
) -> str:
    """
    Return the full path to the storage_path, either as a subdirectory of the host_path in the
    host environment, where container_path must be None, or as a subdirectory of the container_path
    when in the container environment, where container_path must not be None.
    """
    check.true(os.path.isabs(host_path),
               "`host_path` must be an absolute path.")

    if storage_path is None:
        return host_path if container_path is None else container_path

    # Note that os.path.join() will just return storage_path when it is absolute.
    abs_path = os.path.normpath(os.path.join(host_path, storage_path))
    check.true(abs_path.startswith(host_path),
               "storage path must be a subdirectory of host path.")
    storage_path = os.path.relpath(abs_path, host_path)

    return os.path.join(
        host_path if container_path is None else container_path, storage_path)
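
Worked examples of the path composition above, assuming _full_storage_path and its check dependency are in scope; the host and container paths are illustrative. On the host the checkpoint lands under host_path, while in a container the same relative storage_path is re-rooted under container_path.

host = "/mnt/checkpoints"

print(_full_storage_path(host))
# -> /mnt/checkpoints
print(_full_storage_path(host, storage_path="exp1/trial3"))
# -> /mnt/checkpoints/exp1/trial3
print(_full_storage_path(host, "exp1/trial3", container_path="/determined_shared_fs"))
# -> /determined_shared_fs/exp1/trial3
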
Example 25
    def from_native(context: det.NativeContext, *args: Any,
                    **kwargs: Any) -> det.TrialController:
        check.is_instance(
            context,
            estimator.EstimatorNativeContext,
            "EstimatorTrialController needs an EstimatorSprinkleContext",
        )
        context = cast(estimator.EstimatorNativeContext, context)

        check.true(
            hasattr(context, "estimator") and hasattr(context, "train_spec")
            and hasattr(context, "eval_spec"),
            "Please call TFEstimatorExperiment.train_and_evaluate().",
        )

        return EstimatorTrialController(
            context.estimator,
            context.train_spec,
            context.eval_spec,
            context.serving_input_receiver_fns,
            context,
            *args,
            **kwargs,
        )
 def __enter__(self) -> "RunpyGlobals":
     check.true(RunpyGlobals._instance is None,
                "Please only use RunpyGlobals context once at a time.")
     RunpyGlobals._instance = self
     return self
Example 27
    def step_optimizer(
        self,
        optimizer: torch.optim.Optimizer,
        clip_grads: Optional[Callable[[Iterator], None]] = None,
        auto_zero_grads: bool = True,
        scaler: Optional[Any] = None,
        # Should be torch.cuda.amp.GradScaler, but:
        #   * other implementations might be possible
        #   * requiring this type forces upgrades to PyTorch 1.6+
    ) -> None:
        """
        Perform a single optimization step.

        This function must be called once for each optimizer. However, the order of
        different optimizers' steps can be specified by calling this function in different
        orders. Also, gradient accumulation across iterations is performed by the Determined
        training loop by setting the experiment configuration field
        :ref:`optimizations.aggregation_frequency <config-aggregation-frequency>`.

        Here is a code example:

        .. code-block:: python

            def clip_grads(params):
                torch.nn.utils.clip_grad_norm_(params, 0.0001)

            self.context.step_optimizer(self.opt1, clip_grads)

        Arguments:
            optimizer(``torch.optim.Optimizer``): Which optimizer should be stepped.
            clip_grads(a function, optional): This function should have one argument for
                parameters in order to clip the gradients.
            auto_zero_grads(bool, optional): Automatically zero out gradients after
                stepping the optimizer. If false, you need to call ``optimizer.zero_grad()``
                manually. Note that if :ref:`optimizations.aggregation_frequency
                <config-aggregation-frequency>` is greater than 1, ``auto_zero_grads`` must be true.
            scaler(``torch.cuda.amp.GradScaler``, optional): The scaler to use for stepping the
                optimizer. This should be unset if not using AMP, and is necessary if
                ``wrap_scaler()`` was called directly.
        """

        check.true(
            auto_zero_grads or self._aggregation_frequency == 1,
            "if optimizations.aggregation_frequency is larger than 1, "
            "you can only set auto_zero_grads to be true. ",
        )

        if not self._should_communicate_and_update():
            return

        # Communication needs to be synchronized so that it is completed
        # before we apply gradient clipping and `step()`.
        # In the case of APEX this is called in backward() instead, so that it's inside the context
        # manager and before unscaling.
        # In the case of PyTorch DDP, losses are synchronized during the backwards() pass.
        if (self.distributed.size > 1
                and self._distributed_backend.use_horovod()
                and not self._use_apex):
            with self._record_timing("train_batch.sync_optimizers",
                                     accumulate=True):
                optimizer.synchronize()  # type: ignore

        parameters = ([
            p for group in optimizer.param_groups
            for p in group.get("params", [])
        ] if not self._use_apex else apex.amp.master_params(optimizer))

        if self._average_aggregated_gradients:
            self._average_gradients(parameters=parameters,
                                    divisor=self._aggregation_frequency)

        if clip_grads is not None:
            if self._scaler and self.experimental._auto_amp:
                self._scaler.unscale_(optimizer)
            clip_grads(parameters)

        # For stepping the optimizer we will operate on the scaler passed
        # in, or fall back to the wrapped scaler (if any).
        if scaler is None and self.experimental._auto_amp:
            scaler = self._scaler
        if scaler:

            def step_fn() -> None:
                scaler.step(optimizer)  # type: ignore

        else:
            step_fn = optimizer.step  # type: ignore

        # In the case of PyTorch DDP, losses are synchronized automatically on the backwards() pass
        if self.distributed.size > 1 and self._distributed_backend.use_horovod():
            with optimizer.skip_synchronize():  # type: ignore
                step_fn()
        else:
            step_fn()

        if auto_zero_grads:
            optimizer.zero_grad()
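
A small sketch of the multi-optimizer ordering the docstring describes: step_optimizer is called once per optimizer, in whatever order the trial chooses, with an optional clip_grads callable per step. This is a train_batch body only; self.compute_losses, self.disc_opt, and self.gen_opt are hypothetical names.

import torch

def train_batch(self, batch, epoch_idx, batch_idx):
    gen_loss, disc_loss = self.compute_losses(batch)  # hypothetical helper

    self.context.backward(disc_loss)
    self.context.step_optimizer(
        self.disc_opt,
        clip_grads=lambda params: torch.nn.utils.clip_grad_norm_(params, 1.0),
    )

    self.context.backward(gen_loss)
    self.context.step_optimizer(self.gen_opt)

    return {"gen_loss": gen_loss, "disc_loss": disc_loss}
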
Example 28
    def configure_apex_amp(
        self,
        models: Union[torch.nn.Module, List[torch.nn.Module]],
        optimizers: Union[torch.optim.Optimizer, List[torch.optim.Optimizer]],
        enabled: Optional[bool] = True,
        opt_level: Optional[str] = "O1",
        cast_model_type: Optional[torch.dtype] = None,
        patch_torch_functions: Optional[bool] = None,
        keep_batchnorm_fp32: Optional[Union[bool, str]] = None,
        master_weights: Optional[bool] = None,
        loss_scale: Optional[Union[float, str]] = None,
        cast_model_outputs: Optional[torch.dtype] = None,
        num_losses: Optional[int] = 1,
        verbosity: Optional[int] = 1,
        min_loss_scale: Optional[float] = None,
        max_loss_scale: Optional[float] = 2.0**24,
    ) -> Tuple:
        """
        Configure automatic mixed precision for your models and optimizers using NVIDIA's Apex
        PyTorch extension. Note that details for apex.amp are handled automatically within
        Determined after this call.

        This function must be called **after** you have finished constructing your models and
        optimizers with :meth:`wrap_model` and :meth:`wrap_optimizer`.

        This function has the same arguments as
        `apex.amp.initialize <https://nvidia.github.io/apex/amp.html#apex.amp.initialize>`_.

        .. warning::
            When using distributed training and automatic mixed precision,
            we only support ``num_losses=1`` and calling backward on the loss once.

        Arguments:
            models (``torch.nn.Module`` or list of ``torch.nn.Module`` s):  Model(s) to modify/cast.
            optimizers (``torch.optim.Optimizer`` or list of ``torch.optim.Optimizer`` s):
                Optimizers to modify/cast. REQUIRED for training.
            enabled (bool, optional, default=True):  If False, renders all Amp calls no-ops,
                so your script should run as if Amp were not present.
            opt_level (str, optional, default="O1"):  Pure or mixed precision optimization level.
                Accepted values are "O0", "O1", "O2", and "O3", explained in detail above.
            cast_model_type (``torch.dtype``, optional, default=None):  Optional property override,
                see above.
            patch_torch_functions (bool, optional, default=None):  Optional property override.
            keep_batchnorm_fp32 (bool or str, optional, default=None):  Optional property override.
                If passed as a string, must be the string "True" or "False".
            master_weights (bool, optional, default=None):  Optional property override.
            loss_scale (float or str, optional, default=None):  Optional property override.
                If passed as a string, must be a string representing a number, e.g., "128.0",
                or the string "dynamic".
            cast_model_outputs (torch.dtype, optional, default=None):  Option to ensure that
                the outputs of your model are always cast to a particular type regardless of
                ``opt_level``.
            num_losses (int, optional, default=1):  Option to tell Amp in advance how many
                losses/backward passes you plan to use.  When used in conjunction with the
                ``loss_id`` argument to ``amp.scale_loss``, enables Amp to use a different
                loss scale per loss/backward pass, which can improve stability.
                If ``num_losses`` is left to 1, Amp will still support multiple losses/backward
                passes, but use a single global loss scale for all of them.
            verbosity (int, default=1):  Set to 0 to suppress Amp-related output.
            min_loss_scale (float, default=None):  Sets a floor for the loss scale values that
                can be chosen by dynamic loss scaling.  The default value of None means that no
                floor is imposed. If dynamic loss scaling is not used, `min_loss_scale` is ignored.
            max_loss_scale (float, default=2.**24):  Sets a ceiling for the loss scale values
                that can be chosen by dynamic loss scaling.  If dynamic loss scaling is not used,
                `max_loss_scale` is ignored.

        Returns:
            Model(s) and optimizer(s) modified according to the ``opt_level``.
            If the ``models`` or ``optimizers`` args were lists, the corresponding return
            values will also be lists.
        """
        if not self.env.managed_training:
            return models, optimizers

        check.is_none(self._scaler, "Do not mix APEX with PyTorch AMP")

        check.false(self._use_apex,
                    "Please only call configure_apex_amp once.")
        if self.distributed.size > 1:
            check.eq(
                num_losses,
                1,
                "When using parallel/distributed training, "
                "Determined only supports configure_apex_amp with num_losses = 1",
            )

        self._use_apex = True

        if self.distributed.size > 1:
            check.eq(
                self._aggregation_frequency,
                1,
                "Mixed precision training (AMP) is not supported with "
                "aggregation frequency > 1.",
            )

        check.true(
            torch.cuda.is_available(),
            "Mixed precision training (AMP) is supported only on GPU slots.",
        )

        if self._distributed_backend.use_torch():
            # We need to get the pre-wrapped input models to initialize APEX, since
            # apex.amp.initialize must be applied to the underlying modules rather than
            # the DDP-wrapped ones.
            if isinstance(models, list):
                models = [
                    self._wrapped_models[wrapped_model]
                    for wrapped_model in models
                ]
            else:
                models = self._wrapped_models[models]

        logging.info(
            f"Enabling mixed precision training with opt_level: {opt_level}.")
        models, optimizers = apex.amp.initialize(
            models=models,
            optimizers=optimizers,
            enabled=enabled,
            opt_level=opt_level,
            cast_model_type=cast_model_type,
            patch_torch_functions=patch_torch_functions,
            keep_batchnorm_fp32=keep_batchnorm_fp32,
            master_weights=master_weights,
            loss_scale=loss_scale,
            cast_model_outputs=cast_model_outputs,
            num_losses=num_losses,
            min_loss_scale=min_loss_scale,
            max_loss_scale=max_loss_scale,
            verbosity=verbosity if self.distributed.get_rank() == 0
            or self.env.experiment_config.debug_enabled() else 0,
        )

        if not isinstance(models, list):
            self.models = [models]

        if self.distributed.size > 1 and self._distributed_backend.use_torch():
            # If Torch DDP is in use, re-wrap the models
            self.models = [
                self._PyTorchDistributedDataParallel(model)
                for model in self.models
            ]

        if not isinstance(optimizers, list):
            self.optimizers = [optimizers]
        return models, optimizers
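
A hedged sketch of the call order the docstring requires: wrap the model and optimizer first, then pass both through configure_apex_amp. The model, optimizer, and opt_level below are illustrative, and context is assumed to be a PyTorchTrialContext.

import torch

def build_apex_model_and_optimizer(context):
    model = context.wrap_model(torch.nn.Linear(10, 1))
    optimizer = context.wrap_optimizer(
        torch.optim.SGD(model.parameters(), lr=0.1)
    )
    # APEX must be configured after wrap_model/wrap_optimizer and before training.
    model, optimizer = context.configure_apex_amp(
        models=model,
        optimizers=optimizer,
        opt_level="O1",
    )
    return model, optimizer
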
Example 29
 def _check_if_trial_supports_configurations(self,
                                             env: det.EnvContext) -> None:
     if env.experiment_config.averaging_training_metrics_enabled():
         check.true(self.supports_averaging_training_metrics())
Example 30
    def _compute_validation_metrics(self) -> workload.Response:
        self.context.reset_reducers()
        # Set the behavior of certain layers (e.g., dropout) that are
        # different between training and inference.
        for model in self.context.models:
            model.eval()

        for callback in self.callbacks.values():
            if util.is_overridden(callback.on_validation_step_start,
                                  pytorch.PyTorchCallback):
                logging.warning("on_validation_step_start is now deprecated, "
                                "please use on_validation_start instead")
                callback.on_validation_step_start()

        for callback in self.callbacks.values():
            callback.on_validation_start()

        num_inputs = 0
        metrics = {}  # type: Dict[str, Any]

        if self._evaluate_batch_defined():
            keys = None
            batch_metrics = []

            self.validation_loader = cast(torch.utils.data.DataLoader,
                                          self.validation_loader)
            check.gt(len(self.validation_loader), 0)
            for callback in self.callbacks.values():
                callback.on_validation_epoch_start()
            for idx, batch in enumerate(self.validation_loader):
                batch = self.context.to_device(batch)
                num_inputs += self.trial.get_batch_length(batch)

                if has_param(self.trial.evaluate_batch, "batch_idx", 2):
                    vld_metrics = self.trial.evaluate_batch(batch=batch,
                                                            batch_idx=idx)
                else:
                    vld_metrics = self.trial.evaluate_batch(
                        batch=batch)  # type: ignore
                # Verify validation metric names are the same across batches.
                if keys is None:
                    keys = vld_metrics.keys()
                else:
                    check.eq(
                        keys,
                        vld_metrics.keys(),
                        "Validation metric names must match across all batches of data.",
                    )
                check.is_instance(
                    vld_metrics,
                    dict,
                    "validation_metrics() must return a "
                    "dictionary of string names to Tensor "
                    "metrics",
                )
                # TODO: For performance perform -> cpu() only at the end of validation.
                batch_metrics.append(
                    self._convert_metrics_to_numpy(vld_metrics))
                if self.env.test_mode:
                    break

            for callback in self.callbacks.values():
                callback.on_validation_epoch_end(batch_metrics)

            metrics = self._reduce_metrics(
                batch_metrics=batch_metrics,
                keys=keys,
                metrics_reducers=self._prepare_metrics_reducers(keys=keys),
            )

            if self.hvd_config.use:
                num_inputs *= hvd.size()

        else:
            check.true(self._evaluate_full_dataset_defined())
            self.validation_loader = cast(torch.utils.data.DataLoader,
                                          self.validation_loader)
            if self.is_chief:
                metrics = self.trial.evaluate_full_dataset(
                    data_loader=self.validation_loader)

                check.is_instance(
                    metrics, dict,
                    f"eval() must return a dictionary, got {type(metrics)}.")

                metrics = self._convert_metrics_to_numpy(metrics)
                num_inputs = self.context.get_per_slot_batch_size() * len(
                    self.validation_loader)

        metrics.update(
            self._convert_metrics_to_numpy(
                self.context.reduce_metrics(for_training=False)))

        if self.hvd_config.use and any(
            util.is_overridden(c.on_validation_end, pytorch.PyTorchCallback)
            or util.is_overridden(c.on_validation_step_end, pytorch.PyTorchCallback)
            for c in self.callbacks.values()
        ):
            logging.debug(
                "Broadcasting metrics to all worker processes to execute a "
                "validation step end callback")
            metrics = hvd.broadcast_object(metrics, root_rank=0)

        for callback in self.callbacks.values():
            if util.is_overridden(callback.on_validation_step_end,
                                  pytorch.PyTorchCallback):
                logging.warning(
                    "on_validation_step_end is now deprecated, please use on_validation_end instead"
                )
                callback.on_validation_step_end(metrics)

        for callback in self.callbacks.values():
            callback.on_validation_end(metrics)

        if not self.is_chief:
            return workload.Skipped()

        return {"num_inputs": num_inputs, "validation_metrics": metrics}