Example #1
    def _compute_validation_metrics(self) -> workload.Response:
        metrics = self._launch_evaluate()
        num_inputs = self.multiplexer.get_test_inputs()

        if self.hvd_config.use:
            # Use a global ZMQ barrier here because we have observed cases where hvd.allreduce
            # may hang when called minutes apart by different workers which may happen if
            # workers complete evaluation at different speeds.
            self._global_barrier()

            # Sum (average=False) the per-worker input counts into a global total.
            num_inputs = hvd.allreduce(num_inputs,
                                       average=False,
                                       name="validation_num_inputs")
            if isinstance(num_inputs, EagerTensor):
                # Horovod will promote an int to a tensor in eager mode.
                num_inputs = num_inputs.numpy()

        # Combine the metric logs gathered on each worker.
        metrics = self._allreduce_logs(metrics)
        check.gt(len(metrics), 0)

        self.multiplexer._test_end(metrics)

        if not self.is_chief:
            return workload.Skipped()

        return {"num_inputs": num_inputs, "validation_metrics": metrics}
Example #2
    def compute_validation_metrics(self) -> workload.Response:
        (
            validation_data,
            validation_steps,
        ) = self._validation_input_manager.get_validation_input_and_num_batches()

        metrics_values = self.model.evaluate(validation_data,
                                             steps=validation_steps,
                                             verbose=0)

        # If the model was compiled with metrics=None, metrics_values will be a single value.
        # Use a list so the allreduce loop below can assign the reduced values back in place.
        if not isinstance(metrics_values, (tuple, list)):
            metrics_values = [metrics_values]

        if self.hvd_config.use:
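            # Average each metric across workers so every process reports the same values.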
            for index, metric_value in enumerate(metrics_values):
                metrics_values[index] = np.array(hvd.allreduce(metric_value))

        num_inputs = self._validation_input_manager.stop_validation_input_and_get_num_inputs()

        if not self.is_chief:
            return workload.Skipped()

        metrics = make_logs(self.model, {},
                            metrics_values,
                            ModeKeys.TEST,
                            prefix="val_")
        check.gt(len(metrics), 0)

        return {"num_inputs": num_inputs, "validation_metrics": metrics}
Example #3
    def __init__(
        self,
        sequence: tf.keras.utils.Sequence,
        sampler: _Sampler,
        repeat: bool,
        workers: int,
        max_queue_size: int,
    ):
        self.sequence = sequence
        self.sampler = sampler
        self.repeat = repeat
        self.max_queue_size = max_queue_size
        check.gt(max_queue_size, 0, "max_queue_size must be greater than zero")

        # Coordination logic.
        self.order = 0
        self.requested = collections.deque()  # type: Deque[int]
        self.received = {}  # type: Dict[int, Any]
        self.started = False
        self.stopped = False
        self.index_iter = None  # type: Optional[Iterator]

        # Interthread/interprocess communications.
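        # queue_class() and worker_class() return the queue/worker types used below
        # (e.g. threading- or multiprocessing-based).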
        self.queries = self.queue_class()()
        self.answers = self.queue_class()()

        self.workers = [
            self.worker_class()(target=_worker,
                                args=(self.sequence, self.queries,
                                      self.answers)) for _ in range(workers)
        ]
Example #4
    def __init__(self, batch_sampler: torch.utils.data.BatchSampler,
                 num_workers: int, rank: int) -> None:
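        # Shards the batch sampler's output across num_workers workers; rank selects
        # this worker's share.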
        check.gt(rank, -1, "rank must be non-negative")
        check.gt(num_workers, 0, "num_workers must be positive")
        check.lt(rank, num_workers, "rank must be less than num_workers")

        self.batch_sampler = batch_sampler
        self.num_workers = num_workers
        self.rank = rank
Example #5
    def _init_device(self) -> None:
        self.n_gpus = len(self.env.container_gpus)
        if self.hvd_config.use:
            check.gt(self.n_gpus, 0)
            # We launch a horovod process per GPU. Each process
            # needs to bind to a unique GPU.
            self.device = torch.device(hvd.local_rank())
            torch.cuda.set_device(self.device)
        elif self.n_gpus > 0:
            self.device = torch.device("cuda", 0)
        else:
            self.device = torch.device("cpu")
        check.is_not_none(self.device)
Example #6
    def _send_recv_workload(self, wkld: workload.Workload,
                            args: List[Any]) -> workload.Response:
        # Broadcast every workload to every worker on this machine.
        self.broadcast_server.broadcast((wkld, args))

        if wkld.kind == workload.Workload.Kind.TERMINATE:
            # Do not perform health checks once workers have been instructed to terminate.
            self._worker_process_ids = []

        try:
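            # Poll worker health while gathering so a dead worker cannot hang us indefinitely.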
            responses, exception_received = self.broadcast_server.gather_with_polling(
                self._health_check)
        except det.errors.WorkerError:
            if wkld.kind == workload.Workload.Kind.TERMINATE:
                return {}
            raise

        if exception_received:
            raise det.errors.WorkerError("Training process died.")

        # Find the response from the chief worker for the trial (the only non-SkippedWorkload). The
        # chief may report to another container, in which case we will only have SkippedWorkloads.
        chief_worker_response = None  # type: Optional[workload.Metrics]
        for response in responses:
            if isinstance(response, workload.Skipped):
                continue
            # Any other response must be a Dict[str, Any]-like object.
            check.is_instance(
                response, dict,
                f"Received non-metrics object from worker: {response}")
            # There should only be one chief response.
            check.is_none(chief_worker_response,
                          "Received multiple non-SkippedWorkload messages.")
            chief_worker_response = cast(Dict[str, Any], response)

        # Confirm that if we did not see a chief response, then we are not the chief machine.
        if chief_worker_response is None:
            check.gt(
                self.rendezvous_info.get_rank(),
                0,
                "Received SkippedWorkload message from chief worker.",
            )

        if chief_worker_response is None:
            return workload.Skipped()
        return chief_worker_response
Example #7
    def _compute_validation_metrics(self) -> workload.Response:
        self.context.experimental.reset_reducers()
        # Set the behavior of certain layers (e.g., dropout) that are
        # different between training and inference.
        for model in self.context.models:
            model.eval()

        for callback in self.callbacks.values():
            logging.warning(
                "on_validation_step_start is now deprecated, please use on_validation_start instead"
            )
            callback.on_validation_step_start()

        for callback in self.callbacks.values():
            callback.on_validation_start()

        num_inputs = 0
        metrics = {}  # type: Dict[str, Any]

        if self._evaluate_batch_defined():
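            # The trial implements evaluate_batch(): iterate the loader and reduce per-batch metrics.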
            keys = None
            batch_metrics = []

            self.validation_loader = cast(torch.utils.data.DataLoader,
                                          self.validation_loader)
            check.gt(len(self.validation_loader), 0)
            for batch in self.validation_loader:
                batch = self.context.to_device(batch)
                num_inputs += pytorch.data_length(batch)

                vld_metrics = self.trial.evaluate_batch(batch=batch)
                check.is_instance(
                    vld_metrics,
                    dict,
                    "evaluate_batch() must return a "
                    "dictionary of string names to Tensor "
                    "metrics",
                )
                # Verify validation metric names are the same across batches.
                if keys is None:
                    keys = vld_metrics.keys()
                else:
                    check.eq(
                        keys,
                        vld_metrics.keys(),
                        "Validation metric names must match across all batches of data.",
                    )
                # TODO: For performance, move metrics to the CPU only once at the end of validation.
                batch_metrics.append(
                    self._convert_metrics_to_numpy(vld_metrics))
                if self.env.test_mode:
                    break

            metrics = self._reduce_metrics(
                batch_metrics=batch_metrics,
                keys=keys,
                metrics_reducers=self._prepare_metrics_reducers(keys=keys),
            )

            if self.hvd_config.use:
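                # num_inputs counted only this worker's shard; scale to the global total.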
                num_inputs *= hvd.size()

        else:
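            # The trial implements evaluate_full_dataset(): only the chief evaluates.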
            check.true(self._evaluate_full_dataset_defined())
            self.validation_loader = cast(torch.utils.data.DataLoader,
                                          self.validation_loader)
            if self.is_chief:
                metrics = self.trial.evaluate_full_dataset(
                    data_loader=self.validation_loader)

                check.is_instance(
                    metrics, dict,
                    f"eval() must return a dictionary, got {type(metrics)}.")

                metrics = self._convert_metrics_to_numpy(metrics)
                num_inputs = self.context.get_per_slot_batch_size() * len(
                    self.validation_loader)

        metrics.update(
            self._convert_metrics_to_numpy(
                self.context.experimental.reduce_metrics(for_training=False)))

        if self.hvd_config.use and any(
            util.is_overridden(c.on_validation_end, pytorch.PyTorchCallback)
            or util.is_overridden(c.on_validation_step_end, pytorch.PyTorchCallback)
            for c in self.callbacks.values()
        ):
            logging.debug(
                "Broadcasting metrics to all worker processes to execute a "
                "validation step end callback")
            metrics = hvd.broadcast_object(metrics, root_rank=0)

        for callback in self.callbacks.values():
            logging.warning(
                "on_validation_step_end is now deprecated, please use on_validation_end instead"
            )
            callback.on_validation_step_end(metrics)

        for callback in self.callbacks.values():
            callback.on_validation_end(metrics)

        if not self.is_chief:
            return workload.Skipped()

        return {"num_inputs": num_inputs, "validation_metrics": metrics}
    def _train_for_step(self, step_id: int, num_batches: int,
                        total_batches_processed: int) -> workload.Response:
        check.gt(step_id, 0)
        self.context.experimental.reset_reducers()

        # Set the behavior of certain layers (e.g., dropout) that are different
        # between training and inference.
        for model in self.context.models:
            model.train()

        start = total_batches_processed
        end = start + num_batches

        per_batch_metrics = []  # type: List[Dict]
        num_inputs = 0

        for batch_idx in range(start, end):
            batch = next(self.training_iterator)

            num_inputs += self.trial._records_in_batch(batch)
            batch = self.trial._batch_to_device(batch, self.context)

            self.context._current_batch_idx = batch_idx
            self.context._loss_ids = {}
            tr_metrics = self.trial.train_batch(
                batch=batch,
                epoch_idx=self.get_epoch_idx(batch_idx),
                batch_idx=batch_idx,
            )
            if isinstance(tr_metrics, torch.Tensor):
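                # A bare tensor return is treated as the loss.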
                tr_metrics = {"loss": tr_metrics}
            check.is_instance(
                tr_metrics,
                dict,
                "train_batch() must return a dictionary "
                f"mapping string names to Tensor metrics, got {type(tr_metrics)}",
            )

            # Step learning rate of a pytorch.LRScheduler.
            for lr_scheduler in self.context.lr_schedulers:
                self._auto_step_lr_scheduler_per_batch(batch_idx, lr_scheduler)

            for name, metric in tr_metrics.items():
                # Convert PyTorch metric values to NumPy, so that
                # `det.util.encode_json` handles them properly without
                # needing a dependency on PyTorch.
                if isinstance(metric, torch.Tensor):
                    metric = metric.cpu().detach().numpy()
                tr_metrics[name] = metric

            per_batch_metrics.append(tr_metrics)

        # Aggregate and reduce training metrics from all the training processes.
        if self.hvd_config.use and self.hvd_config.average_training_metrics:
            per_batch_metrics = self._average_training_metrics(
                per_batch_metrics)
        if self.hvd_config.use:
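            # Scale the shard-local input count to the cluster-wide total.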
            num_inputs *= hvd.size()
        metrics = det.util.make_metrics(num_inputs, per_batch_metrics)

        # Ignore batch_metrics entirely for custom reducers; there's no guarantee that per-batch
        # metrics are even logical for a custom reducer.
        metrics["avg_metrics"].update(
            self._convert_metrics_to_numpy(
                self.context.experimental.reduce_metrics(for_training=True)))

        if not self.is_chief:
            # The training metrics are reported only in the chief process.
            return workload.Skipped()

        logging.debug(
            f"Done training step: {num_inputs} records in {num_batches} batches."
        )

        return metrics
Example #9
    def _train_for_step(self, step_id: int, num_batches: int,
                        total_batches_processed: int) -> workload.Response:
        check.gt(step_id, 0)

        # Set the behavior of certain layers (e.g., dropout) that are different
        # between training and inference.
        for model in self.context.models:
            model.train()

        start = total_batches_processed
        end = start + num_batches

        per_batch_metrics = []  # type: List[Dict]
        num_inputs = 0

        for batch_idx in range(start, end):
            batch = next(self.training_iterator)
            num_inputs += data_length(batch)
            batch = self.context._to_device(batch)

            self.context._current_batch_idx = batch_idx
            self.context._loss_ids = {}
            tr_metrics = self.trial.train_batch(
                batch=batch,
                model=self.context.models[0],
                epoch_idx=self.get_epoch_idx(batch_idx),
                batch_idx=batch_idx,
            )
            if isinstance(tr_metrics, torch.Tensor):
                tr_metrics = {"loss": tr_metrics}
            check.is_instance(
                tr_metrics,
                dict,
                "train_batch() must return a dictionary "
                f"mapping string names to Tensor metrics, got {type(tr_metrics)}",
            )
            check.is_in("loss", tr_metrics.keys(),
                        'Please include "loss" in you training metrics.')

            # Step learning rate of an LRScheduler.
            for lr_scheduler in self.context.lr_schedulers:
                self._auto_step_lr_scheduler_per_batch(batch_idx, lr_scheduler)

            for name, metric in tr_metrics.items():
                # Convert PyTorch metric values to NumPy, so that
                # `det.util.encode_json` handles them properly without
                # needing a dependency on PyTorch.
                if isinstance(metric, torch.Tensor):
                    metric = metric.cpu().detach().numpy()
                tr_metrics[name] = metric

            check.is_in("loss", tr_metrics,
                        'Please include "loss" in your training metrics.')
            per_batch_metrics.append(tr_metrics)

        # Aggregate and reduce training metrics from all the training processes.
        if self.hvd_config.use and self.hvd_config.average_training_metrics:
            per_batch_metrics = self._average_training_metrics(
                per_batch_metrics)
        if self.hvd_config.use:
            num_inputs *= hvd.size()
        metrics = det.util.make_metrics(num_inputs, per_batch_metrics)

        if not self.is_chief:
            # The training metrics are reported only in the chief process.
            return workload.Skipped()

        logging.debug(
            f"Done training step: {num_inputs} records in {num_batches} batches."
        )

        return metrics
Example #10
    def _compute_validation_metrics(self) -> workload.Response:
        # Set the behavior of certain layers (e.g., dropout) that are
        # different between training and inference.
        self.model.eval()
        num_inputs = 0
        metrics = {}  # type: Dict[str, Any]

        if self._evaluate_batch_defined():
            keys = None
            batch_metrics = []

            self.validation_loader = cast(torch.utils.data.DataLoader,
                                          self.validation_loader)
            check.gt(len(self.validation_loader), 0)
            for batch in self.validation_loader:
                batch = self._to_device(batch)
                num_inputs += data_length(batch)

                vld_metrics = self.trial.evaluate_batch(batch=batch,
                                                        model=self.model)
                check.is_instance(
                    vld_metrics,
                    dict,
                    "evaluate_batch() must return a "
                    "dictionary of string names to Tensor "
                    "metrics",
                )
                # Verify validation metric names are the same across batches.
                if keys is None:
                    keys = vld_metrics.keys()
                else:
                    check.eq(
                        keys,
                        vld_metrics.keys(),
                        "Validation metric names must match across all batches of data.",
                    )
                # TODO: For performance, move metrics to the CPU only once at the end of validation.
                batch_metrics.append(
                    self._convert_metrics_to_numpy(vld_metrics))

            keys = cast(Any, keys)
            metrics = self._reduce_metrics(
                batch_metrics=batch_metrics,
                keys=keys,
                metrics_reducers=self._prepare_metrics_reducers(keys=keys),
            )

            if self.hvd_config.use:
                num_inputs *= hvd.size()

        else:
            check.true(self._evaluate_full_dataset_defined())
            self.validation_loader = cast(torch.utils.data.DataLoader,
                                          self.validation_loader)
            if self.is_chief:
                metrics = self.trial.evaluate_full_dataset(
                    data_loader=self.validation_loader, model=self.model)

                check.is_instance(
                    metrics, dict,
                    f"eval() must return a dictionary, got {type(metrics)}.")

                metrics = self._convert_metrics_to_numpy(metrics)
                num_inputs = self.context.get_per_slot_batch_size() * len(
                    self.validation_loader)

        if not self.is_chief:
            return workload.Skipped()

        return {"num_inputs": num_inputs, "validation_metrics": metrics}
Example #11
    def _train_for_step(self, step_id: int,
                        batches_per_step: int) -> workload.Response:
        check.gt(step_id, 0)

        step_idx = step_id - 1
        start = step_idx * batches_per_step
        end = start + batches_per_step

        # Set the behavior of certain layers (e.g., dropout) that are different
        # between training and inference.
        self.model.train()

        per_batch_metrics = []  # type: List[Dict]
        num_inputs = 0

        for batch_idx in range(start, end):
            batch = next(self.training_iterator)
            num_inputs += data_length(batch)

            batch = self._to_device(batch)
            # Forward pass.
            tr_metrics = self.trial.train_batch(
                batch=batch,
                model=self.model,
                epoch_idx=self.get_epoch_idx(batch_idx),
                batch_idx=batch_idx,
            )

            if isinstance(tr_metrics, torch.Tensor):
                tr_metrics = {"loss": tr_metrics}

            check.is_instance(
                tr_metrics,
                dict,
                "train_batch() must return a dictionary "
                "mapping string names to Tensor metrics, got {type(tr_metrics)}",
            )
            check.is_in("loss", tr_metrics.keys(),
                        'Please include "loss" in you training metrics.')

            # Backwards pass.
            loss = tr_metrics["loss"]
            communicate_and_update = (
                batch_idx + 1) % self.hvd_config.aggregation_frequency == 0
            if self.use_amp():
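                # apex scales the loss for mixed precision; gradients are unscaled
                # when the context manager exits.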
                with apex.amp.scale_loss(loss, self.optimizer) as scaled_loss:
                    scaled_loss.backward()
                    if self.hvd_config.use and communicate_and_update:
                        self.optimizer.synchronize()
            else:
                loss.backward()

            if communicate_and_update:
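                # Gradients from aggregation_frequency batches have accumulated; apply the update.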
                parameters = (self.model.parameters() if not self.use_amp()
                              else apex.amp.master_params(self.optimizer))

                if self.hvd_config.average_aggregated_gradients:
                    self._average_gradients(
                        parameters=parameters,
                        divisor=self.hvd_config.aggregation_frequency)

                self._clip_grads(parameters)

                if self.hvd_config.use and self.use_amp():
                    with self.optimizer.skip_synchronize():
                        self.optimizer.step()
                else:
                    self.optimizer.step()
                self.optimizer.zero_grad()

                if self.lr_helper.should_step_lr(
                        batches_completed=batch_idx + 1,
                        epoch_length=len(self.training_loader),
                        aggregation_frequency=self.hvd_config.aggregation_frequency,
                ):
                    self.lr_helper.step()

            for name, metric in tr_metrics.items():
                # Convert PyTorch metric values to NumPy, so that
                # `det.util.encode_json` handles them properly without
                # needing a dependency on PyTorch.
                if isinstance(metric, torch.Tensor):
                    metric = metric.cpu().detach().numpy()
                tr_metrics[name] = metric

            check.is_in("loss", tr_metrics,
                        'Please include "loss" in your training metrics.')
            per_batch_metrics.append(tr_metrics)

        if self.hvd_config.use and self.hvd_config.average_training_metrics:
            per_batch_metrics = self._average_training_metrics(
                per_batch_metrics)

        if not self.is_chief:
            return workload.Skipped()

        if self.hvd_config.use:
            num_inputs *= hvd.size()

        logging.debug(
            f"Done training step: {num_inputs} records in {batches_per_step} batches."
        )
        return det.util.make_metrics(num_inputs, per_batch_metrics)
Example #12
    def _train_for_step(self, step_id: int, batches_per_step: int) -> workload.Response:
        check.gt(step_id, 0)

        # Set the behavior of certain layers (e.g., dropout) that are different
        # between training and inference.
        self.context.model.train()

        for callback in self.callbacks.values():
            callback.on_train_step_start(step_id)

        step_idx = step_id - 1
        start = step_idx * batches_per_step
        end = start + batches_per_step

        per_batch_metrics = []  # type: List[Dict]
        num_inputs = 0

        for batch_idx in range(start, end):
            batch = next(self.training_iterator)
            num_inputs += data_length(batch)

            batch = self._to_device(batch)
            # Forward pass.
            tr_metrics = self.trial.train_batch(
                batch=batch,
                model=self.context.model,
                epoch_idx=self.get_epoch_idx(batch_idx),
                batch_idx=batch_idx,
            )

            if isinstance(tr_metrics, torch.Tensor):
                tr_metrics = {"loss": tr_metrics}

            check.is_instance(
                tr_metrics,
                dict,
                "train_batch() must return a dictionary "
                "mapping string names to Tensor metrics, got {type(tr_metrics)}",
            )
            check.is_in("loss", tr_metrics.keys(), 'Please include "loss" in you training metrics.')

            # Backwards pass.
            loss = tr_metrics["loss"]
            communicate_and_update = (batch_idx + 1) % self.hvd_config.aggregation_frequency == 0
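            # Synchronize and step the optimizer only every aggregation_frequency batches.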
            if self.use_amp():
                with apex.amp.scale_loss(loss, self.context.optimizer) as scaled_loss:
                    scaled_loss.backward()
                    if self.hvd_config.use and communicate_and_update:
                        # When using horovod, we need to finish communicating gradient
                        # updates before they are unscaled, which happens when we exit
                        # this context manager.
                        self.context.optimizer.synchronize()
            else:
                loss.backward()

                # Communication needs to be synchronized so that it is completed
                # before we apply gradient clipping and `step()`.
                if communicate_and_update and self.hvd_config.use:
                    self.context.optimizer.synchronize()

            if communicate_and_update:
                parameters = (
                    self.context.model.parameters()
                    if not self.use_amp()
                    else apex.amp.master_params(self.context.optimizer)
                )

                if self.hvd_config.average_aggregated_gradients:
                    self._average_gradients(
                        parameters=parameters, divisor=self.hvd_config.aggregation_frequency
                    )

                # TODO: Remove this check in v0.12.8.
                check.false(
                    self.env.hparams.get("clip_grad_l2_norm", None)
                    or self.env.hparams.get("clip_grad_val", None),
                    "Please specify gradient clipping via callbacks.",
                )

                for callback in self.callbacks.values():
                    callback.on_before_optimizer_step(parameters)

                if self.hvd_config.use:
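                    # Gradients were already synchronized above; skip a redundant sync inside step().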
                    with self.context.optimizer.skip_synchronize():
                        self.context.optimizer.step()
                else:
                    self.context.optimizer.step()
                self.context.optimizer.zero_grad()

                # Step learning rate of an LRScheduler.
                if self.context.lr_scheduler is not None:
                    self._auto_step_lr_scheduler_per_batch(batch_idx, self.context.lr_scheduler)

            for name, metric in tr_metrics.items():
                # Convert PyTorch metric values to NumPy, so that
                # `det.util.encode_json` handles them properly without
                # needing a dependency on PyTorch.
                if isinstance(metric, torch.Tensor):
                    metric = metric.cpu().detach().numpy()
                tr_metrics[name] = metric

            check.is_in("loss", tr_metrics, 'Please include "loss" in your training metrics.')
            per_batch_metrics.append(tr_metrics)

        if self.hvd_config.use and self.hvd_config.average_training_metrics:
            per_batch_metrics = self._average_training_metrics(per_batch_metrics)

        if self.hvd_config.use:
            num_inputs *= hvd.size()

        metrics = det.util.make_metrics(num_inputs, per_batch_metrics)

        for callback in self.callbacks.values():
            callback.on_train_step_end(step_id, metrics)

        if not self.is_chief:
            return workload.Skipped()

        logging.debug(f"Done training step: {num_inputs} records in {batches_per_step} batches.")

        return metrics