def _compute_validation_metrics(self) -> workload.Response:
    metrics = self._launch_evaluate()
    num_inputs = self.multiplexer.get_test_inputs()

    if self.hvd_config.use:
        # Use a global ZMQ barrier here because we have observed cases where hvd.allreduce
        # may hang when called minutes apart by different workers which may happen if
        # workers complete evaluation at different speeds.
        self._global_barrier()

        num_inputs = hvd.allreduce(num_inputs, average=False, name="validation_num_inputs")
        if isinstance(num_inputs, EagerTensor):
            # Horovod will promote an int to a tensor in eager mode.
            num_inputs = num_inputs.numpy()

    metrics = self._allreduce_logs(metrics)
    check.gt(len(metrics), 0)

    self.multiplexer._test_end(metrics)

    if not self.is_chief:
        return workload.Skipped()

    return {"num_inputs": num_inputs, "validation_metrics": metrics}
def compute_validation_metrics(self) -> workload.Response:
    (
        validation_data,
        validation_steps,
    ) = self._validation_input_manager.get_validation_input_and_num_batches()

    metrics_values = self.model.evaluate(validation_data, steps=validation_steps, verbose=0)

    # If the model was compiled with metrics=None, metrics_values will be a single value.
    # Normalize to a list so the per-metric allreduce below can assign in place.
    if not isinstance(metrics_values, (tuple, list)):
        metrics_values = [metrics_values]
    else:
        metrics_values = list(metrics_values)

    if self.hvd_config.use:
        for index, metric_value in enumerate(metrics_values):
            metrics_values[index] = np.array(hvd.allreduce(metric_value))

    num_inputs = self._validation_input_manager.stop_validation_input_and_get_num_inputs()

    if not self.is_chief:
        return workload.Skipped()

    metrics = make_logs(self.model, {}, metrics_values, ModeKeys.TEST, prefix="val_")
    check.gt(len(metrics), 0)

    return {"num_inputs": num_inputs, "validation_metrics": metrics}
def __init__(
    self,
    sequence: tf.keras.utils.Sequence,
    sampler: _Sampler,
    repeat: bool,
    workers: int,
    max_queue_size: int,
):
    self.sequence = sequence
    self.sampler = sampler
    self.repeat = repeat
    self.max_queue_size = max_queue_size
    check.gt(max_queue_size, 0, "max_queue_size must be greater than zero")

    # Coordination logic.
    self.order = 0
    self.requested = collections.deque()  # type: Deque[int]
    self.received = {}  # type: Dict[int, Any]
    self.started = False
    self.stopped = False
    self.index_iter = None  # type: Optional[Iterator]

    # Interthread/interprocess communications.
    self.queries = self.queue_class()()
    self.answers = self.queue_class()()
    self.workers = [
        self.worker_class()(target=_worker, args=(self.sequence, self.queries, self.answers))
        for _ in range(workers)
    ]
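# The constructor above hands each worker a `_worker` target together with the shared
# `queries` and `answers` queues. Below is a minimal, hypothetical sketch of what such a
# `_worker` loop might look like; the None shutdown sentinel and the (index, item) tuple
# layout are assumptions for illustration, not the actual protocol used by `_worker`.
from typing import Any

import tensorflow as tf


def _worker_sketch(sequence: tf.keras.utils.Sequence, queries: Any, answers: Any) -> None:
    while True:
        index = queries.get()
        if index is None:
            # Hypothetical shutdown sentinel sent by the parent process.
            break
        # Evaluate the Sequence at the requested index and send the result back.
        answers.put((index, sequence[index]))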
def __init__(
    self, batch_sampler: torch.utils.data.BatchSampler, num_workers: int, rank: int
) -> None:
    check.gt(rank, -1, "rank must be non-negative")
    check.gt(num_workers, 0, "num_workers must be positive")
    check.lt(rank, num_workers, "rank must be less than num_workers")
    self.batch_sampler = batch_sampler
    self.num_workers = num_workers
    self.rank = rank
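# For illustration only: a wrapper initialized like the one above typically shards the
# wrapped BatchSampler across workers so that worker `rank` consumes every
# `num_workers`-th batch. The class below is a hypothetical sketch of that behavior,
# not the library's actual implementation.
from typing import Iterator, List

import torch


class _ShardedBatchSamplerSketch:
    def __init__(
        self, batch_sampler: torch.utils.data.BatchSampler, num_workers: int, rank: int
    ) -> None:
        self.batch_sampler = batch_sampler
        self.num_workers = num_workers
        self.rank = rank

    def __iter__(self) -> Iterator[List[int]]:
        for i, batch in enumerate(self.batch_sampler):
            # Round-robin assignment: batch i is handled by worker (i % num_workers).
            if i % self.num_workers == self.rank:
                yield batch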
def _init_device(self) -> None:
    self.n_gpus = len(self.env.container_gpus)
    if self.hvd_config.use:
        check.gt(self.n_gpus, 0)
        # We launch a horovod process per GPU. Each process
        # needs to bind to a unique GPU.
        self.device = torch.device(hvd.local_rank())
        torch.cuda.set_device(self.device)
    elif self.n_gpus > 0:
        self.device = torch.device("cuda", 0)
    else:
        self.device = torch.device("cpu")
    check.is_not_none(self.device)
def _send_recv_workload(self, wkld: workload.Workload, args: List[Any]) -> workload.Response:
    # Broadcast every workload to every worker on this machine.
    self.broadcast_server.broadcast((wkld, args))

    if wkld.kind == workload.Workload.Kind.TERMINATE:
        # Do not perform health checks once workers have been instructed to terminate.
        self._worker_process_ids = []

    try:
        responses, exception_received = self.broadcast_server.gather_with_polling(
            self._health_check
        )
    except det.errors.WorkerError:
        if wkld.kind == workload.Workload.Kind.TERMINATE:
            return {}
        raise

    if exception_received:
        raise det.errors.WorkerError("Training process died.")

    # Find the response from the chief worker for the trial (the only non-SkippedWorkload). The
    # chief may report to another container, in which case we will only have SkippedWorkloads.
    chief_worker_response = None  # type: Optional[workload.Metrics]
    for response in responses:
        if isinstance(response, workload.Skipped):
            continue
        # Any other response must be a Dict[str, Any]-like object.
        check.is_instance(
            response, dict, f"Received non-metrics object from worker: {response}"
        )
        # There should only be one chief response.
        check.is_none(chief_worker_response, "Received multiple non-SkippedWorkload messages.")
        chief_worker_response = cast(Dict[str, Any], response)

    # Confirm that if we did not see a chief response then we are not the chief machine.
    if chief_worker_response is None:
        check.gt(
            self.rendezvous_info.get_rank(),
            0,
            "Received SkippedWorkload message from chief worker.",
        )

    return workload.Skipped() if chief_worker_response is None else chief_worker_response
def _compute_validation_metrics(self) -> workload.Response:
    self.context.experimental.reset_reducers()
    # Set the behavior of certain layers (e.g., dropout) that are
    # different between training and inference.
    for model in self.context.models:
        model.eval()

    for callback in self.callbacks.values():
        logging.warning(
            "on_validation_step_start is now deprecated, please use on_validation_start instead"
        )
        callback.on_validation_step_start()

    for callback in self.callbacks.values():
        callback.on_validation_start()

    num_inputs = 0
    metrics = {}  # type: Dict[str, Any]

    if self._evaluate_batch_defined():
        keys = None
        batch_metrics = []

        self.validation_loader = cast(torch.utils.data.DataLoader, self.validation_loader)
        check.gt(len(self.validation_loader), 0)
        for batch in self.validation_loader:
            batch = self.context.to_device(batch)
            num_inputs += pytorch.data_length(batch)

            vld_metrics = self.trial.evaluate_batch(batch=batch)
            # Verify validation metric names are the same across batches.
            if keys is None:
                keys = vld_metrics.keys()
            else:
                check.eq(
                    keys,
                    vld_metrics.keys(),
                    "Validation metric names must match across all batches of data.",
                )
            check.is_instance(
                vld_metrics,
                dict,
                "validation_metrics() must return a "
                "dictionary of string names to Tensor "
                "metrics",
            )
            # TODO: For performance perform -> cpu() only at the end of validation.
            batch_metrics.append(self._convert_metrics_to_numpy(vld_metrics))
            if self.env.test_mode:
                break

        metrics = self._reduce_metrics(
            batch_metrics=batch_metrics,
            keys=keys,
            metrics_reducers=self._prepare_metrics_reducers(keys=keys),
        )

        if self.hvd_config.use:
            num_inputs *= hvd.size()

    else:
        check.true(self._evaluate_full_dataset_defined())
        self.validation_loader = cast(torch.utils.data.DataLoader, self.validation_loader)
        if self.is_chief:
            metrics = self.trial.evaluate_full_dataset(data_loader=self.validation_loader)

            check.is_instance(
                metrics, dict, f"eval() must return a dictionary, got {type(metrics)}."
            )

            metrics = self._convert_metrics_to_numpy(metrics)
            num_inputs = self.context.get_per_slot_batch_size() * len(self.validation_loader)

    metrics.update(
        self._convert_metrics_to_numpy(
            self.context.experimental.reduce_metrics(for_training=False)
        )
    )

    if self.hvd_config.use and any(
        map(
            lambda c: util.is_overridden(c.on_validation_end, pytorch.PyTorchCallback)
            or util.is_overridden(c.on_validation_step_end, pytorch.PyTorchCallback),
            self.callbacks.values(),
        )
    ):
        logging.debug(
            "Broadcasting metrics to all worker processes to execute a "
            "validation step end callback"
        )
        metrics = hvd.broadcast_object(metrics, root_rank=0)

    for callback in self.callbacks.values():
        logging.warning(
            "on_validation_step_end is now deprecated, please use on_validation_end instead"
        )
        callback.on_validation_step_end(metrics)

    for callback in self.callbacks.values():
        callback.on_validation_end(metrics)

    if not self.is_chief:
        return workload.Skipped()

    return {"num_inputs": num_inputs, "validation_metrics": metrics}
def _train_for_step(
    self, step_id: int, num_batches: int, total_batches_processed: int
) -> workload.Response:
    check.gt(step_id, 0)
    self.context.experimental.reset_reducers()

    # Set the behavior of certain layers (e.g., dropout) that are different
    # between training and inference.
    for model in self.context.models:
        model.train()

    start = total_batches_processed
    end = start + num_batches

    per_batch_metrics = []  # type: List[Dict]
    num_inputs = 0

    for batch_idx in range(start, end):
        batch = next(self.training_iterator)
        ## old code:
        # num_inputs += pytorch.data_length(batch)
        # batch = self.context.to_device(batch)
        num_inputs += self.trial._records_in_batch(batch)
        batch = self.trial._batch_to_device(batch, self.context)

        self.context._current_batch_idx = batch_idx
        self.context._loss_ids = {}

        tr_metrics = self.trial.train_batch(
            batch=batch,
            epoch_idx=self.get_epoch_idx(batch_idx),
            batch_idx=batch_idx,
        )
        if isinstance(tr_metrics, torch.Tensor):
            tr_metrics = {"loss": tr_metrics}
        check.is_instance(
            tr_metrics,
            dict,
            "train_batch() must return a dictionary "
            f"mapping string names to Tensor metrics, got {type(tr_metrics)}",
        )

        # Step learning rate of a pytorch.LRScheduler.
        for lr_scheduler in self.context.lr_schedulers:
            self._auto_step_lr_scheduler_per_batch(batch_idx, lr_scheduler)

        for name, metric in tr_metrics.items():
            # Convert PyTorch metric values to NumPy, so that
            # `det.util.encode_json` handles them properly without
            # needing a dependency on PyTorch.
            if isinstance(metric, torch.Tensor):
                metric = metric.cpu().detach().numpy()
            tr_metrics[name] = metric

        per_batch_metrics.append(tr_metrics)

    # Aggregate and reduce training metrics from all the training processes.
    if self.hvd_config.use and self.hvd_config.average_training_metrics:
        per_batch_metrics = self._average_training_metrics(per_batch_metrics)
    if self.hvd_config.use:
        num_inputs *= hvd.size()
    metrics = det.util.make_metrics(num_inputs, per_batch_metrics)

    # Ignore batch_metrics entirely for custom reducers; there's no guarantee that per-batch
    # metrics are even logical for a custom reducer.
    metrics["avg_metrics"].update(
        self._convert_metrics_to_numpy(
            self.context.experimental.reduce_metrics(for_training=True)
        )
    )

    if not self.is_chief:
        # The training metrics are reported only in the chief process.
        return workload.Skipped()

    logging.debug(f"Done training step: {num_inputs} records in {num_batches} batches.")

    return metrics
def _train_for_step(
    self, step_id: int, num_batches: int, total_batches_processed: int
) -> workload.Response:
    check.gt(step_id, 0)

    # Set the behavior of certain layers (e.g., dropout) that are different
    # between training and inference.
    for model in self.context.models:
        model.train()

    start = total_batches_processed
    end = start + num_batches

    per_batch_metrics = []  # type: List[Dict]
    num_inputs = 0

    for batch_idx in range(start, end):
        batch = next(self.training_iterator)
        num_inputs += data_length(batch)
        batch = self.context._to_device(batch)

        self.context._current_batch_idx = batch_idx
        self.context._loss_ids = {}

        tr_metrics = self.trial.train_batch(
            batch=batch,
            model=self.context.models[0],
            epoch_idx=self.get_epoch_idx(batch_idx),
            batch_idx=batch_idx,
        )
        if isinstance(tr_metrics, torch.Tensor):
            tr_metrics = {"loss": tr_metrics}
        check.is_instance(
            tr_metrics,
            dict,
            "train_batch() must return a dictionary "
            f"mapping string names to Tensor metrics, got {type(tr_metrics)}",
        )
        check.is_in("loss", tr_metrics.keys(), 'Please include "loss" in your training metrics.')

        # Step learning rate of a LRScheduler.
        for lr_scheduler in self.context.lr_schedulers:
            self._auto_step_lr_scheduler_per_batch(batch_idx, lr_scheduler)

        for name, metric in tr_metrics.items():
            # Convert PyTorch metric values to NumPy, so that
            # `det.util.encode_json` handles them properly without
            # needing a dependency on PyTorch.
            if isinstance(metric, torch.Tensor):
                metric = metric.cpu().detach().numpy()
            tr_metrics[name] = metric

        check.is_in("loss", tr_metrics, 'Please include "loss" in your training metrics.')
        per_batch_metrics.append(tr_metrics)

    # Aggregate and reduce training metrics from all the training processes.
    if self.hvd_config.use and self.hvd_config.average_training_metrics:
        per_batch_metrics = self._average_training_metrics(per_batch_metrics)
    if self.hvd_config.use:
        num_inputs *= hvd.size()
    metrics = det.util.make_metrics(num_inputs, per_batch_metrics)

    if not self.is_chief:
        # The training metrics are reported only in the chief process.
        return workload.Skipped()

    logging.debug(f"Done training step: {num_inputs} records in {num_batches} batches.")

    return metrics
def _compute_validation_metrics(self) -> workload.Response:
    # Set the behavior of certain layers (e.g., dropout) that are
    # different between training and inference.
    self.model.eval()

    num_inputs = 0
    metrics = {}  # type: Optional[Dict[str, Any]]

    if self._evaluate_batch_defined():
        keys = None
        batch_metrics = []

        self.validation_loader = cast(torch.utils.data.DataLoader, self.validation_loader)
        check.gt(len(self.validation_loader), 0)
        for batch in self.validation_loader:
            batch = self._to_device(batch)
            num_inputs += data_length(batch)

            vld_metrics = self.trial.evaluate_batch(batch=batch, model=self.model)
            # Verify validation metric names are the same across batches.
            if keys is None:
                keys = vld_metrics.keys()
            else:
                check.eq(
                    keys,
                    vld_metrics.keys(),
                    "Validation metric names must match across all batches of data.",
                )
            check.is_instance(
                vld_metrics,
                dict,
                "validation_metrics() must return a "
                "dictionary of string names to Tensor "
                "metrics",
            )
            # TODO: For performance perform -> cpu() only at the end of validation.
            batch_metrics.append(self._convert_metrics_to_numpy(vld_metrics))

        keys = cast(Any, keys)
        metrics = self._reduce_metrics(
            batch_metrics=batch_metrics,
            keys=keys,
            metrics_reducers=self._prepare_metrics_reducers(keys=keys),
        )

        if self.hvd_config.use:
            num_inputs *= hvd.size()

    else:
        check.true(self._evaluate_full_dataset_defined())
        self.validation_loader = cast(torch.utils.data.DataLoader, self.validation_loader)
        if self.is_chief:
            metrics = self.trial.evaluate_full_dataset(
                data_loader=self.validation_loader, model=self.model
            )

            check.is_instance(
                metrics, dict, f"eval() must return a dictionary, got {type(metrics)}."
            )

            metrics = self._convert_metrics_to_numpy(metrics)
            num_inputs = self.context.get_per_slot_batch_size() * len(self.validation_loader)

    if not self.is_chief:
        return workload.Skipped()

    return {"num_inputs": num_inputs, "validation_metrics": metrics}
def _train_for_step(self, step_id: int, batches_per_step: int) -> workload.Response:
    check.gt(step_id, 0)
    step_idx = step_id - 1
    start = step_idx * batches_per_step
    end = start + batches_per_step

    # Set the behavior of certain layers (e.g., dropout) that are different
    # between training and inference.
    self.model.train()

    per_batch_metrics = []  # type: List[Dict]
    num_inputs = 0

    for batch_idx in range(start, end):
        batch = next(self.training_iterator)
        num_inputs += data_length(batch)
        batch = self._to_device(batch)

        # Forward pass.
        tr_metrics = self.trial.train_batch(
            batch=batch,
            model=self.model,
            epoch_idx=self.get_epoch_idx(batch_idx),
            batch_idx=batch_idx,
        )
        if isinstance(tr_metrics, torch.Tensor):
            tr_metrics = {"loss": tr_metrics}
        check.is_instance(
            tr_metrics,
            dict,
            "train_batch() must return a dictionary "
            f"mapping string names to Tensor metrics, got {type(tr_metrics)}",
        )
        check.is_in("loss", tr_metrics.keys(), 'Please include "loss" in your training metrics.')

        # Backwards pass.
        loss = tr_metrics["loss"]
        communicate_and_update = (batch_idx + 1) % self.hvd_config.aggregation_frequency == 0
        if self.use_amp():
            with apex.amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
                if self.hvd_config.use and communicate_and_update:
                    self.optimizer.synchronize()
        else:
            loss.backward()

        if communicate_and_update:
            parameters = (
                self.model.parameters()
                if not self.use_amp()
                else apex.amp.master_params(self.optimizer)
            )

            if self.hvd_config.average_aggregated_gradients:
                self._average_gradients(
                    parameters=parameters, divisor=self.hvd_config.aggregation_frequency
                )

            self._clip_grads(parameters)

            if self.hvd_config.use and self.use_amp():
                with self.optimizer.skip_synchronize():
                    self.optimizer.step()
            else:
                self.optimizer.step()
            self.optimizer.zero_grad()

        if self.lr_helper.should_step_lr(
            batches_completed=batch_idx + 1,
            epoch_length=len(self.training_loader),
            aggregation_frequency=self.hvd_config.aggregation_frequency,
        ):
            self.lr_helper.step()

        for name, metric in tr_metrics.items():
            # Convert PyTorch metric values to NumPy, so that
            # `det.util.encode_json` handles them properly without
            # needing a dependency on PyTorch.
            if isinstance(metric, torch.Tensor):
                metric = metric.cpu().detach().numpy()
            tr_metrics[name] = metric

        check.is_in("loss", tr_metrics, 'Please include "loss" in your training metrics.')
        per_batch_metrics.append(tr_metrics)

    if self.hvd_config.use and self.hvd_config.average_training_metrics:
        per_batch_metrics = self._average_training_metrics(per_batch_metrics)

    if not self.is_chief:
        return workload.Skipped()

    if self.hvd_config.use:
        num_inputs *= hvd.size()

    logging.debug(f"Done training step: {num_inputs} records in {batches_per_step} batches.")

    return det.util.make_metrics(num_inputs, per_batch_metrics)
def _train_for_step(self, step_id: int, batches_per_step: int) -> workload.Response:
    check.gt(step_id, 0)

    # Set the behavior of certain layers (e.g., dropout) that are different
    # between training and inference.
    self.context.model.train()

    for callback in self.callbacks.values():
        callback.on_train_step_start(step_id)

    step_idx = step_id - 1
    start = step_idx * batches_per_step
    end = start + batches_per_step

    per_batch_metrics = []  # type: List[Dict]
    num_inputs = 0

    for batch_idx in range(start, end):
        batch = next(self.training_iterator)
        num_inputs += data_length(batch)
        batch = self._to_device(batch)

        # Forward pass.
        tr_metrics = self.trial.train_batch(
            batch=batch,
            model=self.context.model,
            epoch_idx=self.get_epoch_idx(batch_idx),
            batch_idx=batch_idx,
        )
        if isinstance(tr_metrics, torch.Tensor):
            tr_metrics = {"loss": tr_metrics}
        check.is_instance(
            tr_metrics,
            dict,
            "train_batch() must return a dictionary "
            f"mapping string names to Tensor metrics, got {type(tr_metrics)}",
        )
        check.is_in("loss", tr_metrics.keys(), 'Please include "loss" in your training metrics.')

        # Backwards pass.
        loss = tr_metrics["loss"]
        communicate_and_update = (batch_idx + 1) % self.hvd_config.aggregation_frequency == 0
        if self.use_amp():
            with apex.amp.scale_loss(loss, self.context.optimizer) as scaled_loss:
                scaled_loss.backward()
                if self.hvd_config.use and communicate_and_update:
                    # When using horovod, we need to finish communicating gradient
                    # updates before they are unscaled, which happens when we exit
                    # this context manager.
                    self.context.optimizer.synchronize()
        else:
            loss.backward()

            # Communication needs to be synchronized so that it is completed
            # before we apply gradient clipping and `step()`.
            if communicate_and_update and self.hvd_config.use:
                self.context.optimizer.synchronize()

        if communicate_and_update:
            parameters = (
                self.context.model.parameters()
                if not self.use_amp()
                else apex.amp.master_params(self.context.optimizer)
            )

            if self.hvd_config.average_aggregated_gradients:
                self._average_gradients(
                    parameters=parameters, divisor=self.hvd_config.aggregation_frequency
                )

            # TODO: Remove this check in v0.12.8.
            check.false(
                self.env.hparams.get("clip_grad_l2_norm", None)
                or self.env.hparams.get("clip_grad_val", None),
                "Please specify gradient clipping via callbacks.",
            )

            for callback in self.callbacks.values():
                callback.on_before_optimizer_step(parameters)

            if self.hvd_config.use:
                with self.context.optimizer.skip_synchronize():
                    self.context.optimizer.step()
            else:
                self.context.optimizer.step()
            self.context.optimizer.zero_grad()

        # Step learning rate of a LRScheduler.
        if self.context.lr_scheduler is not None:
            self._auto_step_lr_scheduler_per_batch(batch_idx, self.context.lr_scheduler)

        for name, metric in tr_metrics.items():
            # Convert PyTorch metric values to NumPy, so that
            # `det.util.encode_json` handles them properly without
            # needing a dependency on PyTorch.
            if isinstance(metric, torch.Tensor):
                metric = metric.cpu().detach().numpy()
            tr_metrics[name] = metric

        check.is_in("loss", tr_metrics, 'Please include "loss" in your training metrics.')
        per_batch_metrics.append(tr_metrics)

    if self.hvd_config.use and self.hvd_config.average_training_metrics:
        per_batch_metrics = self._average_training_metrics(per_batch_metrics)

    if self.hvd_config.use:
        num_inputs *= hvd.size()

    metrics = det.util.make_metrics(num_inputs, per_batch_metrics)

    for callback in self.callbacks.values():
        callback.on_train_step_end(step_id, metrics)

    if not self.is_chief:
        return workload.Skipped()

    logging.debug(f"Done training step: {num_inputs} records in {batches_per_step} batches.")

    return metrics