def test_multi_gpu_model_dp(tmpdir):
    """Make sure DP works across all available GPUs."""
    tutils.reset_seed()

    # bail out entirely on machines without usable GPUs
    if not tutils.can_run_gpu_test():
        return

    model, hparams = tutils.get_model()

    options = {
        'default_save_path': tmpdir,
        'show_progress_bar': False,
        'distributed_backend': 'dp',
        'max_nb_epochs': 1,
        'train_percent_check': 0.1,
        'val_percent_check': 0.1,
        'gpus': '-1',
    }
    tutils.run_model_test(options, model, hparams)

    # exercise the memory helper as well
    memory.get_memory_profile('min_max')
def log_metrics(self, metrics, grad_norm_dic, step=None):
    """Logs the metric dict passed in.

    If `step` parameter is None and `step` key is presented is metrics,
    uses metrics["step"] as a step

    Args:
        metrics (dict): Metric values
        grad_norm_dic (dict): Gradient norms
        step (int): Step for which metrics should be logged. Default value
            corresponds to `self.global_step`
    """
    # add gpu memory
    if self.on_gpu and self.log_gpu_memory:
        mem_map = memory.get_memory_profile(self.log_gpu_memory)
        metrics.update(mem_map)

    # add norms
    metrics.update(grad_norm_dic)

    # turn all tensors to scalars
    scalar_metrics = self.metrics_to_scalars(metrics)

    if "step" in scalar_metrics and step is None:
        step = scalar_metrics.pop("step")
    elif step is None:
        # added metrics by Lightning for convenience
        # BUG FIX: the epoch must go into `scalar_metrics` (the dict actually
        # sent to the logger). The old code wrote it into `metrics` AFTER
        # scalarization, so the epoch never reached the logger; it also ran
        # even when an explicit `step` was passed. Matches the later
        # `elif step is None: scalar_metrics['epoch'] = ...` variants.
        scalar_metrics['epoch'] = self.current_epoch
        step = self.global_step

    # log actual metrics
    if self.proc_rank == 0 and self.logger is not None:
        self.logger.log_metrics(scalar_metrics, step=step)
        self.logger.save()
def log_metrics(self, metrics, grad_norm_dic, step=None):
    """Forward a metrics dict (plus gradient norms) to the attached logger.

    :param metrics: dict of metric values to log
    :param grad_norm_dic: dict of gradient norms, merged into ``metrics``
    :param step: explicit logging step; falls back to ``self.global_step``
    """
    # convenience key added by Lightning
    metrics['epoch'] = self.current_epoch

    # optionally attach GPU memory statistics
    if self.on_gpu and self.log_gpu_memory:
        metrics.update(memory.get_memory_profile(self.log_gpu_memory))

    # fold in the gradient norms
    metrics.update(grad_norm_dic)

    # tensors -> python scalars
    scalar_metrics = self.metrics_to_scalars(metrics)

    if step is None:
        step = self.global_step

    # only rank zero talks to the logger
    if self.proc_rank == 0 and self.logger is not None:
        self.logger.log_metrics(scalar_metrics, step=step)
        self.logger.save()
def log_metrics(self, metrics: Dict[str, _METRIC], step: Optional[int] = None) -> None:
    """Logs the metric dict passed in.

    If `step` parameter is None and `step` key is presented is metrics,
    uses metrics["step"] as a step

    Args:
        metrics: Metric values
        step: Step for which metrics should be logged. Default value is
            `self.global_step` during training or the total validation / test
            log step count during validation and testing.
    """
    # nothing to do without a logger or without any metrics
    if self.trainer.logger is None or not metrics:
        return

    # optionally attach GPU memory statistics
    if self.trainer._device_type == DeviceType.GPU and self.log_gpu_memory:
        metrics.update(memory.get_memory_profile(self.log_gpu_memory))

    # tensors -> python scalars
    scalar_metrics = metrics_to_scalars(metrics)

    if step is None:
        # a "step" entry inside the metrics themselves wins over the defaults
        step = scalar_metrics.pop("step", None)
    if step is None:
        # convenience key added for the user
        scalar_metrics.setdefault("epoch", self.trainer.current_epoch)
        step = self.trainer.global_step

    # only rank zero talks to the logger
    if self.trainer.is_global_zero:
        self.trainer.logger.agg_and_log_metrics(scalar_metrics, step=step)
        self.trainer.logger.save()
    self._logged_metrics.update(scalar_metrics)
def test_multi_gpu_model_ddp_spawn(tmpdir):
    """Smoke-test a DDP-spawn run on two GPUs."""
    tutils.set_random_master_port()

    options = {
        'default_root_dir': tmpdir,
        'max_epochs': 1,
        'limit_train_batches': 10,
        'limit_val_batches': 10,
        'gpus': [0, 1],
        'distributed_backend': 'ddp_spawn',
        'progress_bar_refresh_rate': 0,
    }
    tpipes.run_model_test(options, EvalModelTemplate())

    # exercise the memory helper as well
    memory.get_memory_profile('min_max')
def test_multi_gpu_model_dp(tmpdir):
    """Make sure DP works."""
    tutils.reset_seed()

    model, _hparams = tutils.get_default_model()

    options = {
        'default_save_path': tmpdir,
        'progress_bar_refresh_rate': 0,
        'distributed_backend': 'dp',
        'max_epochs': 1,
        'train_percent_check': 0.1,
        'val_percent_check': 0.1,
        'gpus': '-1',
    }
    tutils.run_model_test(options, model)

    # exercise the memory helper as well
    memory.get_memory_profile('min_max')
def test_multi_gpu_model_dp(tmpdir):
    """Smoke-test DP training on two GPUs with a BoringModel."""
    tutils.set_random_master_port()

    tpipes.run_model_test(
        {
            'default_root_dir': tmpdir,
            'max_epochs': 1,
            'limit_train_batches': 10,
            'limit_val_batches': 10,
            'gpus': [0, 1],
            'accelerator': 'dp',
            'progress_bar_refresh_rate': 0,
        },
        BoringModel(),
    )

    # exercise the memory helper as well
    memory.get_memory_profile('min_max')
def test_multi_gpu_model(tmpdir, backend):
    """Make sure DDP works."""
    tutils.set_random_master_port()

    model = EvalModelTemplate()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        train_percent_check=0.4,
        val_percent_check=0.2,
        gpus=[0, 1],
        distributed_backend=backend,
    )
    result = trainer.fit(model)
    assert result

    # exercise the memory helper as well
    memory.get_memory_profile('min_max')
def log_metrics(self, metrics, grad_norm_dic, step=None, log_train_step_metrics=False):
    """Send a metrics dict to the logger attached to the trainer.

    If ``step`` is None and a "step" key is present in the scalarized
    metrics, that value is popped and used as the logging step.

    Args:
        metrics (dict): Metric values.
        grad_norm_dic (dict): Gradient norms, merged into ``metrics``.
        step (int): Step for which metrics should be logged. Default value
            corresponds to ``self.global_step``.
        log_train_step_metrics (bool): Used to track if log_metrics function
            is being called in during training steps. In training steps, we
            will log metrics on step: total_nb_idx (for accumulated gradients)
            and global_step for the rest.
    """
    trainer = self.trainer

    # optionally attach GPU memory statistics
    if trainer.on_gpu and trainer.log_gpu_memory:
        metrics.update(memory.get_memory_profile(trainer.log_gpu_memory))

    # fold in gradient norms
    metrics.update(grad_norm_dic)

    # tensors -> python scalars
    scalar_metrics = trainer.metrics_to_scalars(metrics)

    if step is None:
        if "step" in scalar_metrics:
            # a "step" entry inside the metrics themselves wins
            step = scalar_metrics.pop("step")
        elif log_train_step_metrics:
            # inside a training step: log against the accumulated batch index
            step = trainer.total_batch_idx
        else:
            # convenience key added by Lightning
            scalar_metrics['epoch'] = trainer.current_epoch
            step = trainer.global_step

    if trainer.logger is not None:
        if trainer.is_global_zero:
            trainer.logger.agg_and_log_metrics(scalar_metrics, step=step)
            trainer.logger.save()

        # track the logged metrics
        self.logged_metrics.update(scalar_metrics)
        trainer.dev_debugger.track_logged_metrics_history(scalar_metrics)
def log_metrics(self, metrics, grad_norm_dict, step=None):
    """Send a metrics dict to the logger attached to the trainer.

    If ``step`` is None and a "step" key is present in the scalarized
    metrics, that value is popped and used as the logging step.

    Args:
        metrics (dict): Metric values.
        grad_norm_dict (dict): Gradient norms, merged into ``metrics``.
        step (int): Step for which metrics should be logged. Default value is
            `self.global_step` during training or the total validation / test
            log step count during validation and testing.
    """
    # optionally attach GPU memory statistics
    if self.trainer._device_type == DeviceType.GPU and self.log_gpu_memory:
        metrics.update(memory.get_memory_profile(self.log_gpu_memory))

    # fold in gradient norms
    metrics.update(grad_norm_dict)

    # tensors -> python scalars
    scalar_metrics = metrics_to_scalars(metrics)

    if step is None:
        if "step" in scalar_metrics:
            # a "step" entry inside the metrics themselves wins
            step = scalar_metrics.pop("step")
        else:
            # convenience key added by Lightning
            scalar_metrics['epoch'] = self.trainer.current_epoch
            step = self.trainer.global_step

    if self.trainer.logger is not None:
        if self.trainer.is_global_zero:
            self.trainer.logger.agg_and_log_metrics(scalar_metrics, step=step)
            self.trainer.logger.save()

        # track the logged metrics
        self.logged_metrics.update(scalar_metrics)
        self.trainer.dev_debugger.track_logged_metrics_history(scalar_metrics)
def gpus_metrics(self) -> Dict[str, str]:
    """Return the cached GPU memory metrics, refreshing them first when
    running on GPU with GPU-memory logging enabled."""
    on_gpu = self.trainer._device_type == DeviceType.GPU
    if on_gpu and self.log_gpu_memory:
        # refresh the cache with the latest memory profile
        self._gpus_metrics.update(memory.get_memory_profile(self.log_gpu_memory))
    return self._gpus_metrics