def test_result_gather_different_shapes():
    """ Test that tensors of varying shape get gathered into a list. """
    outputs = [
        {"foo": torch.tensor(1)},
        {"foo": torch.zeros(2, 3)},
        {"foo": torch.zeros(1, 2, 3)},
    ]
    result = Result.gather(outputs)
    expected = [torch.tensor(1), torch.zeros(2, 3), torch.zeros(1, 2, 3)]

    gathered = result["foo"]
    assert isinstance(gathered, list)
    # zip() stops at the shorter sequence, so a truncated result would
    # otherwise pass unnoticed
    assert len(gathered) == len(expected)
    # torch.equal compares shape as well as values; torch.eq would broadcast
    assert all(torch.equal(r, e) for r, e in zip(gathered, expected))
def cache_result(self) -> None:
    """
    Store the result object produced by the hook that just ran.

    Called after every hook. The model's ``_results`` is appended to the
    ``HookResultStore`` of the current hook when something was logged, and
    the logger connector is then updated for that hook.
    """
    module = self.trainer.get_model()

    # what the hook logged, and which hook / dataloader it belongs to
    logged = module._results
    hook_name, loader_idx = self.current_model_info()

    # a result of length 1 only holds its internals, i.e. nothing has been logged
    if len(logged) > 1:
        if hook_name not in self._internals:
            self._internals[hook_name] = HookResultStore(hook_name)

        extra = self.extra_info if self.has_split_and_opt_idx else {}

        # attach the captured batch_size
        Result.attach_batch_size(self._batch_size, logged)

        logged.detach()
        if self.trainer.move_metrics_to_cpu:
            logged.cpu()

        store = self._internals[hook_name]
        store.append(logged, dataloader_idx=loader_idx, extra_info=extra)

        # update logged_metrics, progress_bar_metrics, callback_metrics
        self.update_logger_connector(hook_name)

    # reset _results, fx_name
    # NOTE(review): nesting was reconstructed from whitespace-flattened source;
    # confirm the reset is meant to run even when nothing was logged.
    self.reset_model()
def cache_result(self) -> None:
    """
    Store the result object produced by the hook that just ran.

    Called after every hook; the whole body runs inside the
    ``"cache_result"`` profiler section.
    """
    with self.trainer.profiler.profile("cache_result"):
        module = self.trainer.lightning_module
        logged = module._results

        # length 1 is treated as "nothing logged": only clear the hook names
        if len(logged) == 1:
            module._current_hook_fx_name = None
            module._current_fx_name = ''
            return

        info = self.info
        hook_name = info["fx_name"]
        gather_fn = self.trainer.lightning_module.all_gather
        new_store = HookResultStore(hook_name, gather_fn, self._should_warn)
        self._internals.setdefault(hook_name, new_store)

        # attach the captured batch_size
        Result.attach_batch_size(self._batch_size, logged)

        logged = logged.detach()
        if self.trainer.move_metrics_to_cpu:
            logged = logged.cpu()
        elif self.trainer._distrib_type == DistributedType.DP:
            root_device = torch.device("cuda", self.trainer.root_gpu)
            logged = logged.to(root_device)

        self._internals[hook_name].append(logged, info)

        # update logged_metrics, progress_bar_metrics, callback_metrics
        if "epoch_end" in hook_name:
            self.update_logger_connector()

        # NOTE(review): nesting reconstructed from flattened source; the
        # reset is assumed to run for every hook, not only epoch-end ones.
        self.reset_model()
def cache_result(self) -> None:
    """
    Store the result object produced by the hook that just ran.

    Called after every hook; the whole body runs inside the
    ``"cache_result"`` profiler section.
    """
    with self.trainer.profiler.profile("cache_result"):
        module = self.trainer.get_model()
        logged = module._results

        # length 1 is treated as "nothing logged": only clear the hook names
        if len(logged) == 1:
            module._current_hook_fx_name = None
            module._current_fx_name = ''
            return

        # which hook and which dataloader the result belongs to
        hook_name, loader_idx = self.current_model_info()
        self._internals.setdefault(hook_name, HookResultStore(hook_name))

        if self.has_split_and_opt_idx:
            extra = self.extra_info
        else:
            extra = {}

        # attach the captured batch_size
        Result.attach_batch_size(self._batch_size, logged)

        logged.detach()
        if self.trainer.move_metrics_to_cpu:
            logged.cpu()
        elif self.trainer.use_dp:
            root_device = torch.device("cuda", self.trainer.root_gpu)
            logged.to(root_device)

        store = self._internals[hook_name]
        store.append(logged, dataloader_idx=loader_idx, extra_info=extra)

        # update logged_metrics, progress_bar_metrics, callback_metrics
        if "epoch_end" in hook_name:
            self.update_logger_connector()

        # NOTE(review): nesting reconstructed from flattened source; the
        # reset is assumed to run for every hook, not only epoch-end ones.
        self.reset_model()
def test_result_metric_integration():
    """
    Log three metrics with different on_step / on_epoch flags and check the
    batch-level and epoch-level log dictionaries over three epochs.
    """
    metric_a, metric_b, metric_c = DummyMetric(), DummyMetric(), DummyMetric()
    metrics = (metric_a, metric_b, metric_c)

    result = Result()

    for _ in range(3):
        running_total = 0

        for i in range(5):
            for metric in metrics:
                metric(i)
            running_total += i

            result.log('a', metric_a, on_step=True, on_epoch=True)
            result.log('b', metric_b, on_step=False, on_epoch=True)
            result.log('c', metric_c, on_step=True, on_epoch=False)

            batch_log = result.get_batch_log_metrics()
            batch_expected = {"a_step": i, "a": i, "c": i}
            assert set(batch_log.keys()) == set(batch_expected)
            for name, wanted in batch_expected.items():
                assert wanted == batch_log[name]

        epoch_log = result.get_epoch_log_metrics()
        result.reset()

        # metric state must be back at its default values
        for metric in metrics:
            assert metric.x == metric._defaults['x']

        epoch_expected = {"b": running_total, "a_epoch": running_total}
        assert set(epoch_log.keys()) == set(epoch_expected)
        for name, wanted in epoch_expected.items():
            assert wanted == epoch_log[name]
def training_step(self, split_batch, batch_idx, opt_idx, hiddens):
    """
    Run one training step and package its loss and outputs.

    Returns:
        ``None`` when the processed step output for epoch end is ``None``;
        otherwise an ``AttributeDict`` with ``closure_loss``, ``loss``,
        ``training_step_output``, ``training_step_output_for_epoch_end``
        and ``hiddens``. Both losses stay ``None`` unless
        ``automatic_optimization`` is enabled on the train loop.
    """
    trainer = self.trainer

    # give the PL module a result for logging
    module = trainer.get_model()

    # NOTE(review): the extent of the profiled region was reconstructed from
    # whitespace-flattened source; confirm which statements belong inside it.
    with trainer.profiler.profile("model_forward"):
        step_args = self.build_train_args(split_batch, batch_idx, opt_idx, hiddens)

        # manually capture logged metrics
        module._current_fx_name = 'training_step'
        module._results = Result()
        step_output = trainer.accelerator_backend.training_step(step_args)
        trainer.logger_connector.cache_logged_metrics()

        self._check_training_step_output(step_output)

        step_output = trainer.call_hook("training_step_end", step_output)

        processed = self._process_training_step_output(step_output, split_batch)
        output_for_epoch_end, step_output = processed

        if output_for_epoch_end is None:
            return None

    # losses stay empty when using manual optimization
    closure_loss = untouched_loss = None

    if trainer.train_loop.automatic_optimization:
        if isinstance(step_output, Result):
            raw_loss = step_output.minimize
        else:
            raw_loss = step_output.batch_loss

        # accumulate loss (no effect if accumulate_grad_batches = 1)
        closure_loss = raw_loss / trainer.accumulate_grad_batches

        # the loss will get scaled for amp, so keep an unmodified copy
        untouched_loss = closure_loss.detach().clone()

    return AttributeDict(
        closure_loss=closure_loss,
        loss=untouched_loss,
        training_step_output=step_output,
        training_step_output_for_epoch_end=output_for_epoch_end,
        hiddens=step_output.hiddens,
    )
def _ddp_test_fn(rank, worldsize):
    """
    Per-process body of the DDP metric-logging test.

    After ``_setup_ddp(rank, worldsize)``, logs three metrics with different
    on_step / on_epoch flags and checks that batch-level values equal the
    local step value while epoch-level values equal the local cumulative sum
    multiplied by ``worldsize``.
    """
    _setup_ddp(rank, worldsize)

    metric_a = DummyMetric()
    metric_b = DummyMetric()
    metric_c = DummyMetric()

    # ddp_sync_on_step is False by default
    result = Result()

    for _epoch in range(3):
        cumulative_sum = 0

        for i in range(5):
            metric_a(i)
            metric_b(i)
            metric_c(i)

            cumulative_sum += i

            result.log('a', metric_a, on_step=True, on_epoch=True)
            result.log('b', metric_b, on_step=False, on_epoch=True)
            result.log('c', metric_c, on_step=True, on_epoch=False)

            batch_log = result.get_batch_log_metrics()
            batch_expected = {"a_step": i, "a": i, "c": i}
            assert set(batch_log.keys()) == set(batch_expected.keys())
            for k in batch_expected.keys():
                assert batch_expected[k] == batch_log[k]

        epoch_log = result.get_epoch_log_metrics()

        # assert metric state reset to default values
        assert metric_a.x == metric_a._defaults['x']
        assert metric_b.x == metric_b._defaults['x']
        assert metric_c.x == metric_c._defaults['x']

        epoch_expected = {
            "b": cumulative_sum * worldsize,
            "a": cumulative_sum * worldsize,
            "a_epoch": cumulative_sum * worldsize,
        }

        assert set(epoch_log.keys()) == set(epoch_expected.keys())
        for k in epoch_expected.keys():
            assert epoch_expected[k] == epoch_log[k]
def __run_eval_epoch_end(self, num_dataloaders, using_eval_result):
    """
    Run the user's ``test_epoch_end`` / ``validation_epoch_end`` hook, when
    overridden, on the collected outputs.

    Args:
        num_dataloaders: number of evaluation dataloaders; with exactly one,
            the hook receives ``self.outputs[0]`` rather than the full list.
        using_eval_result: when true, outputs are gathered before being
            handed to a user hook, or auto-reduced when no user hook ran.

    Returns:
        The evaluation results, always wrapped in a list.
    """
    model = self.trainer.get_model()

    # reset results
    model._results = Result()

    # with a single dataloader don't pass an array
    outputs = self.outputs
    eval_results = outputs[0] if num_dataloaders == 1 else outputs

    # a single name drives the override check, the call and the warning, so
    # the warning always names the hook that actually ran
    hook_name = 'test_epoch_end' if self.testing else 'validation_epoch_end'

    user_reduced = False
    if is_overridden(hook_name, model=model):
        model._current_fx_name = hook_name
        if using_eval_result:
            eval_results = self.__gather_epoch_end_eval_results(outputs)

        eval_results = getattr(model, hook_name)(eval_results)
        user_reduced = True

    # deprecation warning
    if eval_results is not None and user_reduced:
        self.warning_cache.warn(
            f'The {hook_name} should not return anything as of 9.1.'
            ' To log, use self.log(...) or self.write(...) directly in the LightningModule'
        )

    if using_eval_result and not user_reduced:
        eval_results = self.__auto_reduce_result_objs(outputs)

    if not isinstance(eval_results, list):
        eval_results = [eval_results]

    return eval_results
def test_result_gather_stack():
    """ Test that tensors get concatenated when they all have the same shape. """
    outputs = [{"foo": torch.zeros(4, 5)} for _ in range(3)]

    gathered = Result.gather(outputs)["foo"]

    assert isinstance(gathered, torch.Tensor)
    assert tuple(gathered.shape) == (12, 5)
def test_result_gather_concatenate():
    """ Test that tensors get concatenated when they have varying size in first dimension. """
    outputs = [{"foo": torch.zeros(rows, 5)} for rows in (4, 8, 3)]

    gathered = Result.gather(outputs)["foo"]

    assert isinstance(gathered, torch.Tensor)
    assert tuple(gathered.shape) == (15, 5)
def test_result_gather_scalar():
    """ Test that 0-dim tensors get gathered and stacked correctly. """
    outputs = [{"foo": torch.tensor(value)} for value in (1, 2, 3)]

    gathered = Result.gather(outputs)["foo"]

    assert isinstance(gathered, torch.Tensor)
    assert tuple(gathered.shape) == (3,)
def on_evaluation_batch_start(self, batch, batch_idx, dataloader_idx):
    """
    Prepare the model for an evaluation batch and fire the batch-start hook.

    Gives the model a fresh ``Result``, marks ``'evaluation_step'`` as the
    current function, notifies the logger connector, then calls
    ``on_test_batch_start`` or ``on_validation_batch_start`` depending on
    ``self.testing``.
    """
    trainer = self.trainer

    # reset the result of the PL module
    module = trainer.get_model()
    module._results = Result()
    module._current_fx_name = 'evaluation_step'

    # set dataloader_idx and track batch_size
    trainer.logger_connector.on_evaluation_batch_start(
        self.testing, batch, dataloader_idx, self.num_dataloaders
    )

    hook_name = 'on_test_batch_start' if self.testing else 'on_validation_batch_start'
    trainer.call_hook(hook_name, batch, batch_idx, dataloader_idx)
def log_metrics(self, result: Result, step_results: Dict[str, float], step_type: str) -> None:
    """
    Log the loss, per-task losses and per-task top-k accuracies.

    Each value is read from ``step_results`` under its metric name and
    logged on ``result`` as ``"<metric name>/<step_type>"``. Tasks are
    ``verb`` and ``noun``; accuracies are logged for k = 1 and k = 5.
    """
    # build the metric names up front, in the order they are logged
    metric_names = ["loss"]
    for task in ("verb", "noun"):
        metric_names.append(f"{task}_loss")
        metric_names.extend(f"{task}_accuracy@{k}" for k in (1, 5))

    for metric in metric_names:
        result.log(f"{metric}/{step_type}", step_results[metric])
def test_result_gather_mixed_types():
    """ Test that a collection of mixed types gets gathered into a list. """
    outputs = [
        {"foo": 1.2},
        {"foo": ["bar", None]},
        {"foo": torch.tensor(1)},
    ]
    expected = [1.2, ["bar", None], torch.tensor(1)]

    gathered = Result.gather(outputs)["foo"]

    assert isinstance(gathered, list)
    assert gathered == expected
def training_step(self, split_batch, batch_idx, opt_idx, hiddens):
    """
    Run one training step and package its loss and outputs.

    Returns:
        ``None`` when the processed step output for epoch end is ``None``;
        otherwise an ``AttributeDict`` with ``closure_loss``, ``loss``,
        ``training_step_output``, ``training_step_output_for_epoch_end``
        and ``hiddens``.
    """
    trainer = self.trainer

    # give the PL module a result for logging
    module = trainer.get_model()
    module._results = Result()
    module._current_fx_name = 'training_step'

    # NOTE(review): the extent of the profiled region was reconstructed from
    # whitespace-flattened source; confirm which statements belong inside it.
    with trainer.profiler.profile('model_forward'):
        step_args = self.build_train_args(split_batch, batch_idx, opt_idx, hiddens)
        step_output = trainer.accelerator_backend.training_step(step_args)
        step_output = trainer.call_hook('training_step_end', step_output)

        processed = self._process_training_step_output(step_output, split_batch)
        output_for_epoch_end, step_output = processed

    if output_for_epoch_end is None:
        return None

    if isinstance(step_output, Result):
        raw_loss = step_output.minimize
    else:
        raw_loss = step_output.batch_loss

    # accumulate loss (no effect if accumulate_grad_batches = 1)
    closure_loss = raw_loss / trainer.accumulate_grad_batches

    # the loss will get scaled for amp, so keep an unmodified copy
    untouched_loss = closure_loss.detach().clone()

    return AttributeDict(
        closure_loss=closure_loss,
        loss=untouched_loss,
        training_step_output=step_output,
        training_step_output_for_epoch_end=output_for_epoch_end,
        hiddens=step_output.hiddens,
    )
def test_sample_metadata_field() -> None:
    """
    Test that the string constant we use to identify the metadata field is really matching the
    field name in SampleWithMetadata
    """
    batch_size = 5
    xyz = (6, 7, 8)
    image_like = torch.zeros((batch_size, *xyz))
    labels = torch.zeros((batch_size, 2, *xyz))

    sample = Sample(metadata=DummyPatientMetadata, image=image_like, mask=image_like, labels=labels)
    fields = vars(sample)

    assert len(fields) == 4
    assert SAMPLE_METADATA_FIELD in fields
    # Lightning attempts to determine the batch size by trying to find a tensor field in the sample.
    # This only works if any field other than Metadata is first.
    assert Result.unpack_batch_size(fields) == batch_size
def evaluation_step(self, batch: Any, batch_idx: int, dataloader_idx: int) -> Optional[STEP_OUTPUT]:
    """
    Run a single test or validation step through the accelerator.

    ``self.trainer.testing`` selects between ``test_step`` and
    ``validation_step``; the chosen name is also used as the model's
    current function name and as the profiler section. Logged metrics are
    cached afterwards, and a ``Result`` output additionally tracks the
    batch size.

    Returns:
        Whatever the accelerator's step returned.
    """
    # configure step_kwargs
    step_kwargs = self._build_kwargs(batch, batch_idx, dataloader_idx)

    module = self.trainer.lightning_module
    module._results = Result()

    step_name = "test_step" if self.trainer.testing else "validation_step"
    module._current_fx_name = step_name
    with self.trainer.profiler.profile(step_name):
        step_fn = getattr(self.trainer.accelerator, step_name)
        output = step_fn(step_kwargs)

    # capture any logged information
    self.trainer.logger_connector.cache_logged_metrics()

    # track batch size for weighted average
    if isinstance(output, Result):
        output.track_batch_size(batch)

    return output
def on_train_split_start(self, split_idx: int, opt_idx: int, split_batch) -> None:
    """Record the split index, optimizer index and extracted batch size on the cached results."""
    cache = self.cached_results
    cache._split_idx = split_idx
    cache._opt_idx = opt_idx
    cache._batch_size = Result.extract_batch_size(split_batch)
def on_evaluation_batch_start(self, testing, batch, dataloader_idx, num_dataloaders):
    """
    Record the dataloader index on the model and the batch size on the cached results.

    The model's ``_current_dataloader_idx`` is only set to ``dataloader_idx``
    when there is more than one dataloader; otherwise it is ``None``.
    """
    module = self.trainer.get_model()

    # set dataloader_idx only if multiple ones
    if num_dataloaders > 1:
        module._current_dataloader_idx = dataloader_idx
    else:
        module._current_dataloader_idx = None

    # track batch_size
    batch_size = Result.extract_batch_size(batch)
    self.cached_results._batch_size = batch_size
def check_dataloader_idx(self, result: Result) -> bool:
    """Return whether the ``"meta"`` entry of the last key in ``result`` has a non-``None`` ``"dataloader_idx"``."""
    last_key = tuple(result.keys())[-1]
    key_meta = result["meta"][last_key]
    return key_meta["dataloader_idx"] is not None
def test_result_retrieve_last_logged_item():
    """ Test that a value logged on step and on epoch is retrievable under 'a_epoch', 'a_step' and 'a'. """
    logged_value = 5.
    result = Result()
    result.log('a', logged_value, on_step=True, on_epoch=True)

    for key in ('a_epoch', 'a_step', 'a'):
        assert result[key] == logged_value
def _reset_result_and_set_hook_fx_name(self, hook_name):
    """
    Give the model a fresh ``Result`` and record ``hook_name`` as the hook
    currently being called. Does nothing when ``get_model()`` returns ``None``.
    """
    module = self.get_model()
    if module is None:
        return

    # used to track current hook name called
    module._results = Result()
    module._current_hook_fx_name = hook_name
def check_dataloader_idx(self, result: Result) -> bool:
    """Return whether the ``"meta"`` entry of the last key in ``result`` has a non-``None`` ``"dataloader_idx"``."""
    keys = list(result.keys())
    meta_for_last = result["meta"][keys[-1]]
    return meta_for_last["dataloader_idx"] is not None