def _all_gather(engine):
    """Gather the per-image details of ``metric3`` and ``metric4`` across all ranks.

    Replaces each entry of ``engine.state.metric_details`` in place with the
    rank-concatenated result from ``evenly_divisible_all_gather``.
    """
    for key in ("metric3", "metric4"):
        details = engine.state.metric_details[key]
        engine.state.metric_details[key] = evenly_divisible_all_gather(data=details)
def _run(self):
    """Verify ``evenly_divisible_all_gather`` pads and concatenates uneven per-rank batches.

    NOTE(review): assumes a world size of 2 — ranks other than 0/1 would leave
    ``data1``/``data2`` unbound.
    """
    rank = dist.get_rank()
    if rank == 0:
        data1 = torch.tensor([[1, 2], [3, 4]])
        data2 = torch.tensor([[1.0, 2.0]])
    elif rank == 1:
        data1 = torch.tensor([[5, 6]])
        data2 = torch.tensor([[3.0, 4.0], [5.0, 6.0]])

    result1 = evenly_divisible_all_gather(data=data1)
    torch.testing.assert_allclose(result1, torch.tensor([[1, 2], [3, 4], [5, 6]]))

    result2 = evenly_divisible_all_gather(data=data2)
    torch.testing.assert_allclose(result2, torch.tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]))
def _finalize(self, engine: Engine) -> None:
    """
    All gather classification results from ranks and save to CSV file.

    Args:
        engine: Ignite Engine, it can be a trainer, validator or evaluator.

    Raises:
        ValueError: when ``self.save_rank`` is outside the distributed group.

    """
    world_size = idist.get_world_size()
    if self.save_rank >= world_size:
        raise ValueError("target save rank is greater than the distributed group size.")

    outputs = torch.cat(self._outputs, dim=0)
    filenames = self._filenames
    if world_size > 1:
        # collect the predictions and their matching filenames from every rank
        outputs = evenly_divisible_all_gather(outputs)
        filenames = string_list_all_gather(filenames)

    meta_dict = None
    if len(filenames) > 0:
        if len(filenames) != len(outputs):
            warnings.warn(f"filenames length: {len(filenames)} doesn't match outputs length: {len(outputs)}.")
        meta_dict = {Key.FILENAME_OR_OBJ: filenames}

    # only the designated rank writes the CSV file
    if idist.get_rank() == self.save_rank:
        saver = CSVSaver(self.output_dir, self.filename, self.overwrite)
        saver.save_batch(outputs, meta_dict)
        saver.finalize()
def compute(self) -> Any:
    """
    Gather accumulated per-image scores across ranks, reduce them on rank 0,
    and broadcast the reduced value to every process.

    Raises:
        NotComputableError: When ``compute`` is called before an ``update`` occurs.

    """
    scores = torch.cat(self._scores, dim=0)
    world_size = idist.get_world_size()
    if world_size > 1 and not self._is_reduced:
        # gather scores from every process, exactly once per epoch
        scores = evenly_divisible_all_gather(data=scores)
        self._is_reduced = True

    if self.save_details:
        # expose per-image scores via engine.state for other components
        if self._engine is None or self._name is None:
            raise RuntimeError("please call the attach() function to connect expected engine first.")
        self._engine.state.metric_details[self._name] = scores

    result: torch.Tensor = torch.zeros(1)
    if idist.get_rank() == 0:
        # run compute_fn on zero rank only
        result = self._reduce(scores)
    if world_size > 1:
        # broadcast result to all processes
        result = idist.broadcast(result, src=0)

    return result.item() if isinstance(result, torch.Tensor) else result
def compute(self) -> Any:
    """
    Gather accumulated predictions and targets across ranks (once), run
    ``self.compute_fn`` on rank 0 only, and broadcast the result to all ranks.

    Returns:
        the metric value; unwrapped to a python scalar when the result is a Tensor.

    """
    _prediction_tensor = torch.cat(self._predictions, dim=0)
    _target_tensor = torch.cat(self._targets, dim=0)

    ws = idist.get_world_size()
    if ws > 1 and not self._is_reduced:
        # All gather across all processes
        _prediction_tensor = evenly_divisible_all_gather(_prediction_tensor)
        _target_tensor = evenly_divisible_all_gather(_target_tensor)
        self._is_reduced = True

    result: torch.Tensor = torch.zeros(1)
    if idist.get_rank() == 0:
        # Run compute_fn on zero rank only
        result = self.compute_fn(_prediction_tensor, _target_tensor)
    if ws > 1:
        # broadcast result to all processes
        result = idist.broadcast(result, src=0)

    # use isinstance for consistency with the sibling compute() implementation
    # (torch.is_tensor is equivalent but the codebase favors isinstance)
    return result.item() if isinstance(result, torch.Tensor) else result
def __call__(self, engine: Engine) -> None:
    """
    This method assumes self.batch_transform will extract metadata from the input batch.

    Args:
        engine: Ignite Engine, it can be a trainer, validator or evaluator.

    """
    meta = self.batch_transform(engine.state.batch)
    if Key.FILENAME_OR_OBJ in meta:
        # gather filenames from every rank so they line up with the gathered predictions
        meta[Key.FILENAME_OR_OBJ] = string_list_all_gather(meta[Key.FILENAME_OR_OBJ])

    # gather predictions from every rank
    outputs = evenly_divisible_all_gather(self.output_transform(engine.state.output))

    if self._expected_rank:
        self.saver.save_batch(outputs, meta)