def _test_distrib_average(device):
    with pytest.raises(NotComputableError):
        v = Average(device=device)
        v.compute()

    mean_var = Average(device=device)
    y_true = torch.rand(100, dtype=torch.float64) + torch.randint(0, 10, size=(100,)).double()
    y_true = y_true.to(device)

    for y in y_true:
        mean_var.update(y)

    m = mean_var.compute()

    y_true = idist.all_reduce(y_true)
    assert m.item() == pytest.approx(y_true.mean().item() / idist.get_world_size())

    mean_var = Average(device=device)
    y_true = torch.rand(100, 10, dtype=torch.float64) + torch.randint(0, 10, size=(100, 10)).double()
    y_true = y_true.to(device)

    for y in y_true:
        mean_var.update(y)

    m = mean_var.compute()

    y_true = idist.all_reduce(y_true)
    np.testing.assert_almost_equal(
        m.cpu().numpy(), y_true.mean(dim=0).cpu().numpy() / idist.get_world_size(), decimal=5
    )
def _test(metric_device):
    with pytest.raises(NotComputableError):
        v = GeometricAverage(device=metric_device)
        v.compute()

    decimal = 5 if device.type != "xla" else 4

    mean_var = GeometricAverage(device=metric_device)
    y_true = torch.rand(100, dtype=torch.float64) + torch.randint(0, 10, size=(100,)).double()
    y_true = y_true.to(device)

    for y in y_true:
        mean_var.update(y)

    m = mean_var.compute()
    log_y_true = torch.log(y_true)
    log_y_true = idist.all_reduce(log_y_true)
    np.testing.assert_almost_equal(
        m, torch.exp(log_y_true.mean(dim=0) / idist.get_world_size()).item(), decimal=decimal
    )

    mean_var = GeometricAverage(device=metric_device)
    y_true = torch.rand(100, 10, dtype=torch.float64) + torch.randint(0, 10, size=(100, 10)).double()
    y_true = y_true.to(device)

    for y in y_true:
        mean_var.update(y)

    m = mean_var.compute()
    log_y_true = torch.log(y_true)
    log_y_true = idist.all_reduce(log_y_true)
    np.testing.assert_almost_equal(
        m.cpu().numpy(), torch.exp(log_y_true.mean(dim=0) / idist.get_world_size()).cpu().numpy(), decimal=decimal
    )
def _test_distrib_variable_accumulation(device):
    mean_var = VariableAccumulation(lambda a, x: a + x, device=device)
    y_true = torch.rand(100, device=device, dtype=torch.float64)

    for y in y_true:
        mean_var.update(y)

    y_true = idist.all_reduce(y_true)
    a, n = mean_var.compute()
    assert a.item() == pytest.approx(y_true.sum().item())
    assert n == len(y_true) * idist.get_world_size()
    # check that compute can be called a second time with the same result
    a, n = mean_var.compute()
    assert a.item() == pytest.approx(y_true.sum().item())
    assert n == len(y_true) * idist.get_world_size()

    mean_var = VariableAccumulation(lambda a, x: a + x, device=device)
    y_true = torch.rand(50, 10, device=device, dtype=torch.float64)

    for y in y_true:
        mean_var.update(y)

    y_true = idist.all_reduce(y_true)
    a, n = mean_var.compute()
    assert n == len(y_true) * idist.get_world_size()
    np.testing.assert_almost_equal(a.cpu().numpy(), y_true.sum(dim=0).cpu().numpy(), decimal=4)
    a, n = mean_var.compute()
    assert n == len(y_true) * idist.get_world_size()
    np.testing.assert_almost_equal(a.cpu().numpy(), y_true.sum(dim=0).cpu().numpy(), decimal=4)
def compute(self) -> Union[torch.Tensor, float]:
    is_scalar = not isinstance(self._positives, torch.Tensor) or self._positives.ndim == 0
    if is_scalar and self._positives == 0:
        raise NotComputableError(
            f"{self.__class__.__name__} must have at least one example before it can be computed."
        )

    if not self._is_reduced:
        if not (self._type == "multilabel" and not self._average):
            self._true_positives = idist.all_reduce(self._true_positives)  # type: ignore[assignment]
            self._positives = idist.all_reduce(self._positives)  # type: ignore[assignment]
        else:
            self._true_positives = cast(torch.Tensor, idist.all_gather(self._true_positives))
            self._positives = cast(torch.Tensor, idist.all_gather(self._positives))
        self._is_reduced = True  # type: bool

    result = self._true_positives / (self._positives + self.eps)

    if self._average:
        return cast(torch.Tensor, result).mean().item()
    else:
        return result
def _test_distrib_all_gather(device):
    res = torch.tensor(idist.all_gather(10), device=device)
    true_res = torch.tensor([10] * idist.get_world_size(), device=device)
    assert (res == true_res).all()

    t = torch.tensor(idist.get_rank(), device=device)
    res = idist.all_gather(t)
    true_res = torch.tensor([i for i in range(idist.get_world_size())], device=device)
    assert (res == true_res).all()

    x = "test-test"
    if idist.get_rank() == 0:
        x = "abc"
    res = idist.all_gather(x)
    true_res = ["abc"] + ["test-test"] * (idist.get_world_size() - 1)
    assert res == true_res

    base_x = "x" * 1026
    x = base_x
    if idist.get_rank() == 0:
        x = "abc"

    if idist.get_rank() > 0:
        with pytest.warns(UserWarning, match=r"is larger than 1024 and thus will be truncated"):
            res = idist.all_gather(x)
    else:
        res = idist.all_gather(x)
    true_res = ["abc"] + [base_x[:1024]] * (idist.get_world_size() - 1)
    assert res == true_res

    t = torch.arange(100, device=device).reshape(4, 25) * (idist.get_rank() + 1)
    in_dtype = t.dtype
    res = idist.all_gather(t)
    assert res.shape == (idist.get_world_size() * 4, 25)
    assert res.dtype == in_dtype
    true_res = torch.zeros(idist.get_world_size() * 4, 25, device=device)
    for i in range(idist.get_world_size()):
        true_res[i * 4 : (i + 1) * 4, ...] = torch.arange(100, device=device).reshape(4, 25) * (i + 1)
    assert (res == true_res).all()

    if idist.get_world_size() > 1:
        with pytest.raises(TypeError, match=r"Unhandled input type"):
            idist.all_reduce([0, 1, 2])
def _test_distrib_all_gather(device):
    res = torch.tensor(idist.all_gather(10), device=device)
    true_res = torch.tensor([10] * idist.get_world_size(), device=device)
    assert (res == true_res).all()

    t = torch.tensor(idist.get_rank(), device=device)
    res = idist.all_gather(t)
    true_res = torch.tensor([i for i in range(idist.get_world_size())], device=device)
    assert (res == true_res).all()

    x = "test-test"
    if idist.get_rank() == 0:
        x = "abc"
    res = idist.all_gather(x)
    true_res = ["abc"] + ["test-test"] * (idist.get_world_size() - 1)
    assert res == true_res

    base_x = "tests/ignite/distributed/utils/test_native.py" * 2000
    x = base_x
    if idist.get_rank() == 0:
        x = "abc"
    res = idist.all_gather(x)
    true_res = ["abc"] + [base_x] * (idist.get_world_size() - 1)
    assert res == true_res

    t = torch.arange(100, device=device).reshape(4, 25) * (idist.get_rank() + 1)
    in_dtype = t.dtype
    res = idist.all_gather(t)
    assert res.shape == (idist.get_world_size() * 4, 25)
    assert res.dtype == in_dtype
    true_res = torch.zeros(idist.get_world_size() * 4, 25, device=device)
    for i in range(idist.get_world_size()):
        true_res[i * 4 : (i + 1) * 4, ...] = torch.arange(100, device=device).reshape(4, 25) * (i + 1)
    assert (res == true_res).all()

    if idist.get_world_size() > 1:
        with pytest.raises(TypeError, match=r"Unhandled input type"):
            idist.all_reduce([0, 1, 2])
def _test_distrib_log_lr_and_loss(device):
    from ignite.handlers import ParamScheduler

    lr_finder = FastaiLRFinder()
    _lr_schedule = MagicMock(spec=ParamScheduler)

    # minimal setup for lr_finder to make _log_lr_and_loss work
    rank = idist.get_rank()
    loss = 0.01 * (rank + 1)

    engine = Engine(lambda e, b: None)
    engine.state.output = loss
    engine.state.iteration = 1
    lr_finder._lr_schedule = _lr_schedule
    lr_finder._history["loss"] = []
    lr_finder._history["lr"] = []
    lr_finder._log_lr_and_loss(engine, output_transform=lambda x: x, smooth_f=0.1, diverge_th=10.0)

    expected_loss = idist.all_reduce(loss)
    assert pytest.approx(lr_finder._history["loss"][-1]) == expected_loss
def another_wrapper(self: Metric, *args: Any, **kwargs: Any) -> Callable:
    if not isinstance(self, Metric):
        raise RuntimeError(
            "Decorator sync_all_reduce should be used on ignite.metric.Metric class methods only"
        )
    ws = idist.get_world_size()
    if len(attrs) > 0 and not self._is_reduced:
        if ws > 1:
            for attr in attrs:
                op_kwargs = {}
                if ":" in attr:
                    attr, op = attr.split(":")
                    valid_ops = ["MIN", "MAX", "SUM", "PRODUCT"]
                    if op not in valid_ops:
                        raise ValueError(f"Reduction operation is not valid (expected : {valid_ops}, got: {op})")
                    op_kwargs["op"] = op
                t = getattr(self, attr, None)
                if t is not None:
                    t = idist.all_reduce(t, **op_kwargs)
                    self._is_reduced = True
                    setattr(self, attr, t)
        else:
            self._is_reduced = True

    return func(self, *args, **kwargs)
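# The wrapper above lets each attribute name carry an optional reduction op as an
# "attr:OP" suffix (one of MIN, MAX, SUM, PRODUCT; a plain name uses the default SUM).
# Below is a minimal, hedged sketch of a custom metric relying on that syntax; the
# metric name MaxAbsOutput and its attributes are made up for illustration, while
# Metric, reinit__is_reduced and sync_all_reduce are the ignite APIs used above.
import torch

from ignite.exceptions import NotComputableError
from ignite.metrics import Metric
from ignite.metrics.metric import reinit__is_reduced, sync_all_reduce


class MaxAbsOutput(Metric):
    """Hypothetical metric: largest absolute prediction seen across all processes."""

    @reinit__is_reduced
    def reset(self) -> None:
        self._max_abs = torch.tensor(0.0, device=self._device)
        self._num_examples = 0

    @reinit__is_reduced
    def update(self, output) -> None:
        y_pred, _ = output
        self._max_abs = torch.max(self._max_abs, y_pred.detach().abs().max().to(self._device))
        self._num_examples += y_pred.shape[0]

    # "_max_abs:MAX" requests a MAX all-reduce, "_num_examples" keeps the default SUM.
    @sync_all_reduce("_max_abs:MAX", "_num_examples")
    def compute(self) -> float:
        if self._num_examples == 0:
            raise NotComputableError("MaxAbsOutput must have at least one example before it can be computed.")
        return self._max_abs.item()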
def _dist_geom_mean(y_true):
    log_y_true = torch.log(y_true)
    log_y_true = idist.all_reduce(log_y_true)
    if len(log_y_true.shape) > 2:
        log_y_true = log_y_true.reshape(-1, log_y_true.shape[-1])
    np_t = log_y_true.cpu().numpy()
    return np.exp(np.mean(np_t, axis=0) / idist.get_world_size())
def compute(self):
    is_scalar = not isinstance(self._positives, torch.Tensor) or self._positives.ndim == 0
    if is_scalar and self._positives == 0:
        return -1

    if not (self._type == "multilabel" and not self._average):
        if not self._is_reduced:
            self._true_positives = idist.all_reduce(self._true_positives)  # type: ignore[assignment]
            self._positives = idist.all_reduce(self._positives)  # type: ignore[assignment]
            self._is_reduced = True  # type: bool

    result = self._true_positives / (self._positives + self.eps)

    if self._average:
        return cast(torch.Tensor, result).mean().item()
    else:
        return result
def compute(self) -> Union[torch.Tensor, float]:
    if not (isinstance(self._positives, torch.Tensor) or self._positives > 0):
        raise NotComputableError(
            "{} must have at least one example before it can be computed.".format(self.__class__.__name__)
        )

    if not (self._type == "multilabel" and not self._average):
        if not self._is_reduced:
            self._true_positives = idist.all_reduce(self._true_positives)
            self._positives = idist.all_reduce(self._positives)
            self._is_reduced = True

    result = self._true_positives / (self._positives + self.eps)

    if self._average:
        return result.mean().item()
    else:
        return result
def _test_distrib_barrier(device):
    t = torch.tensor([idist.get_rank()], device=device, dtype=torch.float)
    true_res = sum([i for i in range(idist.get_world_size())])

    if idist.get_rank() == 0:
        t += 10.0
    idist.barrier()

    tt = idist.all_reduce(t)
    assert tt.item() == true_res + 10.0
def training(local_rank, config, **kwargs):
    import time

    time.sleep(idist.get_rank() * 0.1)

    print(idist.get_rank(), ": run with config:", config, "- kwargs:", kwargs, f"- backend={idist.backend()}")

    t = torch.tensor([idist.get_rank()], device=idist.device())
    t = idist.all_reduce(t)
    t = t.item()
    ws = idist.get_world_size()
    assert t == ws * (ws - 1) / 2, f"{t} vs {ws}"
    assert local_rank == idist.get_local_rank()
def another_wrapper(self: Metric, *args, **kwargs) -> Callable:
    if not isinstance(self, Metric):
        raise RuntimeError(
            "Decorator sync_all_reduce should be used on ignite.metric.Metric class methods only"
        )

    if len(attrs) > 0 and not self._is_reduced:
        for attr in attrs:
            t = getattr(self, attr, None)
            if t is not None and idist.get_world_size() > 1:
                t = idist.all_reduce(t)
                self._is_reduced = True
                setattr(self, attr, t)

    return func(self, *args, **kwargs)
def _log_lr_and_loss(self, trainer: Engine, output_transform: Callable, smooth_f: float, diverge_th: float) -> None:
    output = trainer.state.output
    loss = output_transform(output)
    loss = idist.all_reduce(loss)
    lr = self._lr_schedule.get_param()  # type: ignore[union-attr]
    self._history["lr"].append(lr)
    if trainer.state.iteration == 1:
        self._best_loss = loss
    else:
        if smooth_f > 0:
            loss = smooth_f * loss + (1 - smooth_f) * self._history["loss"][-1]
        if loss < self._best_loss:
            self._best_loss = loss
    self._history["loss"].append(loss)

    # Check if the loss has diverged; if it has, stop the trainer
    if self._history["loss"][-1] > diverge_th * self._best_loss:  # type: ignore[operator]
        self._diverge_flag = True
        self.logger.info("Stopping early, the loss has diverged")
        trainer.terminate()
def _test_distrib_all_reduce(device):
    res = idist.all_reduce(10)
    assert res == 10 * idist.get_world_size()

    t = torch.tensor(10, device=device)
    res = idist.all_reduce(t)
    assert res.item() == 10 * idist.get_world_size()

    t = torch.tensor(idist.get_rank(), device=device)
    res = idist.all_reduce(t)
    assert res.item() == sum([i for i in range(idist.get_world_size())])

    if idist.get_world_size() > 1:
        with pytest.raises(TypeError, match=r"Unhandled input type"):
            idist.all_reduce("abc")

        with pytest.raises(ValueError, match=r"Unsupported reduction operation"):
            idist.all_reduce(10, op="ABC")
def training(local_rank, config, **kwargs):
    import time

    time.sleep(idist.get_rank() * 0.1)

    print(idist.get_rank(), ": run with config:", config, "- kwargs:", kwargs, f"- backend={idist.backend()}")

    t = torch.tensor([idist.get_rank()], device=idist.device())
    t = idist.all_reduce(t)
    t = t.item()
    ws = idist.get_world_size()
    assert t == ws * (ws - 1) / 2, f"{t} vs {ws}"
    assert local_rank == idist.get_local_rank()

    # Test init method:
    if idist.model_name() == "native-dist":
        from ignite.distributed.utils import _model

        true_init_method = config.get("true_init_method", None)
        assert true_init_method is not None, true_init_method
        assert _model._init_method == true_init_method
def _log_lr_and_loss(self, trainer: Engine, output_transform: Callable, smooth_f: float, diverge_th: float) -> None:
    output = trainer.state.output
    loss = output_transform(output)

    if not isinstance(loss, float):
        if isinstance(loss, torch.Tensor):
            if (loss.ndimension() == 0) or (loss.ndimension() == 1 and len(loss) == 1):
                loss = loss.item()
            else:
                raise ValueError(
                    "if output of the engine is torch.Tensor, then "
                    "it must be 0d torch.Tensor or 1d torch.Tensor with 1 element, "
                    f"but got torch.Tensor of shape {loss.shape}"
                )
        else:
            raise TypeError(
                "output of the engine should be of type float or 0d torch.Tensor "
                "or 1d torch.Tensor with 1 element, "
                f"but got output of type {type(loss).__name__}"
            )

    loss = idist.all_reduce(loss)
    lr = self._lr_schedule.get_param()  # type: ignore[union-attr]
    self._history["lr"].append(lr)

    if trainer.state.iteration == 1:
        self._best_loss = loss
    else:
        if smooth_f > 0:
            loss = smooth_f * loss + (1 - smooth_f) * self._history["loss"][-1]
        if loss < self._best_loss:
            self._best_loss = loss
    self._history["loss"].append(loss)

    # Check if the loss has diverged; if it has, stop the trainer
    if self._history["loss"][-1] > diverge_th * self._best_loss:  # type: ignore[operator]
        self._diverge_flag = True
        self.logger.info("Stopping early, the loss has diverged")
        trainer.terminate()
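# _log_lr_and_loss above is invoked by the finder on every iteration while it is
# attached to a trainer. For context, a hedged end-to-end sketch of the public
# usage follows; the model, optimizer and data are throwaway placeholders, not
# taken from the code above.
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

from ignite.engine import create_supervised_trainer
from ignite.handlers import FastaiLRFinder

model = nn.Linear(10, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
criterion = nn.MSELoss()
data = DataLoader(TensorDataset(torch.randn(64, 10), torch.randn(64, 1)), batch_size=8)

trainer = create_supervised_trainer(model, optimizer, criterion)
lr_finder = FastaiLRFinder()
to_save = {"model": model, "optimizer": optimizer}

# While attached, the finder drives the LR schedule and records (lr, loss) pairs;
# model and optimizer states are restored when the context exits.
with lr_finder.attach(trainer, to_save=to_save) as trainer_with_lr_finder:
    trainer_with_lr_finder.run(data)

print(lr_finder.lr_suggestion())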
def _test_distrib_all_reduce(device):
    res = idist.all_reduce(10)
    assert res == 10 * idist.get_world_size()

    t = torch.tensor(10, device=device)
    res = idist.all_reduce(t)
    assert res.item() == 10 * idist.get_world_size()

    rank = idist.get_rank()
    t = torch.tensor(rank * 2.0 + 1.0, device=device)
    res = idist.all_reduce(t)
    assert res.item() == sum([i * 2.0 + 1.0 for i in range(idist.get_world_size())])

    t = torch.tensor(rank * 2.0 + 1.0, device=device)
    res = idist.all_reduce(t, "MIN").item()
    true_val = min([i * 2 + 1 for i in range(idist.get_world_size())])
    assert res == true_val, f"{res} vs {true_val}"

    t = torch.tensor(rank * 2.0 + 1.0, device=device)
    res = idist.all_reduce(t, "MAX").item()
    true_val = max([i * 2.0 + 1.0 for i in range(idist.get_world_size())])
    assert res == true_val, f"{res} vs {true_val}"

    t = torch.tensor(rank * 2.0 + 1.0, device=device)
    res = idist.all_reduce(t, "PRODUCT").item()
    true_val = 1
    for v in [i * 2.0 + 1.0 for i in range(idist.get_world_size())]:
        true_val *= v
    assert res == true_val, f"{res} vs {true_val}"

    if idist.get_world_size() > 1:
        with pytest.raises(TypeError, match=r"Unhandled input type"):
            idist.all_reduce("abc")

        with pytest.raises(ValueError, match=r"Unsupported reduction operation"):
            idist.all_reduce(10, op="ABC")

    t = torch.tensor([0, 1, 2])
    res = idist.all_reduce(t)
    assert res.device == t.device, f"{res.device} vs {t.device}"
def _dist_mean(y_true):
    y_true = idist.all_reduce(y_true) / idist.get_world_size()
    if len(y_true.shape) > 2:
        y_true = y_true.reshape(-1, y_true.shape[-1])
    return y_true.mean(dim=0).cpu().numpy()
def score_function(engine):
    i = trainer.state.epoch - 1
    v = scores[i]
    idist.all_reduce(v)
    v /= idist.get_world_size()
    return v.item()
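# The score function above closes over a `trainer` and a per-epoch `scores` list
# defined in the surrounding test; the all-reduce makes every rank return the same
# value, so all processes agree on which checkpoint is best. A hedged sketch of how
# such a function is typically wired up (the scores, dirname and dummy model below
# are illustrative, not taken from the original test):
import torch

import ignite.distributed as idist
from ignite.engine import Engine, Events
from ignite.handlers import Checkpoint, DiskSaver

trainer = Engine(lambda engine, batch: None)
scores = [torch.tensor([0.1]), torch.tensor([0.3]), torch.tensor([0.2])]


def score_function(engine):
    i = trainer.state.epoch - 1
    v = scores[i]
    idist.all_reduce(v)
    v /= idist.get_world_size()
    return v.item()


model = torch.nn.Linear(4, 2)
handler = Checkpoint(
    {"model": model},
    DiskSaver("/tmp/checkpoints", require_empty=False),
    n_saved=2,
    score_function=score_function,  # higher score -> checkpoint is kept
)
trainer.add_event_handler(Events.EPOCH_COMPLETED, handler)
trainer.run([0], max_epochs=3)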
def test_idist_all_reduce_no_dist():
    assert idist.all_reduce(10) == 10
def _mean(y_true):
    y_true = idist.all_reduce(y_true)
    return y_true.mean(dim=0).cpu().numpy() / idist.get_world_size()
def _geom_mean(y_true):
    log_y_true = torch.log(y_true)
    log_y_true = idist.all_reduce(log_y_true)
    np_t = log_y_true.cpu().numpy()
    return np.exp(np.mean(np_t, axis=0) / idist.get_world_size())