def _test_distrib_compute(device):
    rank = idist.get_rank()
    torch.manual_seed(12)

    def _test(metric_device):
        metric_device = torch.device(metric_device)
        m = GeometricMeanRelativeAbsoluteError(device=metric_device)

        torch.manual_seed(10 + rank)
        y_pred = torch.rand(size=(100,), device=device)
        y = torch.rand(size=(100,), device=device)

        m.update((y_pred, y))

        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        np_y = y.cpu().numpy()
        np_y_pred = y_pred.cpu().numpy()

        np_gmrae = np.exp(np.log(np.abs(np_y - np_y_pred) / np.abs(np_y - np_y.mean())).mean())
        assert m.compute() == pytest.approx(np_gmrae, rel=1e-4)

    for _ in range(3):
        _test("cpu")
        if device.type != "xla":
            _test(idist.device())
def __init__(self, logger: TrainsLogger = None, output_uri: str = None, dirname: str = None, *args, **kwargs):
    self._setup_check_trains(logger, output_uri)

    if not dirname:
        dirname = ""
        if idist.get_rank() == 0:
            dirname = tempfile.mkdtemp(
                prefix="ignite_checkpoints_{}".format(datetime.now().strftime("%Y_%m_%d_%H_%M_%S_"))
            )
        if idist.get_world_size() > 1:
            dirname = idist.all_gather(dirname)[0]

        warnings.warn("TrainsSaver created a temporary checkpoints directory: {}".format(dirname))
        idist.barrier()

    # Let's set non-atomic tmp dir saving behaviour
    if "atomic" not in kwargs:
        kwargs["atomic"] = False

    super(TrainsSaver, self).__init__(dirname=dirname, *args, **kwargs)
def _test_distrib_compute(device):
    rank = idist.get_rank()
    manhattan = DistanceMetric.get_metric("manhattan")

    def _test(metric_device):
        metric_device = torch.device(metric_device)
        m = ManhattanDistance(device=metric_device)

        torch.manual_seed(10 + rank)
        y_pred = torch.randint(0, 10, size=(10,), device=device).float()
        y = torch.randint(0, 10, size=(10,), device=device).float()

        m.update((y_pred, y))

        # gather y_pred, y
        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        np_y_pred = y_pred.cpu().numpy()
        np_y = y.cpu().numpy()
        res = m.compute()

        assert manhattan.pairwise([np_y_pred, np_y])[0][1] == pytest.approx(res)

    for _ in range(3):
        _test("cpu")
        if device.type != "xla":
            _test(idist.device())
def test_no_distrib(capsys):
    assert idist.backend() is None
    if torch.cuda.is_available():
        assert idist.device().type == "cuda"
    else:
        assert idist.device().type == "cpu"
    assert idist.get_rank() == 0
    assert idist.get_world_size() == 1
    assert idist.get_local_rank() == 0
    assert idist.model_name() == "serial"

    from ignite.distributed.utils import _model, _SerialModel

    _sanity_check()
    assert isinstance(_model, _SerialModel)

    idist.show_config()
    captured = capsys.readouterr()
    out = captured.err.split("\r")
    out = list(map(lambda x: x.strip(), out))
    out = list(filter(None, out))
    assert "ignite.distributed.utils INFO: distributed configuration: serial" in out[-1]
    assert "ignite.distributed.utils INFO: backend: None" in out[-1]
    if torch.cuda.is_available():
        assert "ignite.distributed.utils INFO: device: cuda" in out[-1]
    else:
        assert "ignite.distributed.utils INFO: device: cpu" in out[-1]
    assert "ignite.distributed.utils INFO: rank: 0" in out[-1]
    assert "ignite.distributed.utils INFO: local rank: 0" in out[-1]
    assert "ignite.distributed.utils INFO: world size: 1" in out[-1]
def __init__(
    self,
    logger: Optional[ClearMLLogger] = None,
    output_uri: Optional[str] = None,
    dirname: Optional[str] = None,
    *args: Any,
    **kwargs: Any,
) -> None:
    self._setup_check_clearml(logger, output_uri)

    if not dirname:
        dirname = ""
        if idist.get_rank() == 0:
            dirname = tempfile.mkdtemp(prefix=f"ignite_checkpoints_{datetime.now().strftime('%Y_%m_%d_%H_%M_%S_')}")
        if idist.get_world_size() > 1:
            dirname = idist.all_gather(dirname)[0]  # type: ignore[index, assignment]

        warnings.warn(f"ClearMLSaver created a temporary checkpoints directory: {dirname}")
        idist.barrier()

    # Let's set non-atomic tmp dir saving behaviour
    if "atomic" not in kwargs:
        kwargs["atomic"] = False

    self._checkpoint_slots = defaultdict(list)  # type: DefaultDict[Union[str, Tuple[str, str]], List[Any]]

    super(ClearMLSaver, self).__init__(dirname=dirname, *args, **kwargs)  # type: ignore[misc]
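# Hedged usage sketch (added for illustration, not part of the original source):
# a ClearMLSaver is typically passed to ignite's Checkpoint handler as its
# ``save_handler``; the ``trainer``, ``model`` and ``optimizer`` objects below
# are assumed to be provided by the caller.
def _example_attach_clearml_saver(trainer, model, optimizer):
    from ignite.engine import Events
    from ignite.handlers import Checkpoint

    to_save = {"model": model, "optimizer": optimizer}
    checkpoint = Checkpoint(to_save, save_handler=ClearMLSaver(), n_saved=2)
    # save a checkpoint at the end of every epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint)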
def _test_distrib_config(local_rank, backend, ws, true_device, rank=None):
    assert idist.backend() == backend, f"{idist.backend()} vs {backend}"

    this_device = idist.device()
    assert isinstance(this_device, torch.device)
    if backend in ("nccl", "horovod") and "cuda" in this_device.type:
        true_device = torch.device(f"{true_device}:{local_rank}")
        assert this_device == true_device, f"{this_device} vs {true_device}"
    elif backend in ("gloo", "horovod"):
        assert this_device == torch.device(true_device)
    elif backend == "xla-tpu":
        assert true_device in this_device.type

    if rank is None:
        if idist.model_name() == "native-dist":
            rank = dist.get_rank()

    if rank is not None:
        assert idist.get_rank() == rank

    assert idist.get_world_size() == ws
    assert idist.get_local_rank() == local_rank

    assert idist.model_name() in ("native-dist", "xla-dist", "horovod-dist")

    _sanity_check()
def run_evaluation(config_filepath, backend="nccl", with_clearml=True):
    """Main entry to run model's evaluation:
        - compute validation metrics

    Args:
        config_filepath (str): evaluation configuration .py file
        backend (str): distributed backend: nccl, gloo, horovod or None to run without distributed config
        with_clearml (bool): if True, uses ClearML as experiment tracking system
    """
    assert torch.cuda.is_available(), torch.cuda.is_available()
    assert torch.backends.cudnn.enabled
    torch.backends.cudnn.benchmark = True

    config_filepath = Path(config_filepath)
    assert config_filepath.exists(), f"File '{config_filepath.as_posix()}' is not found"

    with idist.Parallel(backend=backend) as parallel:
        logger = setup_logger(name="Pascal-VOC12 Evaluation", distributed_rank=idist.get_rank())

        config = ConfigObject(config_filepath)
        InferenceConfigSchema.validate(config)
        config.script_filepath = Path(__file__)

        output_path = setup_experiment_tracking(config, with_clearml=with_clearml, task_type="testing")
        config.output_path = output_path

        utils.log_basic_info(logger, get_params(config, InferenceConfigSchema))

        try:
            parallel.run(evaluation, config, logger=logger, with_clearml=with_clearml)
        except KeyboardInterrupt:
            logger.info("Caught KeyboardInterrupt -> exit")
        except Exception as e:  # noqa
            logger.exception("")
            raise e
def _test_idist_methods_overhead(ok_factor, sync_model):
    import time

    import horovod.torch as hvd

    if sync_model:
        idist.sync()
        from ignite.distributed.utils import _model
        from ignite.distributed.comp_models.horovod import _HorovodDistModel

        assert isinstance(_model, _HorovodDistModel)

    n = 100000
    m = 5

    t2 = 0.0
    t1 = 0.0
    for _ in range(m):
        start = time.time()
        for _ in range(n):
            _ = hvd.size()
            _ = hvd.rank()
        elapsed = time.time() - start
        t2 += elapsed / n / m

        start = time.time()
        for _ in range(n):
            _ = idist.get_world_size()
            _ = idist.get_rank()
        elapsed = time.time() - start
        t1 += elapsed / n / m

    overhead_factor = t1 / t2
    assert overhead_factor < ok_factor, "{} vs {} | {} vs {}".format(overhead_factor, ok_factor, t2, t1)
def _test_distrib_compute(device):
    rank = idist.get_rank()

    def _test(metric_device):
        metric_device = torch.device(metric_device)
        m = MaximumAbsoluteError(device=metric_device)

        torch.manual_seed(10 + rank)
        y_pred = torch.randint(0, 10, size=(10,), device=device).float()
        y = torch.randint(0, 10, size=(10,), device=device).float()

        m.update((y_pred, y))

        # gather y_pred, y
        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        np_y_pred = y_pred.cpu().numpy()
        np_y = y.cpu().numpy()

        res = m.compute()

        np_max = np.max(np.abs(np_y_pred - np_y))

        assert np_max == pytest.approx(res)

    for _ in range(3):
        _test("cpu")
        if device.type != "xla":
            _test(idist.device())
def create_trainer(model, optimizer, criterion, train_sampler, config, logger):
    prepare_batch = config.prepare_batch
    device = config.device

    # Setup trainer
    accumulation_steps = getattr(config, "accumulation_steps", 1)
    model_output_transform = getattr(config, "model_output_transform", lambda x: x)

    def train_update_function(engine, batch):
        model.train()

        x, y = prepare_batch(batch, device=device, non_blocking=True)
        y_pred = model(x)
        y_pred = model_output_transform(y_pred)
        loss = criterion(y_pred, y)

        if isinstance(loss, Mapping):
            assert "supervised batch loss" in loss
            loss_dict = loss
            output = {k: v.item() for k, v in loss_dict.items()}
            loss = loss_dict["supervised batch loss"] / accumulation_steps
        else:
            output = {"supervised batch loss": loss.item()}

        with amp.scale_loss(loss, optimizer, loss_id=0) as scaled_loss:
            scaled_loss.backward()

        if engine.state.iteration % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        return output

    output_names = getattr(config, "output_names", ["supervised batch loss"])
    lr_scheduler = config.lr_scheduler

    trainer = Engine(train_update_function)
    trainer.logger = logger

    to_save = {"model": model, "optimizer": optimizer, "lr_scheduler": lr_scheduler, "trainer": trainer, "amp": amp}

    save_every_iters = getattr(config, "save_every_iters", 1000)

    common.setup_common_training_handlers(
        trainer,
        train_sampler,
        to_save=to_save,
        save_every_iters=save_every_iters,
        save_handler=get_save_handler(config),
        lr_scheduler=lr_scheduler,
        with_gpu_stats=exp_tracking.has_mlflow,
        output_names=output_names,
        with_pbars=False,
    )

    if idist.get_rank() == 0:
        common.ProgressBar(persist=False).attach(trainer, metric_names="all")

    return trainer
def _test_distrib_metrics_on_diff_devices(device):
    n_classes = 10
    n_iters = 12
    s = 16
    offset = n_iters * s
    rank = idist.get_rank()

    y_true = torch.randint(0, n_classes, size=(offset * idist.get_world_size(),)).to(device)
    y_preds = torch.rand(offset * idist.get_world_size(), n_classes).to(device)

    def update(engine, i):
        return (
            y_preds[i * s + rank * offset : (i + 1) * s + rank * offset],
            y_true[i * s + rank * offset : (i + 1) * s + rank * offset],
        )

    precision = Precision(average=False, device="cpu")
    recall = Recall(average=False, device=device)
    custom_metric = precision * recall

    engine = Engine(update)
    custom_metric.attach(engine, "custom_metric")

    data = list(range(n_iters))
    engine.run(data, max_epochs=2)
def _test_distrib_binary_input_N(device):
    rank = idist.get_rank()
    torch.manual_seed(12)

    def _test(y_pred, y, batch_size, metric_device):
        metric_device = torch.device(metric_device)
        roc_auc = ROC_AUC(device=metric_device)

        torch.manual_seed(10 + rank)

        roc_auc.reset()
        if batch_size > 1:
            # update in batches covering the whole input (avoids double-counting
            # the data via an extra full update before the batched loop)
            n_iters = y.shape[0] // batch_size + 1
            for i in range(n_iters):
                idx = i * batch_size
                roc_auc.update((y_pred[idx : idx + batch_size], y[idx : idx + batch_size]))
        else:
            roc_auc.update((y_pred, y))

        # gather y_pred, y
        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        np_y = y.cpu().numpy()
        np_y_pred = y_pred.cpu().numpy()

        res = roc_auc.compute()
        assert isinstance(res, float)
        assert roc_auc_score(np_y, np_y_pred) == pytest.approx(res)

    def get_test_cases():
        test_cases = [
            (torch.randint(0, 2, size=(10,)).long(), torch.randint(0, 2, size=(10,)).long(), 1),
            (torch.randint(0, 2, size=(100,)).long(), torch.randint(0, 2, size=(100,)).long(), 1),
            (torch.randint(0, 2, size=(10, 1)).long(), torch.randint(0, 2, size=(10, 1)).long(), 1),
            (torch.randint(0, 2, size=(100, 1)).long(), torch.randint(0, 2, size=(100, 1)).long(), 1),
            # updated batches
            (torch.randint(0, 2, size=(10,)).long(), torch.randint(0, 2, size=(10,)).long(), 16),
            (torch.randint(0, 2, size=(100,)).long(), torch.randint(0, 2, size=(100,)).long(), 16),
            (torch.randint(0, 2, size=(10, 1)).long(), torch.randint(0, 2, size=(10, 1)).long(), 16),
            (torch.randint(0, 2, size=(100, 1)).long(), torch.randint(0, 2, size=(100, 1)).long(), 16),
        ]
        return test_cases

    for _ in range(3):
        test_cases = get_test_cases()
        for y_pred, y, batch_size in test_cases:
            _test(y_pred, y, batch_size, "cpu")
            if device.type != "xla":
                _test(y_pred, y, batch_size, idist.device())
def _test_distrib_integration(device):
    from ignite.engine import Engine

    rank = idist.get_rank()
    torch.manual_seed(12)

    def _test(metric_device):
        n_iters = 60
        s = 16
        offset = n_iters * s
        n_probabilities = 10

        y = torch.rand(offset * idist.get_world_size(), n_probabilities)

        def update(_, i):
            return y[i * s + rank * offset : (i + 1) * s + rank * offset, :]

        engine = Engine(update)
        m = InceptionScore(num_features=n_probabilities, feature_extractor=torch.nn.Identity(), device=metric_device)
        m.attach(engine, "InceptionScore")

        engine.run(data=list(range(n_iters)), max_epochs=1)

        assert "InceptionScore" in engine.state.metrics
        assert pytest.approx(calculate_inception_score(y)) == m.compute()

    metric_devices = [torch.device("cpu")]
    if device.type != "xla":
        metric_devices.append(idist.device())
    for metric_device in metric_devices:
        _test(metric_device=metric_device)
def _test_distrib_compute(device):
    rank = idist.get_rank()

    def _test(metric_device):
        metric_device = torch.device(metric_device)
        m = MeanNormalizedBias(device=metric_device)

        torch.manual_seed(10 + rank)
        y_pred = torch.randint(1, 11, size=(10,), device=device).float()
        y = torch.randint(1, 11, size=(10,), device=device).float()

        m.update((y_pred, y))

        # gather y_pred, y
        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        np_y_pred = y_pred.cpu().numpy()
        np_y = y.cpu().numpy()

        res = m.compute()

        np_sum = ((np_y - np_y_pred) / np_y).sum()
        np_len = len(np_y_pred)
        np_ans = np_sum / np_len

        assert np_ans == pytest.approx(res)

    for _ in range(3):
        _test("cpu")
        if device.type != "xla":
            _test(idist.device())
def create_evaluator(model, metrics, config, tag="val"): with_amp = config["with_amp"] device = idist.device() @torch.no_grad() def evaluate_step(engine, batch): model.eval() input_batch = batch[0] labels = batch[1].view(-1, 1) if labels.device != device: input_batch = { k: v.to(device, non_blocking=True, dtype=torch.long) for k, v in batch[0].items() } labels = labels.to(device, non_blocking=True, dtype=torch.float) with autocast(enabled=with_amp): output = model(input_batch) return output, labels evaluator = Engine(evaluate_step) for name, metric in metrics.items(): metric.attach(evaluator, name) if idist.get_rank() == 0 and (not config["with_clearml"]): common.ProgressBar(desc=f"Evaluation ({tag})", persist=False).attach(evaluator) return evaluator
def _test_idist_methods_overhead(ok_factor):
    import time

    n = 100000
    m = 5

    t2 = 0.0
    t1 = 0.0
    for _ in range(m):
        start = time.time()
        for _ in range(n):
            _ = dist.get_world_size()
            _ = dist.get_rank()
        elapsed = time.time() - start
        t2 += elapsed / n / m

        start = time.time()
        for _ in range(n):
            _ = idist.get_world_size()
            _ = idist.get_rank()
        elapsed = time.time() - start
        t1 += elapsed / n / m

    overhead_factor = t1 / t2
    assert overhead_factor < ok_factor, "{} vs {} | {} vs {}".format(overhead_factor, ok_factor, t2, t1)
def _test_distrib_compute(device):
    rank = idist.get_rank()

    def _test(metric_device):
        metric_device = torch.device(metric_device)
        ck_metric = CohenKappa(device=metric_device)

        torch.manual_seed(10 + rank)
        y_pred = torch.randint(0, 2, size=(100, 1), device=device)
        y = torch.randint(0, 2, size=(100, 1), device=device)

        ck_metric.update((y_pred, y))

        # gather y_pred, y
        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        np_y_pred = y_pred.cpu().numpy()
        np_y = y.cpu().numpy()

        np_ck = cohen_kappa_score(np_y, np_y_pred)

        res = ck_metric.compute()
        assert res == pytest.approx(np_ck)

    for _ in range(3):
        _test("cpu")
        if device.type != "xla":
            _test(idist.device())
def __init__(self, num_iters=100, prepare_batch=None):
    from ignite.handlers import Timer

    device = idist.device()

    def upload_to_gpu(engine, batch):
        if prepare_batch is not None:
            x, y = prepare_batch(batch, device=device, non_blocking=False)

    self.num_iters = num_iters
    self.benchmark_dataflow = Engine(upload_to_gpu)

    @self.benchmark_dataflow.on(Events.ITERATION_COMPLETED(once=num_iters))
    def stop_benchmark_dataflow(engine):
        engine.terminate()

    if idist.get_rank() == 0:

        @self.benchmark_dataflow.on(Events.ITERATION_COMPLETED(every=num_iters // 100))
        def show_progress_benchmark_dataflow(engine):
            print(".", end=" ")

    self.timer = Timer(average=False)
    self.timer.attach(
        self.benchmark_dataflow,
        start=Events.EPOCH_STARTED,
        resume=Events.ITERATION_STARTED,
        pause=Events.ITERATION_COMPLETED,
        step=Events.ITERATION_COMPLETED,
    )
def setup_experiment_tracking(config, with_clearml, task_type="training"):
    from datetime import datetime

    assert task_type in ("training", "testing"), task_type

    output_path = ""
    if idist.get_rank() == 0:
        if with_clearml:
            from clearml import Task

            schema = TrainvalConfigSchema if task_type == "training" else InferenceConfigSchema

            task = Task.init("Pascal-VOC12 Training", config.config_filepath.stem, task_type=task_type)
            task.connect_configuration(config.config_filepath.as_posix())

            task.upload_artifact(config.script_filepath.name, config.script_filepath.as_posix())
            task.upload_artifact(config.config_filepath.name, config.config_filepath.as_posix())
            task.connect(get_params(config, schema))

            output_path = Path(os.environ.get("CLEARML_OUTPUT_PATH", "/tmp"))
            output_path = output_path / "clearml" / datetime.now().strftime("%Y%m%d-%H%M%S")
        else:
            import shutil

            output_path = Path(os.environ.get("OUTPUT_PATH", "/tmp/output-pascal-voc12"))
            output_path = output_path / task_type / config.config_filepath.stem
            output_path = output_path / datetime.now().strftime("%Y%m%d-%H%M%S")
            output_path.mkdir(parents=True, exist_ok=True)

            shutil.copyfile(config.script_filepath.as_posix(), output_path / config.script_filepath.name)
            shutil.copyfile(config.config_filepath.as_posix(), output_path / config.config_filepath.name)

        output_path = output_path.as_posix()

    return Path(idist.broadcast(output_path, src=0))
def _test_distrib_log_lr_and_loss(device):
    from ignite.handlers import ParamScheduler

    lr_finder = FastaiLRFinder()
    _lr_schedule = MagicMock(spec=ParamScheduler)

    # minimal setup for lr_finder to make _log_lr_and_loss work
    rank = idist.get_rank()
    loss = 0.01 * (rank + 1)

    engine = Engine(lambda e, b: None)

    engine.state.output = loss
    engine.state.iteration = 1
    lr_finder._lr_schedule = _lr_schedule
    lr_finder._history["loss"] = []
    lr_finder._history["lr"] = []

    lr_finder._log_lr_and_loss(engine, output_transform=lambda x: x, smooth_f=0.1, diverge_th=10.0)

    expected_loss = idist.all_reduce(loss)
    assert pytest.approx(lr_finder._history["loss"][-1]) == expected_loss
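# Hedged usage sketch (added for illustration, not part of the original source):
# outside of tests, FastaiLRFinder is attached to a trainer and run on a
# dataloader; the ``trainer``, ``model``, ``optimizer`` and ``dataloader``
# objects are assumed to exist in the caller. ``attach`` returns a context
# manager that restores the trainer's state on exit.
def _example_run_lr_finder(trainer, model, optimizer, dataloader):
    lr_finder = FastaiLRFinder()
    to_save = {"model": model, "optimizer": optimizer}
    with lr_finder.attach(trainer, to_save=to_save) as trainer_with_lr_finder:
        trainer_with_lr_finder.run(dataloader)
    # suggested learning rate derived from the recorded loss-vs-lr history
    return lr_finder.lr_suggestion()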
def compute(self) -> float:
    if len(self._predictions) < 1 or len(self._targets) < 1:
        raise NotComputableError("EpochMetric must have at least one example before it can be computed.")

    _prediction_tensor = torch.cat(self._predictions, dim=0)
    _target_tensor = torch.cat(self._targets, dim=0)

    ws = idist.get_world_size()

    if ws > 1 and not self._is_reduced:
        # All gather across all processes
        _prediction_tensor = cast(torch.Tensor, idist.all_gather(_prediction_tensor))
        _target_tensor = cast(torch.Tensor, idist.all_gather(_target_tensor))
    self._is_reduced = True

    result = 0.0
    if idist.get_rank() == 0:
        # Run compute_fn on zero rank only
        result = self.compute_fn(_prediction_tensor, _target_tensor)

    if ws > 1:
        # broadcast result to all processes
        result = cast(float, idist.broadcast(result, src=0))

    return result
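# Hedged usage sketch (added for illustration, not part of the original source):
# EpochMetric buffers predictions and targets over an epoch; ``compute`` above
# then gathers them across processes, runs ``compute_fn`` on rank 0 only, and
# broadcasts the result. A sklearn-based compute_fn, as below, is a typical use.
def _example_attach_epoch_metric(evaluator):
    import torch
    from ignite.metrics import EpochMetric
    from sklearn.metrics import roc_auc_score

    def sigmoid_auc(y_preds, y_targets):
        # compute_fn receives the concatenated epoch tensors on CPU
        return roc_auc_score(y_targets.numpy(), torch.sigmoid(y_preds).numpy())

    EpochMetric(sigmoid_auc).attach(evaluator, "roc_auc")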
def _test_distrib_compute(device, tol=1e-5):
    rank = idist.get_rank()

    def _test(metric_device):
        metric_device = torch.device(metric_device)
        m = FractionalBias(device=metric_device)

        torch.manual_seed(10 + rank)
        y_pred = torch.randint(0, 10, size=(10,), device=device).float()
        y = torch.randint(0, 10, size=(10,), device=device).float()

        m.update((y_pred, y))

        # gather y_pred, y
        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        np_y_pred = y_pred.cpu().numpy()
        np_y = y.cpu().numpy()

        res = m.compute()

        np_sum = (2 * (np_y - np_y_pred) / (np_y_pred + np_y + 1e-30)).sum()
        np_len = len(np_y_pred)
        np_ans = np_sum / np_len

        assert np_ans == pytest.approx(res, rel=tol)

    for _ in range(3):
        _test("cpu")
        if device.type != "xla":
            _test(idist.device())
def create_evaluators(model, metrics, config):
    model_output_transform = getattr(config, "model_output_transform", lambda x: x)

    evaluator_args = dict(
        model=model,
        metrics=metrics,
        device=config.device,
        non_blocking=True,
        prepare_batch=config.prepare_batch,
        output_transform=lambda x, y, y_pred: (model_output_transform(y_pred), y),
    )
    train_evaluator = create_supervised_evaluator(**evaluator_args)
    evaluator = create_supervised_evaluator(**evaluator_args)

    if idist.get_rank() == 0:
        common.ProgressBar(desc="Evaluation (train)", persist=False).attach(train_evaluator)
        common.ProgressBar(desc="Evaluation (val)", persist=False).attach(evaluator)

    return evaluator, train_evaluator
def create_trainer(model, optimizer, criterion, lr_scheduler, config):
    # Define any training logic for iteration update
    def train_step(engine, batch):
        x, y = batch[0].to(idist.device()), batch[1].to(idist.device())

        model.train()
        y_pred = model(x)
        loss = criterion(y_pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        return loss.item()

    # Define trainer engine
    trainer = Engine(train_step)

    if idist.get_rank() == 0:
        # Add any custom handlers
        @trainer.on(Events.ITERATION_COMPLETED(every=200))
        def save_checkpoint():
            fp = Path(config.get("output_path", "output")) / "checkpoint.pt"
            torch.save(model.state_dict(), fp)

        # Add progress bar showing batch loss value
        ProgressBar().attach(trainer, output_transform=lambda x: {"batch loss": x})

    return trainer
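# Hedged launch sketch (added for illustration, not part of the original source):
# a trainer like the one above is usually built inside a ``training`` function
# that ``idist.Parallel`` invokes once per process; the ``training`` and
# ``config`` names here are placeholders.
def _example_launch(training, config):
    with idist.Parallel(backend=config.get("backend", None)) as parallel:
        parallel.run(training, config)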
def create_evaluator(model, metrics, config, tag="val"): with_amp = config["with_amp"] device = idist.device() @torch.no_grad() def evaluate_step(engine: Engine, batch): model.eval() x, y = batch[0], batch[1] if x.device != device: x = x.to(device, non_blocking=True) y = y.to(device, non_blocking=True) with autocast(enabled=with_amp): output = model(x) return output, y evaluator = Engine(evaluate_step) for name, metric in metrics.items(): metric.attach(evaluator, name) if idist.get_rank() == 0 and (not config["with_clearml"]): common.ProgressBar(desc=f"Evaluation ({tag})", persist=False).attach(evaluator) return evaluator
def _test_distrib_compute(device, tol=1e-6):
    rank = idist.get_rank()

    def _test(metric_device):
        metric_device = torch.device(metric_device)
        m = R2Score(device=metric_device)

        torch.manual_seed(10 + rank)
        y_pred = torch.randint(0, 10, size=(10,), device=device).float()
        y = torch.randint(0, 10, size=(10,), device=device).float()

        m.update((y_pred, y))

        # gather y_pred, y
        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        np_y_pred = y_pred.cpu().numpy()
        np_y = y.cpu().numpy()

        res = m.compute()
        assert r2_score(np_y, np_y_pred) == pytest.approx(res, abs=tol)

    for _ in range(3):
        _test("cpu")
        if device.type != "xla":
            _test(idist.device())
def __call__(self, checkpoint: Mapping, filename: str, metadata: Optional[Mapping] = None) -> None:
    super(TrainsSaver, self).__call__(checkpoint, filename, metadata)

    if idist.get_rank() == 0:
        # Maybe won't work with XLA
        if self._atomic:
            try:
                import trains
            except ImportError:
                raise RuntimeError(
                    "This contrib module requires trains to be installed. "
                    "You may install trains using: \n pip install trains \n"
                )

            # If atomic, DiskSaver's implementation first stores the checkpoint in a temporary file,
            # which prevents trains from automatically detecting the correct artifact path and name
            path = os.path.join(self.dirname, filename)
            if os.path.exists(path):
                trains.binding.frameworks.WeightsFileHandler.create_output_model(
                    model=checkpoint,
                    saved_path=path,
                    framework=trains.model.Framework.pytorch,
                    task=self._task,
                    singlefile=True,
                    model_name=os.path.basename(filename),
                )
def _test_distrib_integration(device, tol=1e-6):
    import numpy as np

    from ignite.engine import Engine

    rank = idist.get_rank()
    n_iters = 100
    s = 10
    offset = n_iters * s

    y_true = torch.arange(0, offset * idist.get_world_size(), dtype=torch.float).to(device)
    y_preds = torch.ones(offset * idist.get_world_size(), dtype=torch.float).to(device)

    def update(engine, i):
        return (
            y_preds[i * s + offset * rank : (i + 1) * s + offset * rank],
            y_true[i * s + offset * rank : (i + 1) * s + offset * rank],
        )

    engine = Engine(update)

    m = MeanSquaredError()
    m.attach(engine, "mse")

    data = list(range(n_iters))
    engine.run(data=data, max_epochs=1)

    assert "mse" in engine.state.metrics
    res = engine.state.metrics["mse"]

    true_res = np.mean(np.power((y_true - y_preds).cpu().numpy(), 2.0))

    assert pytest.approx(res, rel=tol) == true_res
def _test_distrib_compute(device):
    rank = idist.get_rank()

    def _test(metric_device):
        metric_device = torch.device(metric_device)
        m = WaveHedgesDistance(device=metric_device)

        torch.manual_seed(10 + rank)
        y_pred = torch.randint(0, 10, size=(10,), device=device).float()
        y = torch.randint(0, 10, size=(10,), device=device).float()

        m.update((y_pred, y))

        # gather y_pred, y
        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        np_y_pred = y_pred.cpu().numpy()
        np_y = y.cpu().numpy()

        res = m.compute()

        np_sum = (np.abs(np_y - np_y_pred) / (np.maximum.reduce([np_y_pred, np_y]) + 1e-30)).sum()

        assert np_sum == pytest.approx(res)

    for _ in range(3):
        _test("cpu")
        if device.type != "xla":
            _test(idist.device())
def _test_idist_methods_overhead(ok_factor):
    import time

    n = 100000
    m = 5

    t2 = 0.0
    t1 = 0.0
    for _ in range(m):
        start = time.time()
        for _ in range(n):
            _ = dist.get_world_size()
            _ = dist.get_rank()
        elapsed = time.time() - start
        t2 += elapsed / n / m

        start = time.time()
        for _ in range(n):
            _ = idist.get_world_size()
            _ = idist.get_rank()
        elapsed = time.time() - start
        t1 += elapsed / n / m

    overhead_factor = t1 / t2
    assert overhead_factor < ok_factor, f"{overhead_factor} vs {ok_factor} | {t2} vs {t1}"