Example #1
def _test_distrib_compute(device):

    rank = idist.get_rank()
    torch.manual_seed(12)

    def _test(metric_device):
        metric_device = torch.device(metric_device)
        m = GeometricMeanRelativeAbsoluteError(device=metric_device)
        torch.manual_seed(10 + rank)

        y_pred = torch.rand(size=(100,), device=device)
        y = torch.rand(size=(100,), device=device)

        m.update((y_pred, y))

        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        np_y = y.cpu().numpy()
        np_y_pred = y_pred.cpu().numpy()

        np_gmrae = np.exp(np.log(np.abs(np_y - np_y_pred) / np.abs(np_y - np_y.mean())).mean())

        assert m.compute() == pytest.approx(np_gmrae, rel=1e-4)

    for _ in range(3):
        _test("cpu")
        if device.type != "xla":
            _test(idist.device())
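These `_test_distrib_compute` helpers are normally driven by a pytest fixture that provides the distributed context. As a rough illustration only, a helper like the one above could also be launched manually with `idist.spawn`; the "gloo" backend and the 2-process world size below are assumptions, not part of the example:

# Minimal sketch (assumed setup): spawn two "gloo" processes and run the helper in each.
import ignite.distributed as idist

def _run(local_rank):
    device = idist.device()          # per-process device selected by the backend
    _test_distrib_compute(device)    # helper from the example above

if __name__ == "__main__":
    idist.spawn("gloo", _run, args=(), nproc_per_node=2)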
Example #2
    def __init__(self,
                 logger: TrainsLogger = None,
                 output_uri: str = None,
                 dirname: str = None,
                 *args,
                 **kwargs):

        self._setup_check_trains(logger, output_uri)

        if not dirname:
            dirname = ""
            if idist.get_rank() == 0:
                dirname = tempfile.mkdtemp(
                    prefix="ignite_checkpoints_{}".format(
                        datetime.now().strftime("%Y_%m_%d_%H_%M_%S_")))
            if idist.get_world_size() > 1:
                dirname = idist.all_gather(dirname)[0]

            warnings.warn(
                "TrainsSaver created a temporary checkpoints directory: {}".
                format(dirname))
            idist.barrier()

        # Let's set non-atomic tmp dir saving behaviour
        if "atomic" not in kwargs:
            kwargs["atomic"] = False

        super(TrainsSaver, self).__init__(dirname=dirname, *args, **kwargs)
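The constructor above relies on a small coordination pattern: only rank 0 creates the temporary checkpoint directory, the path is shared with the other ranks through `idist.all_gather`, and a barrier ensures the directory exists before anyone writes to it. The same pattern in isolation, as a sketch (the prefix string is illustrative):

import tempfile
import ignite.distributed as idist

dirname = ""
if idist.get_rank() == 0:
    # Only one process creates the directory ...
    dirname = tempfile.mkdtemp(prefix="ignite_checkpoints_")
if idist.get_world_size() > 1:
    # ... and every rank picks up rank 0's path from the gathered list.
    dirname = idist.all_gather(dirname)[0]
idist.barrier()  # the directory is guaranteed to exist past this point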
Example #3
def _test_distrib_compute(device):
    rank = idist.get_rank()

    manhattan = DistanceMetric.get_metric("manhattan")

    def _test(metric_device):
        metric_device = torch.device(metric_device)
        m = ManhattanDistance(device=metric_device)
        torch.manual_seed(10 + rank)

        y_pred = torch.randint(0, 10, size=(10,), device=device).float()
        y = torch.randint(0, 10, size=(10,), device=device).float()

        m.update((y_pred, y))

        # gather y_pred, y
        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        np_y_pred = y_pred.cpu().numpy()
        np_y = y.cpu().numpy()
        res = m.compute()
        assert manhattan.pairwise([np_y_pred, np_y])[0][1] == pytest.approx(res)

    for _ in range(3):
        _test("cpu")
        if device.type != "xla":
            _test(idist.device())
Example #4
def test_no_distrib(capsys):

    assert idist.backend() is None
    if torch.cuda.is_available():
        assert idist.device().type == "cuda"
    else:
        assert idist.device().type == "cpu"
    assert idist.get_rank() == 0
    assert idist.get_world_size() == 1
    assert idist.get_local_rank() == 0
    assert idist.model_name() == "serial"

    from ignite.distributed.utils import _model, _SerialModel

    _sanity_check()
    assert isinstance(_model, _SerialModel)

    idist.show_config()
    captured = capsys.readouterr()
    out = captured.err.split("\r")
    out = list(map(lambda x: x.strip(), out))
    out = list(filter(None, out))
    assert "ignite.distributed.utils INFO: distributed configuration: serial" in out[
        -1]
    assert "ignite.distributed.utils INFO: backend: None" in out[-1]
    if torch.cuda.is_available():
        assert "ignite.distributed.utils INFO: device: cuda" in out[-1]
    else:
        assert "ignite.distributed.utils INFO: device: cpu" in out[-1]
    assert "ignite.distributed.utils INFO: rank: 0" in out[-1]
    assert "ignite.distributed.utils INFO: local rank: 0" in out[-1]
    assert "ignite.distributed.utils INFO: world size: 1" in out[-1]
    def __init__(
        self,
        logger: Optional[ClearMLLogger] = None,
        output_uri: Optional[str] = None,
        dirname: Optional[str] = None,
        *args: Any,
        **kwargs: Any,
    ) -> None:

        self._setup_check_clearml(logger, output_uri)

        if not dirname:
            dirname = ""
            if idist.get_rank() == 0:
                dirname = tempfile.mkdtemp(prefix=f"ignite_checkpoints_{datetime.now().strftime('%Y_%m_%d_%H_%M_%S_')}")
            if idist.get_world_size() > 1:
                dirname = idist.all_gather(dirname)[0]  # type: ignore[index, assignment]

            warnings.warn(f"ClearMLSaver created a temporary checkpoints directory: {dirname}")
            idist.barrier()

        # Let's set non-atomic tmp dir saving behaviour
        if "atomic" not in kwargs:
            kwargs["atomic"] = False

        self._checkpoint_slots = defaultdict(list)  # type: DefaultDict[Union[str, Tuple[str, str]], List[Any]]

        super(ClearMLSaver, self).__init__(dirname=dirname, *args, **kwargs)  # type: ignore[misc]
Example #6
def _test_distrib_config(local_rank, backend, ws, true_device, rank=None):
    assert idist.backend() == backend, f"{idist.backend()} vs {backend}"

    this_device = idist.device()
    assert isinstance(this_device, torch.device)
    if backend in ("nccl", "horovod") and "cuda" in this_device.type:
        true_device = torch.device(f"{true_device}:{local_rank}")
        assert this_device == true_device, f"{this_device} vs {true_device}"
    elif backend in ("gloo", "horovod"):
        assert this_device == torch.device(true_device)
    elif backend == "xla-tpu":
        assert true_device in this_device.type

    if rank is None:
        if idist.model_name() == "native-dist":
            rank = dist.get_rank()

    if rank is not None:
        assert idist.get_rank() == rank

    assert idist.get_world_size() == ws
    assert idist.get_local_rank() == local_rank

    assert idist.model_name() in ("native-dist", "xla-dist", "horovod-dist")

    _sanity_check()
Example #7
def run_evaluation(config_filepath, backend="nccl", with_clearml=True):
    """Main entry to run model's evaluation:
        - compute validation metrics

    Args:
        config_filepath (str): evaluation configuration .py file
        backend (str): distributed backend: nccl, gloo, horovod or None to run without distributed config
        with_clearml (bool): if True, uses ClearML as experiment tracking system
    """
    assert torch.cuda.is_available(), torch.cuda.is_available()
    assert torch.backends.cudnn.enabled
    torch.backends.cudnn.benchmark = True

    config_filepath = Path(config_filepath)
    assert config_filepath.exists(), f"File '{config_filepath.as_posix()}' is not found"

    with idist.Parallel(backend=backend) as parallel:
        logger = setup_logger(name="Pascal-VOC12 Evaluation", distributed_rank=idist.get_rank())

        config = ConfigObject(config_filepath)
        InferenceConfigSchema.validate(config)
        config.script_filepath = Path(__file__)

        output_path = setup_experiment_tracking(config, with_clearml=with_clearml, task_type="testing")
        config.output_path = output_path

        utils.log_basic_info(logger, get_params(config, InferenceConfigSchema))

        try:
            parallel.run(evaluation, config, logger=logger, with_clearml=with_clearml)
        except KeyboardInterrupt:
            logger.info("Catched KeyboardInterrupt -> exit")
        except Exception as e:  # noqa
            logger.exception("")
            raise e
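Taking the signature above at face value, the entry point can be called directly once an evaluation configuration file exists; the path below is purely hypothetical:

# Hypothetical call; the configuration file path is an assumption.
run_evaluation("configs/eval/baseline.py", backend="nccl", with_clearml=False)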
Example #8
def _test_idist_methods_overhead(ok_factor, sync_model):
    import time
    import horovod.torch as hvd

    if sync_model:
        idist.sync()
        from ignite.distributed.utils import _model
        from ignite.distributed.comp_models.horovod import _HorovodDistModel

        assert isinstance(_model, _HorovodDistModel)

    n = 100000
    m = 5

    t2 = 0.0
    t1 = 0.0
    for j in range(m):
        start = time.time()
        for _ in range(n):
            _ = hvd.size()
            _ = hvd.rank()
        elapsed = time.time() - start
        t2 += elapsed / n / m

        start = time.time()
        for _ in range(n):
            _ = idist.get_world_size()
            _ = idist.get_rank()
        elapsed = time.time() - start
        t1 += elapsed / n / m

    overhead_factor = t1 / t2
    assert overhead_factor < ok_factor, "{} vs {} | {} vs {}".format(
        overhead_factor, ok_factor, t2, t1)
Example #9
def _test_distrib_compute(device):
    rank = idist.get_rank()

    def _test(metric_device):
        metric_device = torch.device(metric_device)
        m = MaximumAbsoluteError(device=metric_device)
        torch.manual_seed(10 + rank)

        y_pred = torch.randint(0, 10, size=(10,), device=device).float()
        y = torch.randint(0, 10, size=(10,), device=device).float()

        m.update((y_pred, y))

        # gather y_pred, y
        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        np_y_pred = y_pred.cpu().numpy()
        np_y = y.cpu().numpy()

        res = m.compute()

        np_max = np.max(np.abs((np_y_pred - np_y)))

        assert np_max == pytest.approx(res)

    for _ in range(3):
        _test("cpu")
        if device.type != "xla":
            _test(idist.device())
Example #10
def create_trainer(model, optimizer, criterion, train_sampler, config, logger):
    prepare_batch = config.prepare_batch
    device = config.device

    # Setup trainer
    accumulation_steps = getattr(config, "accumulation_steps", 1)
    model_output_transform = getattr(config, "model_output_transform", lambda x: x)

    def train_update_function(engine, batch):

        model.train()

        x, y = prepare_batch(batch, device=device, non_blocking=True)
        y_pred = model(x)
        y_pred = model_output_transform(y_pred)
        loss = criterion(y_pred, y)

        if isinstance(loss, Mapping):
            assert "supervised batch loss" in loss
            loss_dict = loss
            output = {k: v.item() for k, v in loss_dict.items()}
            loss = loss_dict["supervised batch loss"] / accumulation_steps
        else:
            output = {"supervised batch loss": loss.item()}

        with amp.scale_loss(loss, optimizer, loss_id=0) as scaled_loss:
            scaled_loss.backward()

        if engine.state.iteration % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        return output

    output_names = getattr(config, "output_names", ["supervised batch loss",])
    lr_scheduler = config.lr_scheduler

    trainer = Engine(train_update_function)
    trainer.logger = logger

    to_save = {"model": model, "optimizer": optimizer, "lr_scheduler": lr_scheduler, "trainer": trainer, "amp": amp}

    save_every_iters = getattr(config, "save_every_iters", 1000)

    common.setup_common_training_handlers(
        trainer,
        train_sampler,
        to_save=to_save,
        save_every_iters=save_every_iters,
        save_handler=get_save_handler(config),
        lr_scheduler=lr_scheduler,
        with_gpu_stats=exp_tracking.has_mlflow,
        output_names=output_names,
        with_pbars=False,
    )

    if idist.get_rank() == 0:
        common.ProgressBar(persist=False).attach(trainer, metric_names="all")

    return trainer
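The update function above mixes apex `amp` loss scaling with gradient accumulation: the loss is divided by `accumulation_steps`, and the optimizer only steps every `accumulation_steps` iterations. A stripped-down sketch of just the accumulation logic, without amp (the function and parameter names are illustrative):

def make_update_fn(model, optimizer, criterion, prepare_batch, device, accumulation_steps=4):
    # Plain gradient accumulation mirroring the logic above, minus the amp scaling.
    def update(engine, batch):
        model.train()
        x, y = prepare_batch(batch, device=device, non_blocking=True)
        loss = criterion(model(x), y) / accumulation_steps
        loss.backward()
        if engine.state.iteration % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item() * accumulation_steps  # report the unscaled batch loss
    return update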
Example #11
def _test_distrib_metrics_on_diff_devices(device):
    n_classes = 10
    n_iters = 12
    s = 16
    offset = n_iters * s
    rank = idist.get_rank()

    y_true = torch.randint(0,
                           n_classes,
                           size=(offset * idist.get_world_size(), )).to(device)
    y_preds = torch.rand(offset * idist.get_world_size(), n_classes).to(device)

    def update(engine, i):
        return (
            y_preds[i * s + rank * offset:(i + 1) * s + rank * offset],
            y_true[i * s + rank * offset:(i + 1) * s + rank * offset],
        )

    precision = Precision(average=False, device="cpu")
    recall = Recall(average=False, device=device)
    custom_metric = precision * recall

    engine = Engine(update)
    custom_metric.attach(engine, "custom_metric")

    data = list(range(n_iters))
    engine.run(data, max_epochs=2)
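The `precision * recall` product works because ignite metrics support arithmetic composition and return a `MetricsLambda`. The same mechanism underlies the documented macro-F1 recipe; a sketch (the `1e-20` epsilon guards against division by zero):

import torch
from ignite.metrics import MetricsLambda, Precision, Recall

precision = Precision(average=False)
recall = Recall(average=False)

# Element-wise composition over the per-class tensors.
f1 = precision * recall * 2 / (precision + recall + 1e-20)
# Reduce the per-class vector to a single macro-averaged scalar.
f1 = MetricsLambda(lambda t: torch.mean(t).item(), f1)

The composed metric can then be attached to an engine exactly like `custom_metric` above, e.g. `f1.attach(engine, "f1")`.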
Example #12
def _test_distrib_binary_input_N(device):

    rank = idist.get_rank()
    torch.manual_seed(12)

    def _test(y_pred, y, batch_size, metric_device):
        metric_device = torch.device(metric_device)
        roc_auc = ROC_AUC(device=metric_device)

        torch.manual_seed(10 + rank)

        roc_auc.reset()
        if batch_size > 1:
            n_iters = y.shape[0] // batch_size + 1
            for i in range(n_iters):
                idx = i * batch_size
                roc_auc.update(
                    (y_pred[idx:idx + batch_size], y[idx:idx + batch_size]))
        else:
            roc_auc.update((y_pred, y))

        # gather y_pred, y
        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        np_y = y.cpu().numpy()
        np_y_pred = y_pred.cpu().numpy()

        res = roc_auc.compute()
        assert isinstance(res, float)
        assert roc_auc_score(np_y, np_y_pred) == pytest.approx(res)

    def get_test_cases():
        test_cases = [
            (torch.randint(0, 2, size=(10, )).long(),
             torch.randint(0, 2, size=(10, )).long(), 1),
            (torch.randint(0, 2, size=(100, )).long(),
             torch.randint(0, 2, size=(100, )).long(), 1),
            (torch.randint(0, 2, size=(10, 1)).long(),
             torch.randint(0, 2, size=(10, 1)).long(), 1),
            (torch.randint(0, 2, size=(100, 1)).long(),
             torch.randint(0, 2, size=(100, 1)).long(), 1),
            # updated batches
            (torch.randint(0, 2, size=(10, )).long(),
             torch.randint(0, 2, size=(10, )).long(), 16),
            (torch.randint(0, 2, size=(100, )).long(),
             torch.randint(0, 2, size=(100, )).long(), 16),
            (torch.randint(0, 2, size=(10, 1)).long(),
             torch.randint(0, 2, size=(10, 1)).long(), 16),
            (torch.randint(0, 2, size=(100, 1)).long(),
             torch.randint(0, 2, size=(100, 1)).long(), 16),
        ]
        return test_cases

    for _ in range(3):
        test_cases = get_test_cases()
        for y_pred, y, batch_size in test_cases:
            _test(y_pred, y, batch_size, "cpu")
            if device.type != "xla":
                _test(y_pred, y, batch_size, idist.device())
Example #13
def _test_distrib_integration(device):

    from ignite.engine import Engine

    rank = idist.get_rank()
    torch.manual_seed(12)

    def _test(metric_device):
        n_iters = 60
        s = 16
        offset = n_iters * s

        n_probabilities = 10
        y = torch.rand(offset * idist.get_world_size(), n_probabilities)

        def update(_, i):
            return y[i * s + rank * offset:(i + 1) * s + rank * offset, :]

        engine = Engine(update)
        m = InceptionScore(num_features=n_probabilities,
                           feature_extractor=torch.nn.Identity(),
                           device=metric_device)
        m.attach(engine, "InceptionScore")

        engine.run(data=list(range(n_iters)), max_epochs=1)

        assert "InceptionScore" in engine.state.metrics

        assert pytest.approx(calculate_inception_score(y)) == m.compute()

    metric_devices = [torch.device("cpu")]
    if device.type != "xla":
        metric_devices.append(idist.device())
    for metric_device in metric_devices:
        _test(metric_device=metric_device)
Example #14
def _test_distrib_compute(device):
    rank = idist.get_rank()

    def _test(metric_device):
        metric_device = torch.device(metric_device)
        m = MeanNormalizedBias(device=metric_device)
        torch.manual_seed(10 + rank)

        y_pred = torch.randint(1, 11, size=(10, ), device=device).float()
        y = torch.randint(1, 11, size=(10, ), device=device).float()

        m.update((y_pred, y))

        # gather y_pred, y
        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        np_y_pred = y_pred.cpu().numpy()
        np_y = y.cpu().numpy()

        res = m.compute()

        np_sum = ((np_y - np_y_pred) / np_y).sum()
        np_len = len(np_y_pred)
        np_ans = np_sum / np_len

        assert np_ans == pytest.approx(res)

    for _ in range(3):
        _test("cpu")
        if device.type != "xla":
            _test(idist.device())
Example #15
def create_evaluator(model, metrics, config, tag="val"):
    with_amp = config["with_amp"]
    device = idist.device()

    @torch.no_grad()
    def evaluate_step(engine, batch):
        model.eval()

        input_batch = batch[0]
        labels = batch[1].view(-1, 1)

        if labels.device != device:
            input_batch = {
                k: v.to(device, non_blocking=True, dtype=torch.long)
                for k, v in batch[0].items()
            }
            labels = labels.to(device, non_blocking=True, dtype=torch.float)

        with autocast(enabled=with_amp):
            output = model(input_batch)
        return output, labels

    evaluator = Engine(evaluate_step)

    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    if idist.get_rank() == 0 and (not config["with_clearml"]):
        common.ProgressBar(desc=f"Evaluation ({tag})",
                           persist=False).attach(evaluator)

    return evaluator
Example #16
def _test_idist_methods_overhead(ok_factor):
    import time

    n = 100000
    m = 5

    t2 = 0.0
    t1 = 0.0
    for j in range(m):
        start = time.time()
        for _ in range(n):
            _ = dist.get_world_size()
            _ = dist.get_rank()
        elapsed = time.time() - start
        t2 += elapsed / n / m

        start = time.time()
        for _ in range(n):
            _ = idist.get_world_size()
            _ = idist.get_rank()
        elapsed = time.time() - start
        t1 += elapsed / n / m

    overhead_factor = t1 / t2
    assert overhead_factor < ok_factor, "{} vs {} | {} vs {}".format(
        overhead_factor, ok_factor, t2, t1)
Example #17
def _test_distrib_compute(device):
    rank = idist.get_rank()

    def _test(metric_device):
        metric_device = torch.device(metric_device)
        ck_metric = CohenKappa(device=metric_device)

        torch.manual_seed(10 + rank)

        y_pred = torch.randint(0, 2, size=(100, 1), device=device)
        y = torch.randint(0, 2, size=(100, 1), device=device)

        ck_metric.update((y_pred, y))

        # gather y_pred, y
        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        np_y_pred = y_pred.cpu().numpy()
        np_y = y.cpu().numpy()

        np_ck = cohen_kappa_score(np_y, np_y_pred)

        res = ck_metric.compute()
        assert res == pytest.approx(np_ck)

    for _ in range(3):
        _test("cpu")
        if device.type != "xla":
            _test(idist.device())
Example #18
    def __init__(self, num_iters=100, prepare_batch=None):

        from ignite.handlers import Timer

        device = idist.device()

        def upload_to_gpu(engine, batch):
            if prepare_batch is not None:
                x, y = prepare_batch(batch, device=device, non_blocking=False)

        self.num_iters = num_iters
        self.benchmark_dataflow = Engine(upload_to_gpu)

        @self.benchmark_dataflow.on(Events.ITERATION_COMPLETED(once=num_iters))
        def stop_benchmark_dataflow(engine):
            engine.terminate()

        if idist.get_rank() == 0:

            @self.benchmark_dataflow.on(
                Events.ITERATION_COMPLETED(every=num_iters // 100))
            def show_progress_benchmark_dataflow(engine):
                print(".", end=" ")

        self.timer = Timer(average=False)
        self.timer.attach(
            self.benchmark_dataflow,
            start=Events.EPOCH_STARTED,
            resume=Events.ITERATION_STARTED,
            pause=Events.ITERATION_COMPLETED,
            step=Events.ITERATION_COMPLETED,
        )
Example #19
def setup_experiment_tracking(config, with_clearml, task_type="training"):
    from datetime import datetime

    assert task_type in ("training", "testing"), task_type

    output_path = ""
    if idist.get_rank() == 0:
        if with_clearml:
            from clearml import Task

            schema = TrainvalConfigSchema if task_type == "training" else InferenceConfigSchema

            task = Task.init("Pascal-VOC12 Training", config.config_filepath.stem, task_type=task_type)
            task.connect_configuration(config.config_filepath.as_posix())

            task.upload_artifact(config.script_filepath.name, config.script_filepath.as_posix())
            task.upload_artifact(config.config_filepath.name, config.config_filepath.as_posix())
            task.connect(get_params(config, schema))

            output_path = Path(os.environ.get("CLEARML_OUTPUT_PATH", "/tmp"))
            output_path = output_path / "clearml" / datetime.now().strftime("%Y%m%d-%H%M%S")
        else:
            import shutil

            output_path = Path(os.environ.get("OUTPUT_PATH", "/tmp/output-pascal-voc12"))
            output_path = output_path / task_type / config.config_filepath.stem
            output_path = output_path / datetime.now().strftime("%Y%m%d-%H%M%S")
            output_path.mkdir(parents=True, exist_ok=True)

            shutil.copyfile(config.script_filepath.as_posix(), output_path / config.script_filepath.name)
            shutil.copyfile(config.config_filepath.as_posix(), output_path / config.config_filepath.name)

        output_path = output_path.as_posix()
    return Path(idist.broadcast(output_path, src=0))
Example #20
def _test_distrib_log_lr_and_loss(device):
    from ignite.handlers import ParamScheduler

    lr_finder = FastaiLRFinder()
    _lr_schedule = MagicMock(spec=ParamScheduler)

    # minimal setup for lr_finder to make _log_lr_and_loss work
    rank = idist.get_rank()
    loss = 0.01 * (rank + 1)

    engine = Engine(lambda e, b: None)

    engine.state.output = loss
    engine.state.iteration = 1
    lr_finder._lr_schedule = _lr_schedule
    lr_finder._history["loss"] = []
    lr_finder._history["lr"] = []

    lr_finder._log_lr_and_loss(engine,
                               output_transform=lambda x: x,
                               smooth_f=0.1,
                               diverge_th=10.0)

    expected_loss = idist.all_reduce(loss)
    assert pytest.approx(lr_finder._history["loss"][-1]) == expected_loss
Example #21
    def compute(self) -> float:
        if len(self._predictions) < 1 or len(self._targets) < 1:
            raise NotComputableError(
                "EpochMetric must have at least one example before it can be computed."
            )

        _prediction_tensor = torch.cat(self._predictions, dim=0)
        _target_tensor = torch.cat(self._targets, dim=0)

        ws = idist.get_world_size()

        if ws > 1 and not self._is_reduced:
            # All gather across all processes
            _prediction_tensor = cast(torch.Tensor,
                                      idist.all_gather(_prediction_tensor))
            _target_tensor = cast(torch.Tensor,
                                  idist.all_gather(_target_tensor))
        self._is_reduced = True

        result = 0.0
        if idist.get_rank() == 0:
            # Run compute_fn on zero rank only
            result = self.compute_fn(_prediction_tensor, _target_tensor)

        if ws > 1:
            # broadcast result to all processes
            result = cast(float, idist.broadcast(result, src=0))

        return result
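The method above follows a compute-on-rank-0 pattern: predictions and targets are gathered on every process, `compute_fn` runs once on rank 0, and the scalar result is broadcast back to all ranks. The same pattern in isolation, assuming a hypothetical `compute_fn` that takes a single gathered tensor:

import ignite.distributed as idist

def compute_on_rank_zero(compute_fn, tensor):
    # Every process contributes its local tensor and receives the concatenation.
    tensor = idist.all_gather(tensor)
    result = 0.0
    if idist.get_rank() == 0:
        # Run the potentially expensive, non-distributed function only once.
        result = compute_fn(tensor)
    if idist.get_world_size() > 1:
        # All ranks receive rank 0's result.
        result = idist.broadcast(result, src=0)
    return result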
Example #22
def _test_distrib_compute(device, tol=1e-5):
    rank = idist.get_rank()

    def _test(metric_device):
        metric_device = torch.device(metric_device)
        m = FractionalBias(device=metric_device)
        torch.manual_seed(10 + rank)

        y_pred = torch.randint(0, 10, size=(10,), device=device).float()
        y = torch.randint(0, 10, size=(10,), device=device).float()

        m.update((y_pred, y))

        # gather y_pred, y
        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        np_y_pred = y_pred.cpu().numpy()
        np_y = y.cpu().numpy()

        res = m.compute()

        np_sum = (2 * (np_y - np_y_pred) / (np_y_pred + np_y + 1e-30)).sum()
        np_len = len(np_y_pred)
        np_ans = np_sum / np_len

        assert np_ans == pytest.approx(res, rel=tol)

    for _ in range(3):
        _test("cpu")
        if device.type != "xla":
            _test(idist.device())
Example #23
def create_evaluators(model, metrics, config):
    model_output_transform = getattr(config, "model_output_transform",
                                     lambda x: x)

    evaluator_args = dict(
        model=model,
        metrics=metrics,
        device=config.device,
        non_blocking=True,
        prepare_batch=config.prepare_batch,
        output_transform=lambda x, y, y_pred: (
            model_output_transform(y_pred),
            y,
        ),
    )
    train_evaluator = create_supervised_evaluator(**evaluator_args)
    evaluator = create_supervised_evaluator(**evaluator_args)

    if idist.get_rank() == 0:
        common.ProgressBar(desc="Evaluation (train)",
                           persist=False).attach(train_evaluator)
        common.ProgressBar(desc="Evaluation (val)",
                           persist=False).attach(evaluator)

    return evaluator, train_evaluator
Example #24
def create_trainer(model, optimizer, criterion, lr_scheduler, config):

    # Define any training logic for iteration update
    def train_step(engine, batch):
        x, y = batch[0].to(idist.device()), batch[1].to(idist.device())

        model.train()
        y_pred = model(x)
        loss = criterion(y_pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        return loss.item()

    # Define trainer engine
    trainer = Engine(train_step)

    if idist.get_rank() == 0:
        # Add any custom handlers
        @trainer.on(Events.ITERATION_COMPLETED(every=200))
        def save_checkpoint():
            fp = Path(config.get("output_path", "output")) / "checkpoint.pt"
            torch.save(model.state_dict(), fp)

        # Add progress bar showing batch loss value
        ProgressBar().attach(trainer,
                             output_transform=lambda x: {"batch loss": x})

    return trainer
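A trainer built this way is usually launched through `idist.Parallel`, as in Example #7. Below is a minimal sketch wired around `create_trainer` from the example above; the synthetic data, the tiny linear model and the "gloo"/2-process configuration are all assumptions:

import torch
import ignite.distributed as idist

def training(local_rank, config):
    device = idist.device()
    model = torch.nn.Linear(10, 2).to(device)            # placeholder model
    optimizer = torch.optim.SGD(model.parameters(), lr=config["lr"])
    criterion = torch.nn.CrossEntropyLoss()
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100)

    trainer = create_trainer(model, optimizer, criterion, lr_scheduler, config)

    # Synthetic dataset of (x, y) batches, just to make the sketch runnable.
    data = [(torch.rand(4, 10), torch.randint(0, 2, (4,))) for _ in range(16)]
    trainer.run(data, max_epochs=2)

if __name__ == "__main__":
    with idist.Parallel(backend="gloo", nproc_per_node=2) as parallel:
        parallel.run(training, {"lr": 0.01, "output_path": "/tmp/output"})

In a real distributed setup the model and optimizer would normally go through `idist.auto_model` / `idist.auto_optim` so gradients are synchronized across ranks; the sketch skips that to stay short.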
Example #25
def create_evaluator(model, metrics, config, tag="val"):
    with_amp = config["with_amp"]
    device = idist.device()

    @torch.no_grad()
    def evaluate_step(engine: Engine, batch):
        model.eval()
        x, y = batch[0], batch[1]
        if x.device != device:
            x = x.to(device, non_blocking=True)
            y = y.to(device, non_blocking=True)

        with autocast(enabled=with_amp):
            output = model(x)
        return output, y

    evaluator = Engine(evaluate_step)

    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    if idist.get_rank() == 0 and (not config["with_clearml"]):
        common.ProgressBar(desc=f"Evaluation ({tag})", persist=False).attach(evaluator)

    return evaluator
Example #26
def _test_distrib_compute(device, tol=1e-6):
    rank = idist.get_rank()

    def _test(metric_device):
        metric_device = torch.device(metric_device)
        m = R2Score(device=metric_device)
        torch.manual_seed(10 + rank)

        y_pred = torch.randint(0, 10, size=(10, ), device=device).float()
        y = torch.randint(0, 10, size=(10, ), device=device).float()

        m.update((y_pred, y))

        # gather y_pred, y
        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        np_y_pred = y_pred.cpu().numpy()
        np_y = y.cpu().numpy()
        res = m.compute()
        assert r2_score(np_y, np_y_pred) == pytest.approx(res, abs=tol)

    for _ in range(3):
        _test("cpu")
        if device.type != "xla":
            _test(idist.device())
Example #27
    def __call__(self,
                 checkpoint: Mapping,
                 filename: str,
                 metadata: Optional[Mapping] = None) -> None:
        super(TrainsSaver, self).__call__(checkpoint, filename, metadata)

        if idist.get_rank() == 0:
            # Maybe won't work with XLA
            if self._atomic:
                try:
                    import trains
                except ImportError:
                    raise RuntimeError(
                        "This contrib module requires trains to be installed. "
                        "You may install trains using: \n pip install trains \n"
                    )

                # If atomic, DiskSaver's implementation first stores the checkpoint in a temporary file,
                # which prevents trains from automatically detecting the correct artifact path and name
                path = os.path.join(self.dirname, filename)
                if os.path.exists(path):
                    trains.binding.frameworks.WeightsFileHandler.create_output_model(
                        model=checkpoint,
                        saved_path=path,
                        framework=trains.model.Framework.pytorch,
                        task=self._task,
                        singlefile=True,
                        model_name=os.path.basename(filename),
                    )
Example #28
def _test_distrib_integration(device, tol=1e-6):
    import numpy as np
    from ignite.engine import Engine

    rank = idist.get_rank()
    n_iters = 100
    s = 10
    offset = n_iters * s

    y_true = torch.arange(0,
                          offset * idist.get_world_size(),
                          dtype=torch.float).to(device)
    y_preds = torch.ones(offset * idist.get_world_size(),
                         dtype=torch.float).to(device)

    def update(engine, i):
        return (
            y_preds[i * s + offset * rank:(i + 1) * s + offset * rank],
            y_true[i * s + offset * rank:(i + 1) * s + offset * rank],
        )

    engine = Engine(update)

    m = MeanSquaredError()
    m.attach(engine, "mse")

    data = list(range(n_iters))
    engine.run(data=data, max_epochs=1)

    assert "mse" in engine.state.metrics
    res = engine.state.metrics["mse"]

    true_res = np.mean(np.power((y_true - y_preds).cpu().numpy(), 2.0))

    assert pytest.approx(res, rel=tol) == true_res
Example #29
def _test_distrib_compute(device):
    rank = idist.get_rank()

    def _test(metric_device):
        metric_device = torch.device(metric_device)
        m = WaveHedgesDistance(device=metric_device)
        torch.manual_seed(10 + rank)

        y_pred = torch.randint(0, 10, size=(10, ), device=device).float()
        y = torch.randint(0, 10, size=(10, ), device=device).float()

        m.update((y_pred, y))

        # gather y_pred, y
        y_pred = idist.all_gather(y_pred)
        y = idist.all_gather(y)

        np_y_pred = y_pred.cpu().numpy()
        np_y = y.cpu().numpy()

        res = m.compute()

        np_sum = (np.abs(np_y - np_y_pred) /
                  (np.maximum.reduce([np_y_pred, np_y]) + 1e-30)).sum()

        assert np_sum == pytest.approx(res)

    for _ in range(3):
        _test("cpu")
        if device.type != "xla":
            _test(idist.device())
Example #30
def _test_idist_methods_overhead(ok_factor):
    import time

    n = 100000
    m = 5

    t2 = 0.0
    t1 = 0.0
    for _ in range(m):
        start = time.time()
        for _ in range(n):
            _ = dist.get_world_size()
            _ = dist.get_rank()
        elapsed = time.time() - start
        t2 += elapsed / n / m

        start = time.time()
        for _ in range(n):
            _ = idist.get_world_size()
            _ = idist.get_rank()
        elapsed = time.time() - start
        t1 += elapsed / n / m

    overhead_factor = t1 / t2
    assert overhead_factor < ok_factor, f"{overhead_factor} vs {ok_factor} | {t2} vs {t1}"