def test_result_reduce_horovod(enable_pl_optimizer, tmpdir):
    """Check that a tensor logged with ``sync_dist`` is reduced across Horovod ranks.

    The nested worker function is launched through ``horovod.run(..., np=2)``.
    Inside it, every training step logs a tensor holding ``1.0`` with
    ``sync_dist_op='sum'`` and asserts that the stored result equals
    ``hvd.size()``.

    This test mirrors tests/core/test_results.py::_ddp_test_fn
    """
    tutils.reset_seed()
    tutils.set_random_master_port()

    def hvd_test_fn():
        # Prepend the directory two levels above this file to the import path
        # before importing from the `tests` package below.
        this_dir = os.path.abspath(os.path.dirname(__file__))
        repo_root = os.path.abspath(os.path.join(this_dir, '..', '..'))
        sys.path.insert(0, repo_root)

        import horovod.torch as hvd

        from tests.base.boring_model import BoringModel

        class TestModel(BoringModel):
            def training_step(self, batch, batch_idx):
                self.training_step_called = True

                self.log(
                    "test_tensor",
                    torch.tensor([1.0]),
                    sync_dist=True,
                    sync_dist_op='sum',
                    on_step=True,
                    on_epoch=True,
                )

                # Check that the logged tensor is summed across all ranks automatically
                logged_value = self._results["test_tensor"].item()
                assert logged_value == hvd.size(), \
                    "Result-Log does not work properly with Horovod and Tensors"

            def training_epoch_end(self, outputs) -> None:
                # `training_step` returns nothing, so no outputs are expected here.
                assert len(outputs) == 0

        model = TestModel()
        model.val_dataloader = None

        trainer_options = dict(
            default_root_dir=tmpdir,
            limit_train_batches=2,
            limit_val_batches=2,
            max_epochs=1,
            log_every_n_steps=1,
            weights_summary=None,
            enable_pl_optimizer=enable_pl_optimizer,
        )
        Trainer(**trainer_options).fit(model)

    horovod.run(hvd_test_fn, np=2)
def test_accuracy_metric_horovod():
    """Compare a distributed-synced ``Accuracy`` metric with scikit-learn's ``accuracy_score``.

    The check runs through ``horovod.run(..., np=2)``. Each worker feeds every
    ``hvd.size()``-th batch, starting at its own rank, into the metric. Rank 0
    verifies each per-step result against the reference computed on the batches
    handled in that step; every rank verifies the final ``compute()`` value
    against the reference over all batches.
    """
    num_batches = 10
    batch_size = 16
    threshold = 0.5

    def sk_metric(preds, target):
        # Reference accuracy: flatten both tensors and binarize predictions at `threshold`.
        flat_preds = preds.view(-1).numpy()
        flat_target = target.view(-1).numpy()
        binary_preds = (flat_preds >= threshold).astype(np.uint8)
        return accuracy_score(y_true=flat_target, y_pred=binary_preds)

    preds = torch.rand(num_batches, batch_size)
    target = torch.randint(high=2, size=(num_batches, batch_size))

    def _compute_batch():
        import horovod.torch as hvd

        trainer = Trainer(fast_dev_run=True, distributed_backend='horovod')

        accelerator_backend = trainer.accelerator_connector.select_accelerator()
        assert isinstance(accelerator_backend, HorovodAccelerator)

        metric = Accuracy(
            compute_on_step=True,
            dist_sync_on_step=True,
            dist_sync_fn=accelerator_backend.gather_all_tensors,
            threshold=threshold,
        )

        rank, world_size = hvd.rank(), hvd.size()

        for start in range(rank, num_batches, world_size):
            batch_result = metric(preds[start], target[start])
            if rank != 0:
                continue

            # Rank 0 rebuilds the batches seen by all ranks in this step.
            step_indices = range(start, start + world_size)
            dist_preds = torch.stack([preds[idx] for idx in step_indices])
            dist_target = torch.stack([target[idx] for idx in step_indices])
            sk_batch_result = sk_metric(dist_preds, dist_target)
            assert np.allclose(batch_result.numpy(), sk_batch_result)

        # check on all batches on all ranks
        result = metric.compute()
        assert isinstance(result, torch.Tensor)

        every_batch = range(num_batches)
        total_preds = torch.stack([preds[idx] for idx in every_batch])
        total_target = torch.stack([target[idx] for idx in every_batch])
        sk_result = sk_metric(total_preds, total_target)

        assert np.allclose(result.numpy(), sk_result)

    horovod.run(_compute_batch, np=2)
Exemple #3
0
def test_accuracy_metric_horovod():
    """Check ``Accuracy`` synced via the training-type plugin against scikit-learn.

    The worker function is started through ``horovod.run(..., np=2)``. Batches
    are distributed round-robin by rank; rank 0 validates every per-step value
    and all ranks validate the accumulated ``compute()`` value against
    ``accuracy_score``.
    """
    num_batches = 10
    batch_size = 16
    threshold = 0.5

    def sk_metric(preds, target):
        # scikit-learn reference on flattened inputs, predictions thresholded first.
        thresholded = (preds.view(-1).numpy() >= threshold).astype(np.uint8)
        return accuracy_score(
            y_true=target.view(-1).numpy(),
            y_pred=thresholded,
        )

    preds = torch.rand(num_batches, batch_size)
    target = torch.randint(high=2, size=(num_batches, batch_size))

    def _compute_batch():
        trainer = Trainer(fast_dev_run=True, accelerator='horovod')

        assert isinstance(trainer.accelerator, CPUAccelerator)
        # TODO: test that we selected the correct training_type_plugin based on horovod flags

        gather_fn = trainer.training_type_plugin.gather_all_tensors
        metric = Accuracy(
            compute_on_step=True,
            dist_sync_on_step=True,
            dist_sync_fn=gather_fn,
            threshold=threshold,
        )

        rank, world_size = hvd.rank(), hvd.size()

        for first in range(rank, num_batches, world_size):
            batch_result = metric(preds[first], target[first])
            if rank == 0:
                # Stack the batches processed by every rank during this step.
                offsets = range(world_size)
                dist_preds = torch.stack([preds[first + off] for off in offsets])
                dist_target = torch.stack([target[first + off] for off in offsets])
                expected_step = sk_metric(dist_preds, dist_target)
                assert np.allclose(batch_result.numpy(), expected_step)

        # check on all batches on all ranks
        result = metric.compute()
        assert isinstance(result, torch.Tensor)

        total_preds = torch.stack([preds[k] for k in range(num_batches)])
        total_target = torch.stack([target[k] for k in range(num_batches)])
        expected_total = sk_metric(total_preds, total_target)

        assert np.allclose(result.numpy(), expected_total)

    horovod.run(_compute_batch, np=2)
Exemple #4
0
 def test_run_with_hosts(self):
     """Happy path: run ``train`` with two usable hosts offering two slots each.

     The process count is pinned to two (``num_proc``, ``min_num_proc`` and
     ``max_num_proc`` are all 2) and the hosts are passed explicitly.
     """
     # One tuple per process; presumably (rank, size) as returned by `train` -- verify.
     expected = [(0, 2), (1, 2)]
     run_kwargs = dict(
         num_proc=2,
         min_num_proc=2,
         max_num_proc=2,
         hosts='localhost:2,127.0.0.1:2',
     )
     results = horovod.run(train, **run_kwargs)
     self.assertEqual(expected, results)
Exemple #5
0
    def test_run_with_discovery_script(self):
        """Happy path: two usable hosts with two slots each, found via a discovery script.

        A temporary executable script that echoes the two host entries is
        written and handed to ``horovod.run`` as ``host_discovery_script``.
        """
        script_lines = (
            'echo "localhost:2"\n',
            'echo "127.0.0.1:2"\n',
        )
        with NamedTemporaryFile(mode='w') as script:
            script.writelines(script_lines)
            # Close only the underlying file object; the temporary file itself
            # is still referenced by name below, inside the `with` block.
            script.file.close()
            # Owner-only read/write/execute so the script can be run.
            os.chmod(script.name, 0o700)

            results = horovod.run(
                train,
                host_discovery_script=script.name,
                num_proc=2,
                min_num_proc=2,
                max_num_proc=2,
            )

        self.assertEqual([(0, 2), (1, 2)], results)
Exemple #6
0
    train(state)

    checkpoint_dir = './checkpoints'
    checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt)

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting it.
    if hvd.rank() == 0:
        checkpoint.save(checkpoint_dir)


if __name__ == '__main__':
    # Script entry point. With exactly four command-line arguments
    # (num_proc, min_num_proc, max_num_proc, hosts) training is launched
    # through horovod.run; otherwise main() is called directly.
    if len(sys.argv) == 5:
        # run training through horovod.run
        num_proc, min_num_proc, max_num_proc = (int(raw) for raw in sys.argv[1:4])
        hosts = sys.argv[4]
        print('Running training through horovod.run')
        horovod.run(
            main,
            hosts=hosts,
            num_proc=num_proc,
            min_num_proc=min_num_proc,
            max_num_proc=max_num_proc,
            use_gloo=True,
            verbose=2,
        )
    else:
        # this is running via horovodrun
        main()
        # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
        # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
        # the first three epochs. See https://arxiv.org/abs/1706.02677 for details.
        hvd.callbacks.LearningRateWarmupCallback(initial_lr=scaled_lr, warmup_epochs=3, verbose=1),
    ]

    # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
    if hvd.rank() == 0:
        callbacks.append(tf.keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))

    # Horovod: write logs on worker 0.
    verbose = 1 if hvd.rank() == 0 else 0

    # Train the model.
    # Horovod: adjust number of steps based on number of GPUs.
    mnist_model.fit(dataset, steps_per_epoch=500 // hvd.size(), callbacks=callbacks, epochs=24, verbose=verbose)


if __name__ == '__main__':
    # Script entry point. With exactly three command-line arguments
    # (process count, hosts, communication backend) training is launched
    # through horovod.run; otherwise main() is called directly.
    if len(sys.argv) == 4:
        # run training through horovod.run
        # NOTE(review): `np` holds the process count here, not the usual numpy alias.
        np = int(sys.argv[1])
        hosts, comm = sys.argv[2:4]
        print('Running training through horovod.run')
        horovod.run(
            main,
            np=np,
            hosts=hosts,
            use_gloo=(comm == 'gloo'),
            use_mpi=(comm == 'mpi'),
        )
    else:
        # this is running via horovodrun
        main()
Exemple #8
0
    if args.use_mixed_precision:
        # Initialize scaler in global scale
        scaler = torch.cuda.amp.GradScaler()

    for epoch in range(1, args.epochs + 1):
        if args.use_mixed_precision:
            train_mixed_precision(epoch, scaler)
        else:
            train_epoch(epoch)
        # Keep test in full precision since computation is relatively light.
        test()


if __name__ == '__main__':
    # Script entry point: parse the command line, decide on CUDA usage, then
    # either launch main() through horovod.run or call it directly.
    args = parser.parse_args()
    # CUDA is used only when not disabled by the flag and a device is available.
    args.cuda = False if args.no_cuda else torch.cuda.is_available()

    if not args.num_proc:
        # this is running via horovodrun
        main(args)
    else:
        # run training through horovod.run
        print('Running training through horovod.run')
        horovod.run(
            main,
            args=(args, ),
            np=args.num_proc,
            hosts=args.hosts,
            use_gloo=(args.communication == 'gloo'),
            use_mpi=(args.communication == 'mpi'),
        )
Exemple #9
0
            _train_step(batch_idx, data, target)

    # Specific hvd
    hvd.shutdown()


if __name__ == "__main__":
    # Script entry point: build the command-line parser, collect the training
    # configuration and launch `training` through `run`.
    parser = argparse.ArgumentParser("Torch Native - Horovod")
    parser.add_argument(
        '--no-cuda', action='store_true', default=False, help='disables CUDA training'
    )
    parser.add_argument("--nproc_per_node", type=int, default=2)
    parser.add_argument("--log_interval", type=int, default=4)
    parser.add_argument("--nb_samples", type=int, default=128)
    parser.add_argument("--batch_size", type=int, default=16)
    args_parsed = parser.parse_args()

    # CUDA is used only when not disabled by the flag and a device is available.
    args_parsed.cuda = False if args_parsed.no_cuda else torch.cuda.is_available()

    # Copy the selected options from the parsed namespace into the config dict.
    config = {
        option: getattr(args_parsed, option)
        for option in ("log_interval", "batch_size", "nb_samples")
    }

    args = (args_parsed.nproc_per_node, args_parsed.cuda, config)

    # Specific hvd
    run(training, args=args, np=args_parsed.nproc_per_node, use_gloo=True)