Example #1
def test_tf_non_distributed(ray_start_2_cpus):
    """Make sure Ray Train works without TF MultiWorkerMirroredStrategy."""

    trainer = Trainer(backend="tensorflow", num_workers=1)
    trainer.start()
    trainer.run(tf_quick_start_train_func)
    trainer.shutdown()
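`tf_quick_start_train_func` is defined elsewhere in Ray's test suite and isn't reproduced on this page. A minimal single-process TensorFlow training function in the same spirit might look like the sketch below; the model, data, and epoch count are assumptions, not the actual helper.
import numpy as np
import tensorflow as tf


def tf_quick_start_train_func():
    # Plain Keras fit loop; no MultiWorkerMirroredStrategy is involved.
    model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(1,))])
    model.compile(optimizer="sgd", loss="mse")
    x = np.random.rand(64, 1)
    y = 2 * x
    model.fit(x, y, epochs=1, verbose=0)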
Example #2
def test_torch_get_device_dist(ray_2_node_4_gpu, num_gpus_per_worker):
    @patch("torch.cuda.is_available", lambda: True)
    def train_fn():
        return train.torch.get_device().index

    trainer = Trainer(
        TorchConfig(backend="gloo"),
        num_workers=int(8 / num_gpus_per_worker),
        use_gpu=True,
        resources_per_worker={"GPU": num_gpus_per_worker},
    )
    trainer.start()
    devices = trainer.run(train_fn)
    trainer.shutdown()

    count = Counter(devices)
    if num_gpus_per_worker == 0.5:
        for i in range(4):
            assert count[i] == 4
    elif num_gpus_per_worker == 1:
        for i in range(4):
            assert count[i] == 2
    elif num_gpus_per_worker == 2:
        for i in range(2):
            assert count[2 * i] == 2
    else:
        raise RuntimeError(
            "New parameter for this test has been added without checking that the "
            "correct devices have been returned.")
Example #3
def test_torch_auto_unwrap(ray_start_2_cpus):
    """Tests if underlying model from DDP is extracted when saving ckpt."""
    def train_fn():
        model = torch.nn.Linear(1, 1)

        # Wrap in DDP.
        model = train.torch.prepare_model(model)

        # Save DDP wrapped model.
        train.save_checkpoint(model=model)

        # Report DDP wrapped model.
        train.report(model=model)

    num_workers = 2
    trainer = Trainer("torch", num_workers)
    trainer.start()

    class ValidateEncodedCallback(TrainingCallback):
        def handle_result(self, results, **info):
            for result in results:
                model = result["model"]
                assert isinstance(model, torch.nn.Module) and not isinstance(
                    model, torch.nn.parallel.DistributedDataParallel
                )

    trainer.run(train_fn, callbacks=[ValidateEncodedCallback()])

    last_checkpoint = trainer.latest_checkpoint
    model = last_checkpoint["model"]
    assert isinstance(model, torch.nn.Module) and not isinstance(
        model, torch.nn.parallel.DistributedDataParallel
    )

    trainer.shutdown()
Example #4
def test_torch_non_distributed(ray_start_2_cpus):
    """Make sure Ray Train works without torch DDP."""

    trainer = Trainer(backend="torch", num_workers=1)
    trainer.start()
    trainer.run(torch_quick_start_train_func)
    trainer.shutdown()
Example #5
def train_tensorflow_mnist(num_workers=2, use_gpu=False, epochs=4):
    trainer = Trainer(backend="tensorflow", num_workers=num_workers, use_gpu=use_gpu)
    trainer.start()
    results = trainer.run(
        train_func=train_func, config={"lr": 1e-3, "batch_size": 64, "epochs": epochs}
    )
    trainer.shutdown()
    print(f"Results: {results[0]}")
Example #6
def latency(amp: bool) -> float:
    trainer = Trainer("torch", num_workers=2, use_gpu=True)
    trainer.start()
    start_time = timer()
    trainer.run(train_func, {"amp": amp})
    end_time = timer()
    trainer.shutdown()
    return end_time - start_time
Example #7
def train_linear(num_workers=2, use_gpu=False, epochs=3):
    trainer = Trainer(backend="torch", num_workers=num_workers, use_gpu=use_gpu)
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
    trainer.start()
    results = trainer.run(
        train_func, config, callbacks=[JsonLoggerCallback(), TBXLoggerCallback()]
    )
    trainer.shutdown()

    print(results)
    return results
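`train_func` here (and `linear_train_func` in the later examples) is Ray's linear-regression example function. A hedged sketch consistent with the config keys and with the later `result[-1]["loss"] < result[0]["loss"]` assertions (the synthetic data and model shape are assumptions):
import torch
import ray.train as train


def train_func(config):
    # Synthetic y = 2x regression data.
    x = torch.randn(8 * config["batch_size"], 1)
    dataset = torch.utils.data.TensorDataset(x, 2 * x)
    loader = torch.utils.data.DataLoader(
        dataset, batch_size=config["batch_size"])
    loader = train.torch.prepare_data_loader(loader)

    model = torch.nn.Sequential(
        torch.nn.Linear(1, config["hidden_size"]),
        torch.nn.Linear(config["hidden_size"], 1),
    )
    model = train.torch.prepare_model(model)
    optimizer = torch.optim.SGD(model.parameters(), lr=config["lr"])
    loss_fn = torch.nn.MSELoss()

    results = []
    for _ in range(config["epochs"]):
        for inputs, targets in loader:
            optimizer.zero_grad()
            loss = loss_fn(model(inputs), targets)
            loss.backward()
            optimizer.step()
        # train.report feeds JsonLoggerCallback / TBXLoggerCallback;
        # the returned list becomes this worker's entry in `results`.
        train.report(loss=loss.item())
        results.append({"loss": loss.item()})
    return results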
Example #8
def test_horovod_simple(ray_start_2_cpus):
    def simple_fn():
        hvd_torch.init()
        return hvd_torch.rank()

    num_workers = 2
    trainer = Trainer("horovod", num_workers)
    trainer.start()
    result = trainer.run(simple_fn)
    trainer.shutdown()

    assert result == list(range(num_workers))
Example #9
def test_worker_kill_checkpoint(ray_start_2_cpus):
    test_config = TestConfig()

    def train_func():
        checkpoint = train.load_checkpoint()
        if checkpoint:
            epoch = checkpoint["epoch"]
        else:
            epoch = 0
        print("Epoch: ", epoch)
        for i in range(epoch, 2):
            train.report(loss=1, iter=i)
            train.save_checkpoint(epoch=i + 1)

    trainer = Trainer(test_config, num_workers=2)
    trainer.start()
    kill_callback = KillCallback(
        fail_on=0, worker_group=trainer._executor.worker_group)

    trainer.run(train_func, callbacks=[kill_callback])

    # Run 1: epoch=0, counter=1, Successful
    # *Checkpoint is saved.*
    # *Worker is killed*
    # *Getting checkpoint fails. Workers are restarted from beginning*
    # Run 2: epoch=0, counter=2, Successful
    # Run 3: epoch=1, counter=3, Successful
    assert kill_callback.counter == 3
    assert trainer.latest_checkpoint["epoch"] == 2

    trainer.shutdown()
    trainer.start()

    kill_callback = KillCallback(
        fail_on=1, worker_group=trainer._executor.worker_group)
    trainer.run(train_func, callbacks=[kill_callback])
    # Run 1: epoch=0, counter=1, Successful
    # *Checkpoint saved*
    # *Latest checkpoint updated, epoch=1*
    # Run 2: epoch=1, counter=2, Successful
    # *Checkpoint saved*
    # *Worker is killed*
    # *Getting checkpoint fails. Workers are restarted from last checkpoint.*
    # Run 3: epoch=1, counter=3, Successful.
    assert kill_callback.counter == 3
    assert trainer.latest_checkpoint["epoch"] == 2

    def train_func():
        return 1

    # Make sure Trainer is usable even after failure handling.
    trainer.run(train_func)
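`KillCallback` is a private helper from Ray's test suite, not a public API. Conceptually it counts `handle_result` rounds and forcefully kills one worker actor once the count hits `fail_on`; a simplified sketch of that idea (the attribute names on the worker group are assumptions about test internals):
import ray
from ray.train.callbacks import TrainingCallback


class KillCallbackSketch(TrainingCallback):
    """Kill one worker after `fail_on` result rounds (simplified)."""

    def __init__(self, fail_on, worker_group):
        self.fail_on = fail_on
        self.worker_group = worker_group
        self.counter = 0

    def handle_result(self, results, **info):
        if self.counter == self.fail_on:
            # Simulate a node failure by killing the first worker's actor;
            # Ray Train then restarts training from the latest checkpoint.
            ray.kill(self.worker_group.workers[0].actor)
        self.counter += 1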
Example #10
def train_linear(num_workers=2):
    trainer = Trainer(TorchConfig(backend="gloo"), num_workers=num_workers)
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3}
    trainer.start()
    results = trainer.run(
        train_func,
        config,
        callbacks=[JsonLoggerCallback(),
                   TBXLoggerCallback()])
    trainer.shutdown()

    print(results)
    return results
Example #11
def test_run(ray_start_2_cpus):
    config = TestConfig()

    def train_func():
        return 1

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    results = trainer.run(train_func)
    trainer.shutdown()

    assert len(results) == 2
    assert all(result == 1 for result in results)
Example #12
def test_checkpoint_torch_model_with_amp(ray_start_4_cpus_2_gpus):
    """Test that model with AMP is serializable."""
    def train_func():
        train.torch.accelerate(amp=True)

        model = torchvision.models.resnet101()
        model = train.torch.prepare_model(model)

        train.save_checkpoint(model=model)

    trainer = Trainer("torch", num_workers=1, use_gpu=True)
    trainer.start()
    trainer.run(train_func)
    trainer.shutdown()
Example #13
def test_horovod_torch_mnist_gpu(ray_start_4_cpus_2_gpus):
    num_workers = 2
    num_epochs = 2
    trainer = Trainer("horovod", num_workers, use_gpu=True)
    trainer.start()
    results = trainer.run(
        horovod_torch_train_func, config={"num_epochs": num_epochs, "lr": 1e-3}
    )
    trainer.shutdown()

    assert len(results) == num_workers
    for worker_result in results:
        assert len(worker_result) == num_epochs
        assert worker_result[num_epochs - 1] < worker_result[0]
Example #14
def test_gpu_requests(ray_start_4_cpus_4_gpus_4_extra):
    class CudaTestBackend(TestBackend):
        share_cuda_visible_devices = True

    class CudaTestConfig(TestConfig):
        @property
        def backend_cls(self):
            return CudaTestBackend

    # GPUs should not be requested if `use_gpu` is False.
    with pytest.raises(ValueError):
        Trainer(CudaTestConfig(),
                num_workers=2,
                use_gpu=False,
                resources_per_worker={"GPU": 1})

    # GPUs should not be set to 0 if `use_gpu` is True.
    with pytest.raises(ValueError):
        Trainer(CudaTestConfig(),
                num_workers=2,
                use_gpu=True,
                resources_per_worker={"GPU": 0})

    def get_resources():
        return os.environ["CUDA_VISIBLE_DEVICES"]

    # 0 GPUs will be requested and should not raise an error.
    trainer = Trainer(CudaTestConfig(), num_workers=2, use_gpu=False)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["", ""]
    trainer.shutdown()

    # 1 GPU will be requested and should not raise an error.
    trainer = Trainer(CudaTestConfig(), num_workers=2, use_gpu=True)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1", "0,1"]
    trainer.shutdown()

    # Partial GPUs should not raise an error.
    trainer = Trainer(CudaTestConfig(),
                      num_workers=2,
                      use_gpu=True,
                      resources_per_worker={"GPU": 0.1})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0", "0"]
    trainer.shutdown()

    # Multiple GPUs should not raise an error.
    trainer = Trainer(CudaTestConfig(),
                      num_workers=2,
                      use_gpu=True,
                      resources_per_worker={"GPU": 2})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1,2,3", "0,1,2,3"]
    trainer.shutdown()
Example #15
def test_torch_fashion_mnist(ray_start_2_cpus):
    num_workers = 2
    epochs = 3

    trainer = Trainer("torch", num_workers=num_workers)
    config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs}
    trainer.start()
    results = trainer.run(fashion_mnist_train_func, config)
    trainer.shutdown()

    assert len(results) == num_workers

    for result in results:
        assert len(result) == epochs
        assert result[-1] < result[0]
Example #16
def test_torch_linear(ray_start_2_cpus, num_workers):
    epochs = 3

    trainer = Trainer("torch", num_workers=num_workers)
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
    trainer.start()
    results = trainer.run(linear_train_func, config)
    trainer.shutdown()

    assert len(results) == num_workers

    for result in results:
        assert len(result) == epochs
        assert result[-1]["loss"] < result[0]["loss"]
Example #17
def test_torch_amp(ray_start_2_cpus):
    def train_fn():
        train.torch.accelerate(amp=True)
        model = torch.nn.Linear(1, 1)
        model = train.torch.prepare_model(model)

        # Make sure model is serializable even with amp enabled.
        return model.module

    num_workers = 2
    trainer = Trainer("torch", num_workers)
    trainer.start()

    trainer.run(train_fn)
    trainer.shutdown()
Example #18
def test_run_config(ray_start_2_cpus):
    backend_config = TestConfig()

    def train_func(config):
        return config["fruit"]

    config = {"fruit": "banana"}

    trainer = Trainer(backend_config, num_workers=2)
    trainer.start()
    results = trainer.run(train_func, config)
    trainer.shutdown()

    assert len(results) == 2
    assert all(result == "banana" for result in results)
Example #19
def train_linear(num_workers=2, use_gpu=False):
    datasets = get_datasets()

    trainer = Trainer("torch", num_workers=num_workers, use_gpu=use_gpu)
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3}
    trainer.start()
    results = trainer.run(
        train_func,
        config,
        dataset=datasets,
        callbacks=[JsonLoggerCallback(), TBXLoggerCallback()],
    )
    trainer.shutdown()
    print(results)
    return results
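`get_datasets()` isn't shown. With this legacy API, `trainer.run(dataset=...)` splits a Ray Dataset across the workers, and the training function retrieves its shard via `train.get_dataset_shard()`. A sketch of both sides, with the column names and sizes as assumptions:
import ray
import ray.train as train


def get_datasets():
    # A small synthetic y = 2x dataset as a Ray Dataset.
    return ray.data.from_items(
        [{"x": float(i), "y": 2.0 * i} for i in range(32)])


# Inside train_func, each worker consumes only its shard, e.g.:
#     shard = train.get_dataset_shard()
#     torch_ds = shard.to_torch(label_column="y",
#                               batch_size=config["batch_size"])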
Example #20
def test_torch_linear_failure(ray_start_4_cpus):
    num_workers = 2
    epochs = 3

    trainer = Trainer("torch", num_workers=num_workers)
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
    trainer.start()
    kill_callback = KillCallback(fail_on=1, trainer=trainer)
    results = trainer.run(linear_train_func, config, callbacks=[kill_callback])
    trainer.shutdown()

    assert len(results) == num_workers

    for result in results:
        assert len(result) == epochs
        assert result[-1]["loss"] < result[0]["loss"]
Example #21
def test_start_shutdown(ray_start_2_cpus, num_workers):
    config = TestConfig()
    assert ray.available_resources()["CPU"] == 2
    trainer = Trainer(config, num_workers=num_workers)
    trainer.start()
    time.sleep(1)

    remaining = 2 - num_workers
    if remaining == 0:
        assert "CPU" not in ray.available_resources()
    else:
        assert ray.available_resources()["CPU"] == remaining

    trainer.shutdown()
    time.sleep(1)
    assert ray.available_resources()["CPU"] == 2
Example #22
def train_tensorflow_linear(num_workers=2, use_gpu=False):
    dataset_pipeline = get_dataset_pipeline()
    trainer = Trainer(backend="tensorflow",
                      num_workers=num_workers,
                      use_gpu=use_gpu)
    trainer.start()
    results = trainer.run(train_func=train_func,
                          dataset=dataset_pipeline,
                          config={
                              "lr": 1e-3,
                              "batch_size": 32,
                              "epochs": 4
                          })
    trainer.shutdown()
    print(f"Results: {results[0]}")
    return results
Example #23
def test_gpu_requests(ray_start_4_cpus_4_gpus_4_extra):
    # GPUs should not be requested if `use_gpu` is False.
    with pytest.raises(ValueError):
        Trainer(
            TestConfig(),
            num_workers=2,
            use_gpu=False,
            resources_per_worker={"GPU": 1})

    # GPUs should not be set to 0 if `use_gpu` is True.
    with pytest.raises(ValueError):
        Trainer(
            TestConfig(),
            num_workers=2,
            use_gpu=True,
            resources_per_worker={"GPU": 0})

    def get_resources():
        return os.environ["CUDA_VISIBLE_DEVICES"]

    os.environ[ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV] = "1"

    # 0 GPUs will be requested and should not raise an error.
    trainer = Trainer(TestConfig(), num_workers=2, use_gpu=False)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["", ""]
    trainer.shutdown()

    # 1 GPU will be requested and should not raise an error.
    trainer = Trainer(TestConfig(), num_workers=2, use_gpu=True)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1", "0,1"]
    trainer.shutdown()

    # Partial GPUs should not raise an error.
    trainer = Trainer(
        TestConfig(),
        num_workers=2,
        use_gpu=True,
        resources_per_worker={"GPU": 0.1})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0", "0"]
    trainer.shutdown()

    # Multiple GPUs should not raise an error.
    trainer = Trainer(
        TestConfig(),
        num_workers=2,
        use_gpu=True,
        resources_per_worker={"GPU": 2})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1,2,3", "0,1,2,3"]
    trainer.shutdown()
Example #24
def test_horovod_torch_mnist_stateful(ray_start_4_cpus):
    num_workers = 2
    num_epochs = 2
    trainer = Trainer("horovod", num_workers)
    workers = trainer.to_worker_group(HorovodTrainClass,
                                      config={
                                          "num_epochs": num_epochs,
                                          "lr": 1e-3
                                      })
    results = []
    for epoch in range(num_epochs):
        results.append(ray.get([w.train.remote(epoch=epoch) for w in workers]))
    trainer.shutdown()

    assert len(results) == num_epochs
    for i in range(num_workers):
        assert results[num_epochs - 1][i] < results[0][i]
Example #25
def start_ray_train(config, num_workers=4, use_gpu=False):
    """Train the model using Ray Train.

    num_workers determines the number of worker processes.
    Uses the same config as local training.
    """
    trainer = Trainer(backend="tensorflow", num_workers=num_workers, use_gpu=use_gpu)
    trainer.start()

    start_time = time.time()
    results = trainer.run(train_epochs_remote, config=config)

    duration = time.time() - start_time

    trainer.shutdown()

    return None, results, duration
Example #26
def test_run_failure(ray_start_2_cpus):
    test_config = TestConfig()

    def train_invalid_signature(a, b):
        pass

    trainer = Trainer(test_config, num_workers=2)

    # Raise RuntimeError when trainer has not been started yet.
    with pytest.raises(RuntimeError):
        trainer.run(lambda: 1)

    trainer.start()

    with pytest.raises(ValueError):
        trainer.run(train_invalid_signature)

    trainer.shutdown()
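The `ValueError` above comes from signature validation: with this API, a training function must take either no arguments or exactly one argument (the config dict). Both of the following would pass validation, while a two-argument function like `train_invalid_signature` does not (illustrative only):
def valid_no_args():
    return 1


def valid_with_config(config):
    return config.get("fruit")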
Example #27
def test_torch_prepare_model(ray_start_4_cpus_2_gpus):
    """Tests if ``prepare_model`` correctly wraps in DDP."""
    def train_fn():
        model = torch.nn.Linear(1, 1)

        # Wrap in DDP.
        model = train.torch.prepare_model(model)

        # Make sure model is wrapped in DDP.
        assert isinstance(model, DistributedDataParallel)

        # Make sure model is on cuda.
        assert next(model.parameters()).is_cuda

    trainer = Trainer("torch", num_workers=2, use_gpu=True)
    trainer.start()
    trainer.run(train_fn)
    trainer.shutdown()
Example #28
def test_worker_kill(ray_start_2_cpus, backend):
    if backend == "test":
        test_config = TestConfig()
    elif backend == "torch":
        test_config = TorchConfig()
    elif backend == "tf":
        test_config = TensorflowConfig()
    elif backend == "horovod":
        test_config = HorovodConfig()

    trainer = Trainer(test_config, num_workers=2)

    def train_func():
        for i in range(2):
            train.report(loss=1, iter=i)

    trainer.start()
    kill_callback = KillCallback(
        fail_on=0, worker_group=trainer._executor.worker_group)
    trainer.run(train_func, callbacks=[kill_callback])
    # Run 1: iter=0, counter=1, Successful
    # Run 2: iter=1, counter=1, Unsuccessful, starts training from beginning
    # Run 3: iter=0, counter=2, Successful
    # Run 4: iter=1, counter=3, Successful
    assert kill_callback.counter == 3

    trainer.shutdown()
    trainer.start()

    kill_callback = KillCallback(
        fail_on=1, worker_group=trainer._executor.worker_group)
    trainer.run(train_func, callbacks=[kill_callback])
    # Run 1: iter=0, counter=1, Successful
    # Run 2: iter=1, counter=2, Successful
    # Run 3: None, counter=2, Unsuccessful, starts training from beginning.
    # Run 4: iter=0, counter=3, Successful
    # Run 5: iter=1, counter=4, Successful
    assert kill_callback.counter == 4

    def train_func():
        return 1

    # Make sure Trainer is usable even after failure handling.
    trainer.run(train_func)
Example #29
def test_enable_reproducibility(ray_start_4_cpus_2_gpus, use_gpu):
    # NOTE: Reproducible results aren't guaranteed between seeded executions, even with
    # identical hardware and software dependencies. This test should be okay given that
    # it only runs for two epochs on a small dataset.
    # NOTE: I've chosen to use a ResNet model over a more simple model, because
    # `enable_reproducibility` disables CUDA convolution benchmarking, and a simpler
    # model (e.g., linear) might not test this feature.
    def train_func():
        train.torch.enable_reproducibility()

        model = torchvision.models.resnet18()
        model = train.torch.prepare_model(model)

        dataset_length = 128
        dataset = torch.utils.data.TensorDataset(
            torch.randn(dataset_length, 3, 32, 32),
            torch.randint(low=0, high=1000, size=(dataset_length, )),
        )
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=64)
        dataloader = train.torch.prepare_data_loader(dataloader)

        optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

        model.train()
        for epoch in range(2):
            for images, targets in dataloader:
                optimizer.zero_grad()

                outputs = model(images)
                loss = torch.nn.functional.cross_entropy(outputs, targets)

                loss.backward()
                optimizer.step()

        return loss.item()

    trainer = Trainer("torch", num_workers=2, use_gpu=use_gpu)
    trainer.start()
    result1 = trainer.run(train_func)
    result2 = trainer.run(train_func)
    trainer.shutdown()

    assert result1 == result2
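`train.torch.enable_reproducibility()` bundles the usual determinism settings. A rough manual equivalent, sketching the idea rather than the exact implementation:
import random

import numpy as np
import torch


def enable_reproducibility_sketch(seed: int = 0):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Force deterministic kernels and disable cuDNN convolution
    # autotuning; this is why a conv-heavy ResNet exercises the
    # feature better than a plain linear model.
    torch.use_deterministic_algorithms(True)
    torch.backends.cudnn.benchmark = False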
Example #30
def test_resources(ray_start_4_cpus_4_gpus_4_extra, resource, num_requested):
    num_workers = 2
    config = TestConfig()
    original = ray.available_resources().get(resource)
    resources_per_worker = {resource: num_requested}
    use_gpu = resource == "GPU"
    trainer = Trainer(config,
                      num_workers=num_workers,
                      use_gpu=use_gpu,
                      resources_per_worker=resources_per_worker)

    trainer.start()
    expected = original - num_workers * num_requested
    wait_for_condition(
        lambda: ray.available_resources().get(resource, 0) == expected)

    trainer.shutdown()
    wait_for_condition(
        lambda: ray.available_resources().get(resource, 0) == original)