Example #1
def test_checkpoint(ray_start_2_cpus):
    config = TestConfig()

    def train_func():
        assert train.load_checkpoint() is None
        for i in range(3):
            train.save_checkpoint(epoch=i)
        return 1

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    trainer.run(train_func)
    checkpoint = trainer.latest_checkpoint

    assert checkpoint is not None
    assert checkpoint["epoch"] == 2

    def train_func_checkpoint():
        checkpoint = train.load_checkpoint()
        assert checkpoint is not None
        assert checkpoint["epoch"] == 2

        for i in range(checkpoint["epoch"], 5):
            train.save_checkpoint(epoch=i)
        return 1

    trainer.run(train_func_checkpoint, checkpoint=checkpoint)
    checkpoint = trainer.latest_checkpoint

    assert checkpoint is not None
    assert checkpoint["epoch"] == 4
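
These tests rely on a TestConfig backend configuration that is not shown in the snippets. A minimal sketch of such a no-op backend, assuming the Backend/BackendConfig interfaces from ray.train.backend in Ray 1.x (the real TestConfig may differ):

from ray.train.backend import Backend, BackendConfig


class TestBackend(Backend):
    def on_start(self, worker_group, backend_config):
        pass  # No framework-specific setup needed for these tests.

    def on_shutdown(self, worker_group, backend_config):
        pass  # Nothing to tear down.


class TestConfig(BackendConfig):
    @property
    def backend_cls(self):
        return TestBackend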
Example #2
def test_tf_non_distributed(ray_start_2_cpus):
    """Make sure Ray Train works without TF MultiWorkerMirroredStrategy."""

    trainer = Trainer(backend="tensorflow", num_workers=1)
    trainer.start()
    trainer.run(tf_quick_start_train_func)
    trainer.shutdown()
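
tf_quick_start_train_func is imported from the quick start example and is not reproduced here. A minimal stand-in, assuming nothing beyond plain Keras (no MultiWorkerMirroredStrategy is ever constructed):

import numpy as np
import tensorflow as tf


def tf_quick_start_train_func():
    # Ordinary single-process Keras training; no distribution strategy.
    x = np.random.rand(32, 1).astype(np.float32)
    y = 2 * x
    model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(1,))])
    model.compile(optimizer="sgd", loss="mse")
    model.fit(x, y, epochs=1, verbose=0)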
Example #3
def test_mismatch_checkpoint_report(ray_start_2_cpus):
    test_config = TestConfig()

    def train_func():
        for i in range(2):
            train.save_checkpoint(epoch=i)
            train.report(index=i)

    def train_mismatch():
        train.save_checkpoint(epoch=0)
        train.report(index=0)
        # skip checkpoint
        train.report(index=1)

    new_backend_executor_cls = gen_new_backend_executor(train_mismatch)
    callback = TestCallback()

    with patch.object(ray.train.trainer, "BackendExecutor",
                      new_backend_executor_cls):
        trainer = Trainer(test_config, num_workers=2)
        trainer.start()
        with pytest.raises(RuntimeError):
            trainer.run(train_func, callbacks=[callback])
    # validate checkpoint
    assert trainer.latest_checkpoint["epoch"] == 0
    # validate callback
    result_list = callback.result_list
    assert len(result_list) == 1  # 1 epoch succeeded
    intermediate_results = result_list[0]
    assert len(intermediate_results) == 2  # both workers reported
    for worker_result in intermediate_results:
        assert worker_result["index"] == 0
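
The TestCallback used here and in Example #4 simply records every batch of worker results. A plausible sketch, assuming the TrainingCallback.handle_result(results, **info) hook shown in Example #6 (the real helper may differ):

from ray.train.callbacks import TrainingCallback


class TestCallback(TrainingCallback):
    def __init__(self):
        self.result_list = []

    def handle_result(self, results, **info):
        # ``results`` holds one dict per worker for a single
        # train.report() call; keep them all for later assertions.
        self.result_list.append(results)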
Example #4
def test_fast_slow(ray_start_2_cpus):
    test_config = TestConfig()

    def train_func():
        for i in range(2):
            train.save_checkpoint(epoch=i)
            train.report(index=i)

    def train_slow():
        for i in range(2):
            train.save_checkpoint(epoch=i)
            time.sleep(5)
            train.report(index=i)
            time.sleep(5)

    new_backend_executor_cls = gen_new_backend_executor(train_slow)
    callback = TestCallback()

    with patch.object(ray.train.trainer, "BackendExecutor",
                      new_backend_executor_cls):
        trainer = Trainer(test_config, num_workers=2)
        trainer.start()
        trainer.run(train_func, callbacks=[callback])

    assert trainer.latest_checkpoint["epoch"] == 1

    result_list = callback.result_list
    assert len(result_list) == 2
    for index in range(len(result_list)):
        intermediate_results = result_list[index]
        assert len(intermediate_results) == 2
        for worker_result in intermediate_results:
            assert worker_result["index"] == index
Example #5
def test_persisted_checkpoint(ray_start_2_cpus, logdir):
    config = TestConfig()

    def train_func():
        for i in range(2):
            train.save_checkpoint(epoch=i)
            time.sleep(1)

    trainer = Trainer(config, num_workers=2, logdir=logdir)
    trainer.start()
    trainer.run(train_func)

    assert trainer.best_checkpoint_path is not None
    if logdir is not None:
        assert trainer.logdir == Path(logdir).expanduser().resolve()
    assert trainer.latest_checkpoint_dir.is_dir()
    assert trainer.best_checkpoint_path.is_file()
    assert trainer.best_checkpoint_path.name == f"checkpoint_{2:06d}"
    assert trainer.best_checkpoint_path.parent.name == "checkpoints"
    latest_checkpoint = trainer.latest_checkpoint

    def validate():
        checkpoint = train.load_checkpoint()
        assert checkpoint is not None
        assert checkpoint == latest_checkpoint

    trainer.run(validate, checkpoint=trainer.best_checkpoint_path)
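
The logdir argument is supplied by pytest parametrization that is not shown above; hypothetical values compatible with the assertions would be:

# Hypothetical parametrization; the real values are not shown in the snippet.
@pytest.mark.parametrize("logdir", [None, "/tmp/ray_train_test_logdir"])
def test_persisted_checkpoint(ray_start_2_cpus, logdir):
    ...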
Example #6
def test_torch_auto_unwrap(ray_start_2_cpus):
    """Tests if underlying model from DDP is extracted when saving ckpt."""
    def train_fn():
        model = torch.nn.Linear(1, 1)

        # Wrap in DDP.
        model = train.torch.prepare_model(model)

        # Save DDP wrapped model.
        train.save_checkpoint(model=model)

        # Report DDP wrapped model.
        train.report(model=model)

    num_workers = 2
    trainer = Trainer("torch", num_workers)
    trainer.start()

    class ValidateEncodedCallback(TrainingCallback):
        def handle_result(self, results, **info):
            for result in results:
                model = result["model"]
                assert isinstance(model, torch.nn.Module) and not \
                    isinstance(model,
                               torch.nn.parallel.DistributedDataParallel)

    trainer.run(train_fn, callbacks=[ValidateEncodedCallback()])

    last_checkpoint = trainer.latest_checkpoint
    model = last_checkpoint["model"]
    assert isinstance(model, torch.nn.Module) and not \
        isinstance(model, torch.nn.parallel.DistributedDataParallel)

    trainer.shutdown()
Example #7
def test_torch_non_distributed(ray_start_2_cpus):
    """Make sure Ray Train works without torch DDP."""

    trainer = Trainer(backend="torch", num_workers=1)
    trainer.start()
    trainer.run(torch_quick_start_train_func)
    trainer.shutdown()
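
As with the TensorFlow variant, torch_quick_start_train_func comes from the quick start example and is not shown. A minimal stand-in that never touches DistributedDataParallel:

import torch


def torch_quick_start_train_func():
    # Plain single-process training loop; the model is never wrapped in DDP.
    model = torch.nn.Linear(1, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
    loss_fn = torch.nn.MSELoss()
    x = torch.randn(8, 1)
    y = 2 * x
    for _ in range(3):
        optimizer.zero_grad()
        loss = loss_fn(model(x), y)
        loss.backward()
        optimizer.step()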
Example #8
def test_dataset_fault_tolerance(ray_start_4_cpus):
    dataset = ray.data.range(10)
    test_config = TestConfig()

    def train_func():
        return train.get_dataset_shard()

    def train_actor_failure():
        import sys
        sys.exit(0)

    new_backend_executor_cls = gen_new_backend_executor(train_actor_failure)

    class SingleGetDatasetShardsBackendExecutor(new_backend_executor_cls):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self._has_called_get_dataset_shards = False

        def _get_dataset_shards(self, dataset_or_dict):
            if self._has_called_get_dataset_shards:
                raise Exception
            self._has_called_get_dataset_shards = True
            return super()._get_dataset_shards(dataset_or_dict)

    with patch.object(ray.train.trainer, "BackendExecutor",
                      SingleGetDatasetShardsBackendExecutor):
        trainer = Trainer(test_config, num_workers=2)
        trainer.start()
        trainer.run(train_func, dataset=dataset)
Example #9
def test_gpu_requests(ray_start_4_cpus_4_gpus_4_extra):
    class CudaTestBackend(TestBackend):
        share_cuda_visible_devices = True

    class CudaTestConfig(TestConfig):
        @property
        def backend_cls(self):
            return CudaTestBackend

    # GPUs should not be requested if `use_gpu` is False.
    with pytest.raises(ValueError):
        Trainer(CudaTestConfig(),
                num_workers=2,
                use_gpu=False,
                resources_per_worker={"GPU": 1})

    # GPUs should not be set to 0 if `use_gpu` is True.
    with pytest.raises(ValueError):
        Trainer(CudaTestConfig(),
                num_workers=2,
                use_gpu=True,
                resources_per_worker={"GPU": 0})

    def get_resources():
        return os.environ["CUDA_VISIBLE_DEVICES"]

    # 0 GPUs will be requested and should not raise an error.
    trainer = Trainer(CudaTestConfig(), num_workers=2, use_gpu=False)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["", ""]
    trainer.shutdown()

    # 1 GPU will be requested and should not raise an error.
    trainer = Trainer(CudaTestConfig(), num_workers=2, use_gpu=True)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1", "0,1"]
    trainer.shutdown()

    # Partial GPUs should not raise an error.
    trainer = Trainer(CudaTestConfig(),
                      num_workers=2,
                      use_gpu=True,
                      resources_per_worker={"GPU": 0.1})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0", "0"]
    trainer.shutdown()

    # Multiple GPUs should not raise an error.
    trainer = Trainer(CudaTestConfig(),
                      num_workers=2,
                      use_gpu=True,
                      resources_per_worker={"GPU": 2})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1,2,3", "0,1,2,3"]
    trainer.shutdown()
Example #10
def test_json(monkeypatch, ray_start_4_cpus, make_temp_dir, workers_to_log,
              detailed, filename):
    if detailed:
        monkeypatch.setenv(ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, "1")

    config = TestConfig()

    num_iters = 5
    num_workers = 4

    if workers_to_log is None:
        num_workers_to_log = num_workers
    elif isinstance(workers_to_log, int):
        num_workers_to_log = 1
    else:
        num_workers_to_log = len(workers_to_log)

    def train_func():
        for i in range(num_iters):
            train.report(index=i)
        return 1

    if filename is None:
        # if None, use default value
        callback = JsonLoggerCallback(workers_to_log=workers_to_log)
    else:
        callback = JsonLoggerCallback(filename=filename,
                                      workers_to_log=workers_to_log)
    trainer = Trainer(config, num_workers=num_workers, logdir=make_temp_dir)
    trainer.start()
    trainer.run(train_func, callbacks=[callback])
    if filename is None:
        assert str(
            callback.log_path.name) == JsonLoggerCallback._default_filename
    else:
        assert str(callback.log_path.name) == filename

    with open(callback.log_path, "r") as f:
        log = json.load(f)
    print(log)
    assert len(log) == num_iters
    assert len(log[0]) == num_workers_to_log
    assert all(len(element) == len(log[0]) for element in log)
    assert all(
        all(worker["index"] == worker[TRAINING_ITERATION] - 1
            for worker in element) for element in log)
    assert all(
        all(
            all(key in worker for key in BASIC_AUTOFILLED_KEYS)
            for worker in element) for element in log)
    if detailed:
        assert all(
            all(
                all(key in worker for key in DETAILED_AUTOFILLED_KEYS)
                for worker in element) for element in log)
    else:
        assert all(
            all(not any(key in worker for key in DETAILED_AUTOFILLED_KEYS)
                for worker in element) for element in log)
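
workers_to_log, detailed, and filename are pytest parameters whose values are not shown; a hypothetical parametrization consistent with the branches handled above:

# Hypothetical values, chosen so that workers_to_log exercises the
# int, None, and list branches in the test body.
@pytest.mark.parametrize("workers_to_log", [0, None, [0, 1]])
@pytest.mark.parametrize("detailed", [False, True])
@pytest.mark.parametrize("filename", [None, "my_own_filename.json"])
def test_json(monkeypatch, ray_start_4_cpus, make_temp_dir, workers_to_log,
              detailed, filename):
    ...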
Example #11
def latency(amp: bool) -> float:
    trainer = Trainer("torch", num_workers=2, use_gpu=True)
    trainer.start()
    start_time = timer()
    trainer.run(train_func, {"amp": amp})
    end_time = timer()
    trainer.shutdown()
    return end_time - start_time
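
This helper is a fragment of a larger AMP benchmark; train_func and timer (presumably timeit.default_timer) are defined elsewhere. A possible driver comparing the two settings:

# Hypothetical usage of the helper above.
latency_fp32 = latency(amp=False)
latency_amp = latency(amp=True)
print(f"Latency without AMP: {latency_fp32:.2f} s")
print(f"Latency with AMP:    {latency_amp:.2f} s")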
Example #12
def test_gpu_requests(ray_start_4_cpus_4_gpus_4_extra):
    # GPUs should not be requested if `use_gpu` is False.
    with pytest.raises(ValueError):
        Trainer(
            TestConfig(),
            num_workers=2,
            use_gpu=False,
            resources_per_worker={"GPU": 1})

    # GPUs should not be set to 0 if `use_gpu` is True.
    with pytest.raises(ValueError):
        Trainer(
            TestConfig(),
            num_workers=2,
            use_gpu=True,
            resources_per_worker={"GPU": 0})

    def get_resources():
        return os.environ["CUDA_VISIBLE_DEVICES"]

    os.environ[ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV] = "1"

    # 0 GPUs will be requested and should not raise an error.
    trainer = Trainer(TestConfig(), num_workers=2, use_gpu=False)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["", ""]
    trainer.shutdown()

    # 1 GPU will be requested and should not raise an error.
    trainer = Trainer(TestConfig(), num_workers=2, use_gpu=True)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1", "0,1"]
    trainer.shutdown()

    # Partial GPUs should not raise an error.
    trainer = Trainer(
        TestConfig(),
        num_workers=2,
        use_gpu=True,
        resources_per_worker={"GPU": 0.1})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0", "0"]
    trainer.shutdown()

    # Multiple GPUs should not raise an error.
    trainer = Trainer(
        TestConfig(),
        num_workers=2,
        use_gpu=True,
        resources_per_worker={"GPU": 2})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1,2,3", "0,1,2,3"]
    trainer.shutdown()
Example #13
def test_checkpoint_torch_model_with_amp(ray_start_4_cpus_2_gpus):
    """Test that model with AMP is serializable."""
    def train_func():
        train.torch.accelerate(amp=True)

        model = torchvision.models.resnet101()
        model = train.torch.prepare_model(model)

        train.save_checkpoint(model=model)

    trainer = Trainer("torch", num_workers=1, use_gpu=True)
    trainer.start()
    trainer.run(train_func)
    trainer.shutdown()
Example #14
def test_torch_amp(ray_start_2_cpus):
    def train_fn():
        train.torch.accelerate(amp=True)
        model = torch.nn.Linear(1, 1)
        model = train.torch.prepare_model(model)

        # Make sure model is serializable even with amp enabled.
        return model.module

    num_workers = 2
    trainer = Trainer("torch", num_workers)
    trainer.start()

    trainer.run(train_fn)
    trainer.shutdown()
Example #15
def test_mlflow(ray_start_4_cpus, tmp_path):
    config = TestConfig()

    params = {"p1": "p1"}

    temp_dir = tmp_path
    num_workers = 4

    def train_func(config):
        train.report(episode_reward_mean=4)
        train.report(episode_reward_mean=5)
        train.report(episode_reward_mean=6)
        return 1

    callback = MLflowLoggerCallback(experiment_name="test_exp",
                                    logdir=temp_dir)
    trainer = Trainer(config, num_workers=num_workers)
    trainer.start()
    trainer.run(train_func, config=params, callbacks=[callback])

    from mlflow.tracking import MlflowClient

    client = MlflowClient(
        tracking_uri=callback.mlflow_util._mlflow.get_tracking_uri())

    experiment_id = client.get_experiment_by_name("test_exp").experiment_id
    all_runs = callback.mlflow_util._mlflow.search_runs(
        experiment_ids=[experiment_id])
    assert len(all_runs) == 1
    # all_runs is a pandas dataframe.
    all_runs = all_runs.to_dict(orient="records")
    run_id = all_runs[0]["run_id"]
    run = client.get_run(run_id)

    assert run.data.params == params
    assert ("episode_reward_mean" in run.data.metrics
            and run.data.metrics["episode_reward_mean"] == 6.0)
    assert (TRAINING_ITERATION in run.data.metrics
            and run.data.metrics[TRAINING_ITERATION] == 3.0)

    metric_history = client.get_metric_history(run_id=run_id,
                                               key="episode_reward_mean")

    assert len(metric_history) == 3
    iterations = [metric.step for metric in metric_history]
    assert iterations == [1, 2, 3]
    rewards = [metric.value for metric in metric_history]
    assert rewards == [4, 5, 6]
Example #16
def test_dataset_pipeline(ray_start_4_cpus):
    """Checks that Pipeline is correctly sharded even with multiple epochs."""
    num_epochs = 2
    num_data = 10

    dataset = ray.data.range(num_data).repeat()

    def get_dataset():
        pipeline_iterator = train.get_dataset_shard().iter_epochs()
        data_all_epochs = []
        for _ in range(num_epochs):
            dataset_this_epoch = next(pipeline_iterator)
            data_this_epoch = []
            for batch in dataset_this_epoch.iter_batches(
                    batch_format="native"):
                data_this_epoch.extend(batch)
            data_all_epochs.append(data_this_epoch)
        return data_all_epochs

    config = TestConfig()

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    results = trainer.run(get_dataset, dataset=dataset)
    check_dataset_output(num_data, num_epochs, results)
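
check_dataset_output is shared with Example #18 but not shown. A sketch of what it plausibly verifies, assuming each worker returns one list of records per epoch and the worker shards together cover the full range:

def check_dataset_output(num_data, num_epochs, worker_outputs):
    # Hypothetical checker: every worker produced one list per epoch...
    for worker_data in worker_outputs:
        assert len(worker_data) == num_epochs
    # ...and, per epoch, the shards together cover each record exactly once.
    for epoch in range(num_epochs):
        epoch_data = []
        for worker_data in worker_outputs:
            epoch_data.extend(worker_data[epoch])
        assert sorted(epoch_data) == list(range(num_data))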
Example #17
def test_torch_get_device_dist(ray_2_node_4_gpu, num_gpus_per_worker):
    @patch("torch.cuda.is_available", lambda: True)
    def train_fn():
        return train.torch.get_device().index

    trainer = Trainer(
        TorchConfig(backend="gloo"),
        num_workers=int(8 / num_gpus_per_worker),
        use_gpu=True,
        resources_per_worker={"GPU": num_gpus_per_worker},
    )
    trainer.start()
    devices = trainer.run(train_fn)
    trainer.shutdown()

    count = Counter(devices)
    if num_gpus_per_worker == 0.5:
        for i in range(4):
            assert count[i] == 4
    elif num_gpus_per_worker == 1:
        for i in range(4):
            assert count[i] == 2
    elif num_gpus_per_worker == 2:
        for i in range(2):
            assert count[2 * i] == 2
    else:
        raise RuntimeError(
            "New parameter for this test has been added without checking that the "
            "correct devices have been returned.")
Example #18
def test_dataset_pipeline_shuffle(ray_start_4_cpus):
    num_epochs = 2
    num_data = 20

    dataset = ray.data.range(num_data).repeat().random_shuffle_each_window()

    def get_dataset():
        pipeline_iterator = train.get_dataset_shard().iter_epochs()
        data_all_epochs = []
        for _ in range(num_epochs):
            dataset_this_epoch = next(pipeline_iterator)
            data_this_epoch = []
            for batch in dataset_this_epoch.iter_batches(
                    batch_format="native"):
                data_this_epoch.extend(batch)

            if len(data_all_epochs) > 0:
                # Make sure data is shuffled per epoch.
                assert data_this_epoch != data_all_epochs[-1]

            data_all_epochs.append(data_this_epoch)
        return data_all_epochs

    config = TestConfig()

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    results = trainer.run(get_dataset, dataset=dataset)
    check_dataset_output(num_data, num_epochs, results)
Example #19
def test_run_after_user_error(ray_start_2_cpus):
    config = TestConfig()

    def fail_train():
        raise NotImplementedError

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    with pytest.raises(NotImplementedError):
        trainer.run(fail_train)

    def train_func():
        return 1

    output = trainer.run(train_func)
    assert output == [1, 1]
Example #20
def train_tensorflow_mnist(num_workers=2, use_gpu=False, epochs=4):
    trainer = Trainer(backend="tensorflow", num_workers=num_workers, use_gpu=use_gpu)
    trainer.start()
    results = trainer.run(
        train_func=train_func, config={"lr": 1e-3, "batch_size": 64, "epochs": epochs}
    )
    trainer.shutdown()
    print(f"Results: {results[0]}")
Example #21
def test_load_checkpoint_from_path(ray_start_2_cpus, tmpdir):
    config = TestConfig()

    checkpoint_strategy = CheckpointStrategy(checkpoint_score_attribute="loss",
                                             checkpoint_score_order="min")

    def train_func_checkpoint():
        train.save_checkpoint(loss=3)
        train.save_checkpoint(loss=7)

    trainer = Trainer(config, num_workers=2, logdir=tmpdir)
    trainer.start()
    trainer.run(train_func_checkpoint, checkpoint_strategy=checkpoint_strategy)

    assert trainer.best_checkpoint["loss"] == 3
    assert (Trainer.load_checkpoint_from_path(
        trainer.best_checkpoint_path) == trainer.best_checkpoint)
Example #22
def test_multiple_run(ray_start_2_cpus):
    config = TestConfig()

    def train_1():
        return 1

    trainer = Trainer(config, num_workers=2)
    trainer.start()

    output_1 = trainer.run(train_1)
    assert output_1 == [1, 1]

    def train_2():
        return 2

    output_2 = trainer.run(train_2)
    assert output_2 == [2, 2]
Example #23
def main():
    args = parse_args()
    config = {"args": args}

    if args.start_local or args.address or args.num_workers > 1 or args.use_gpu:
        if args.start_local:
            # Start a local Ray runtime.
            ray.init(num_cpus=args.num_workers)
        else:
            # Connect to a Ray cluster for distributed training.
            ray.init(address=args.address)
        trainer = Trainer("torch", num_workers=args.num_workers, use_gpu=args.use_gpu)
        trainer.start()
        trainer.run(train_func, config)
    else:
        # Run training locally.
        train_func(config)
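
parse_args is not shown; based on the attributes referenced above (start_local, address, num_workers, use_gpu), a hypothetical implementation could look like:

import argparse


def parse_args():
    # Hypothetical flags inferred from the attributes used in main().
    parser = argparse.ArgumentParser()
    parser.add_argument("--address", type=str, default=None,
                        help="Address of an existing Ray cluster to connect to.")
    parser.add_argument("--num-workers", type=int, default=1)
    parser.add_argument("--use-gpu", action="store_true")
    parser.add_argument("--start-local", action="store_true",
                        help="Start a local Ray runtime instead of connecting.")
    return parser.parse_args()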
Example #24
def test_mismatch_checkpoint(ray_start_2_cpus):
    test_config = TestConfig()

    def train_func():
        for i in range(2):
            train.save_checkpoint(epoch=i)

    def train_mismatch():
        train.save_checkpoint(epoch=0)

    new_backend_executor_cls = gen_new_backend_executor(train_mismatch)

    with patch.object(ray.train.trainer, "BackendExecutor", new_backend_executor_cls):
        trainer = Trainer(test_config, num_workers=2)
        trainer.start()
        with pytest.raises(RuntimeError):
            trainer.run(train_func)
Example #25
def test_run_failure(ray_start_2_cpus):
    test_config = TestConfig()

    def train_invalid_signature(a, b):
        pass

    trainer = Trainer(test_config, num_workers=2)

    # Raise RuntimeError when trainer has not been started yet.
    with pytest.raises(RuntimeError):
        trainer.run(lambda: 1)

    trainer.start()

    with pytest.raises(ValueError):
        trainer.run(train_invalid_signature)

    trainer.shutdown()
Example #26
def test_TBX(ray_start_4_cpus, tmp_path):
    config = TestConfig()

    temp_dir = tmp_path
    num_workers = 4

    def train_func():
        train.report(episode_reward_mean=4)
        train.report(episode_reward_mean=5)
        train.report(episode_reward_mean=6, score=[1, 2, 3], hello={"world": 1})
        return 1

    callback = TBXLoggerCallback(temp_dir)
    trainer = Trainer(config, num_workers=num_workers)
    trainer.start()
    trainer.run(train_func, callbacks=[callback])

    _validate_tbx_result(temp_dir)
Example #27
def test_torch_prepare_model(ray_start_4_cpus_2_gpus):
    """Tests if ``prepare_model`` correctly wraps in DDP."""
    def train_fn():
        model = torch.nn.Linear(1, 1)

        # Wrap in DDP.
        model = train.torch.prepare_model(model)

        # Make sure model is wrapped in DDP.
        assert isinstance(model, DistributedDataParallel)

        # Make sure model is on cuda.
        assert next(model.parameters()).is_cuda

    trainer = Trainer("torch", num_workers=2, use_gpu=True)
    trainer.start()
    trainer.run(train_fn)
    trainer.shutdown()
Example #28
def test_print(ray_start_4_cpus):
    num_workers = 4

    def train_func():
        train.report(rank=train.world_rank())

    stream = io.StringIO()
    with redirect_stdout(stream):
        trainer = Trainer(TestConfig(), num_workers=num_workers)
        trainer.start()
        trainer.run(train_func, callbacks=[PrintCallback()])
        trainer.shutdown()

    output = stream.getvalue()
    results = json.loads(output)

    assert len(results) == num_workers
    for i, result in enumerate(results):
        assert set(result.keys()) == (BASIC_AUTOFILLED_KEYS | {"rank"})
        assert result["rank"] == i
Example #29
def test_world_rank(ray_start_2_cpus):
    config = TestConfig()

    def train_func():
        return train.world_rank()

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    results = trainer.run(train_func)

    assert set(results) == {0, 1}
Example #30
def test_env_var(ray_start_2_cpus):
    """Tests if Train env vars are propagated to the BackendExecutor."""
    config = TestConfig()

    os.environ[TRAIN_ENABLE_WORKER_SPREAD_ENV] = "1"

    class EnvBackendExecutor(BackendExecutor):
        def __init__(self, *args, **kwargs):
            assert TRAIN_ENABLE_WORKER_SPREAD_ENV in os.environ and \
                   os.environ[TRAIN_ENABLE_WORKER_SPREAD_ENV] == "1"
            super().__init__(*args, **kwargs)

    with patch.object(ray.train.trainer, "BackendExecutor",
                      EnvBackendExecutor):
        trainer = Trainer(config, num_workers=1)
        trainer.start()
        trainer.run(lambda: 1)
        trainer.shutdown()

    del os.environ[TRAIN_ENABLE_WORKER_SPREAD_ENV]