Example #1
def test_checkpoint(ray_start_2_cpus):
    config = TestConfig()

    def train_func():
        assert train.load_checkpoint() is None
        for i in range(3):
            train.save_checkpoint(epoch=i)
        return 1

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    trainer.run(train_func)
    checkpoint = trainer.latest_checkpoint

    assert checkpoint is not None
    assert checkpoint["epoch"] == 2

    def train_func_checkpoint():
        checkpoint = train.load_checkpoint()
        assert checkpoint is not None
        assert checkpoint["epoch"] == 2

        for i in range(checkpoint["epoch"], 5):
            train.save_checkpoint(epoch=i)
        return 1

    trainer.run(train_func_checkpoint, checkpoint=checkpoint)
    checkpoint = trainer.latest_checkpoint

    assert checkpoint is not None
    assert checkpoint["epoch"] == 4
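
These tests rely on a TestConfig backend configuration that is not shown in the snippets. A minimal sketch of such a no-op backend, assuming the Backend/BackendConfig interfaces from ray.train.backend in Ray 1.x (the real TestConfig may differ):

from ray.train.backend import Backend, BackendConfig


class TestBackend(Backend):
    def on_start(self, worker_group, backend_config):
        pass  # No framework-specific setup needed for these tests.

    def on_shutdown(self, worker_group, backend_config):
        pass  # Nothing to tear down.


class TestConfig(BackendConfig):
    @property
    def backend_cls(self):
        return TestBackend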
Example #2
def test_tf_non_distributed(ray_start_2_cpus):
    """Make sure Ray Train works without TF MultiWorkerMirroredStrategy."""

    trainer = Trainer(backend="tensorflow", num_workers=1)
    trainer.start()
    trainer.run(tf_quick_start_train_func)
    trainer.shutdown()
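
tf_quick_start_train_func is imported from the quick start example and is not reproduced here. A minimal stand-in, assuming nothing beyond plain Keras (no MultiWorkerMirroredStrategy is ever constructed):

import numpy as np
import tensorflow as tf


def tf_quick_start_train_func():
    # Ordinary single-process Keras training; no distribution strategy.
    x = np.random.rand(32, 1).astype(np.float32)
    y = 2 * x
    model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(1,))])
    model.compile(optimizer="sgd", loss="mse")
    model.fit(x, y, epochs=1, verbose=0)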
Example #3
def test_mismatch_checkpoint_report(ray_start_2_cpus):
    test_config = TestConfig()

    def train_func():
        for i in range(2):
            train.save_checkpoint(epoch=i)
            train.report(index=i)

    def train_mismatch():
        train.save_checkpoint(epoch=0)
        train.report(index=0)
        # skip checkpoint
        train.report(index=1)

    new_backend_executor_cls = gen_new_backend_executor(train_mismatch)
    callback = TestCallback()

    with patch.object(ray.train.trainer, "BackendExecutor",
                      new_backend_executor_cls):
        trainer = Trainer(test_config, num_workers=2)
        trainer.start()
        with pytest.raises(RuntimeError):
            trainer.run(train_func, callbacks=[callback])
    # validate checkpoint
    assert trainer.latest_checkpoint["epoch"] == 0
    # validate callback
    result_list = callback.result_list
    assert len(result_list) == 1  # 1 epoch succeeded
    intermediate_results = result_list[0]
    assert len(intermediate_results) == 2  # both workers reported
    for worker_result in intermediate_results:
        assert worker_result["index"] == 0
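
The TestCallback used here and in Example #4 simply records every batch of worker results. A plausible sketch, assuming the TrainingCallback.handle_result(results, **info) hook shown in Example #6 (the real helper may differ):

from ray.train.callbacks import TrainingCallback


class TestCallback(TrainingCallback):
    def __init__(self):
        self.result_list = []

    def handle_result(self, results, **info):
        # ``results`` holds one dict per worker for a single
        # train.report() call; keep them all for later assertions.
        self.result_list.append(results)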
Example #4
def test_fast_slow(ray_start_2_cpus):
    test_config = TestConfig()

    def train_func():
        for i in range(2):
            train.save_checkpoint(epoch=i)
            train.report(index=i)

    def train_slow():
        for i in range(2):
            train.save_checkpoint(epoch=i)
            time.sleep(5)
            train.report(index=i)
            time.sleep(5)

    new_backend_executor_cls = gen_new_backend_executor(train_slow)
    callback = TestCallback()

    with patch.object(ray.train.trainer, "BackendExecutor",
                      new_backend_executor_cls):
        trainer = Trainer(test_config, num_workers=2)
        trainer.start()
        trainer.run(train_func, callbacks=[callback])

    assert trainer.latest_checkpoint["epoch"] == 1

    result_list = callback.result_list
    assert len(result_list) == 2
    for index in range(len(result_list)):
        intermediate_results = result_list[index]
        assert len(intermediate_results) == 2
        for worker_result in intermediate_results:
            assert worker_result["index"] == index
Example #5
def test_persisted_checkpoint(ray_start_2_cpus, logdir):
    config = TestConfig()

    def train_func():
        for i in range(2):
            train.save_checkpoint(epoch=i)
            time.sleep(1)

    trainer = Trainer(config, num_workers=2, logdir=logdir)
    trainer.start()
    trainer.run(train_func)

    assert trainer.best_checkpoint_path is not None
    if logdir is not None:
        assert trainer.logdir == Path(logdir).expanduser().resolve()
    assert trainer.latest_checkpoint_dir.is_dir()
    assert trainer.best_checkpoint_path.is_file()
    assert trainer.best_checkpoint_path.name == f"checkpoint_{2:06d}"
    assert trainer.best_checkpoint_path.parent.name == "checkpoints"
    latest_checkpoint = trainer.latest_checkpoint

    def validate():
        checkpoint = train.load_checkpoint()
        assert checkpoint is not None
        assert checkpoint == latest_checkpoint

    trainer.run(validate, checkpoint=trainer.best_checkpoint_path)
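
The logdir argument is supplied by pytest parametrization that is not shown above; hypothetical values compatible with the assertions would be:

# Hypothetical parametrization; the real values are not shown in the snippet.
@pytest.mark.parametrize("logdir", [None, "/tmp/ray_train_test_logdir"])
def test_persisted_checkpoint(ray_start_2_cpus, logdir):
    ...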
Example #6
def test_torch_auto_unwrap(ray_start_2_cpus):
    """Tests if underlying model from DDP is extracted when saving ckpt."""
    def train_fn():
        model = torch.nn.Linear(1, 1)

        # Wrap in DDP.
        model = train.torch.prepare_model(model)

        # Save DDP wrapped model.
        train.save_checkpoint(model=model)

        # Report DDP wrapped model.
        train.report(model=model)

    num_workers = 2
    trainer = Trainer("torch", num_workers)
    trainer.start()

    class ValidateEncodedCallback(TrainingCallback):
        def handle_result(self, results, **info):
            for result in results:
                model = result["model"]
                assert isinstance(model, torch.nn.Module) and not \
                    isinstance(model,
                               torch.nn.parallel.DistributedDataParallel)

    trainer.run(train_fn, callbacks=[ValidateEncodedCallback()])

    last_checkpoint = trainer.latest_checkpoint
    model = last_checkpoint["model"]
    assert isinstance(model, torch.nn.Module) and not \
        isinstance(model, torch.nn.parallel.DistributedDataParallel)

    trainer.shutdown()
Example #7
def test_torch_non_distributed(ray_start_2_cpus):
    """Make sure Ray Train works without torch DDP."""

    trainer = Trainer(backend="torch", num_workers=1)
    trainer.start()
    trainer.run(torch_quick_start_train_func)
    trainer.shutdown()
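
As with the TensorFlow variant, torch_quick_start_train_func comes from the quick start example and is not shown. A minimal stand-in that never touches DistributedDataParallel:

import torch


def torch_quick_start_train_func():
    # Plain single-process training loop; the model is never wrapped in DDP.
    model = torch.nn.Linear(1, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
    loss_fn = torch.nn.MSELoss()
    x = torch.randn(8, 1)
    y = 2 * x
    for _ in range(3):
        optimizer.zero_grad()
        loss = loss_fn(model(x), y)
        loss.backward()
        optimizer.step()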
Example #8
def test_dataset_fault_tolerance(ray_start_4_cpus):
    dataset = ray.data.range(10)
    test_config = TestConfig()

    def train_func():
        return train.get_dataset_shard()

    def train_actor_failure():
        import sys
        sys.exit(0)

    new_backend_executor_cls = gen_new_backend_executor(train_actor_failure)

    class SingleGetDatasetShardsBackendExecutor(new_backend_executor_cls):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self._has_called_get_dataset_shards = False

        def _get_dataset_shards(self, dataset_or_dict):
            if self._has_called_get_dataset_shards:
                raise Exception
            self._has_called_get_dataset_shards = True
            return super()._get_dataset_shards(dataset_or_dict)

    with patch.object(ray.train.trainer, "BackendExecutor",
                      SingleGetDatasetShardsBackendExecutor):
        trainer = Trainer(test_config, num_workers=2)
        trainer.start()
        trainer.run(train_func, dataset=dataset)
Example #9
def test_gpu_requests(ray_start_4_cpus_4_gpus_4_extra):
    class CudaTestBackend(TestBackend):
        share_cuda_visible_devices = True

    class CudaTestConfig(TestConfig):
        @property
        def backend_cls(self):
            return CudaTestBackend

    # GPUs should not be requested if `use_gpu` is False.
    with pytest.raises(ValueError):
        Trainer(CudaTestConfig(),
                num_workers=2,
                use_gpu=False,
                resources_per_worker={"GPU": 1})

    # GPUs should not be set to 0 if `use_gpu` is True.
    with pytest.raises(ValueError):
        Trainer(CudaTestConfig(),
                num_workers=2,
                use_gpu=True,
                resources_per_worker={"GPU": 0})

    def get_resources():
        return os.environ["CUDA_VISIBLE_DEVICES"]

    # 0 GPUs will be requested and should not raise an error.
    trainer = Trainer(CudaTestConfig(), num_workers=2, use_gpu=False)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["", ""]
    trainer.shutdown()

    # 1 GPU will be requested and should not raise an error.
    trainer = Trainer(CudaTestConfig(), num_workers=2, use_gpu=True)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1", "0,1"]
    trainer.shutdown()

    # Partial GPUs should not raise an error.
    trainer = Trainer(CudaTestConfig(),
                      num_workers=2,
                      use_gpu=True,
                      resources_per_worker={"GPU": 0.1})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0", "0"]
    trainer.shutdown()

    # Multiple GPUs should not raise an error.
    trainer = Trainer(CudaTestConfig(),
                      num_workers=2,
                      use_gpu=True,
                      resources_per_worker={"GPU": 2})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1,2,3", "0,1,2,3"]
    trainer.shutdown()
Example #10
def test_json(monkeypatch, ray_start_4_cpus, make_temp_dir, workers_to_log,
              detailed, filename):
    if detailed:
        monkeypatch.setenv(ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, "1")

    config = TestConfig()

    num_iters = 5
    num_workers = 4

    if workers_to_log is None:
        num_workers_to_log = num_workers
    elif isinstance(workers_to_log, int):
        num_workers_to_log = 1
    else:
        num_workers_to_log = len(workers_to_log)

    def train_func():
        for i in range(num_iters):
            train.report(index=i)
        return 1

    if filename is None:
        # if None, use default value
        callback = JsonLoggerCallback(workers_to_log=workers_to_log)
    else:
        callback = JsonLoggerCallback(filename=filename,
                                      workers_to_log=workers_to_log)
    trainer = Trainer(config, num_workers=num_workers, logdir=make_temp_dir)
    trainer.start()
    trainer.run(train_func, callbacks=[callback])
    if filename is None:
        assert str(
            callback.log_path.name) == JsonLoggerCallback._default_filename
    else:
        assert str(callback.log_path.name) == filename

    with open(callback.log_path, "r") as f:
        log = json.load(f)
    print(log)
    assert len(log) == num_iters
    assert len(log[0]) == num_workers_to_log
    assert all(len(element) == len(log[0]) for element in log)
    assert all(
        all(worker["index"] == worker[TRAINING_ITERATION] - 1
            for worker in element) for element in log)
    assert all(
        all(
            all(key in worker for key in BASIC_AUTOFILLED_KEYS)
            for worker in element) for element in log)
    if detailed:
        assert all(
            all(
                all(key in worker for key in DETAILED_AUTOFILLED_KEYS)
                for worker in element) for element in log)
    else:
        assert all(
            all(not any(key in worker for key in DETAILED_AUTOFILLED_KEYS)
                for worker in element) for element in log)
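
workers_to_log, detailed, and filename are pytest parameters whose values are not shown; a hypothetical parametrization consistent with the branches handled above:

# Hypothetical values, chosen so that workers_to_log exercises the
# int, None, and list branches in the test body.
@pytest.mark.parametrize("workers_to_log", [0, None, [0, 1]])
@pytest.mark.parametrize("detailed", [False, True])
@pytest.mark.parametrize("filename", [None, "my_own_filename.json"])
def test_json(monkeypatch, ray_start_4_cpus, make_temp_dir, workers_to_log,
              detailed, filename):
    ...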
Example #11
def latency(amp: bool) -> float:
    trainer = Trainer("torch", num_workers=2, use_gpu=True)
    trainer.start()
    start_time = timer()
    trainer.run(train_func, {"amp": amp})
    end_time = timer()
    trainer.shutdown()
    return end_time - start_time
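
This helper is a fragment of a larger AMP benchmark; train_func and timer (presumably timeit.default_timer) are defined elsewhere. A possible driver comparing the two settings:

# Hypothetical usage of the helper above.
latency_fp32 = latency(amp=False)
latency_amp = latency(amp=True)
print(f"Latency without AMP: {latency_fp32:.2f} s")
print(f"Latency with AMP:    {latency_amp:.2f} s")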
Example #12
def test_gpu_requests(ray_start_4_cpus_4_gpus_4_extra):
    # GPUs should not be requested if `use_gpu` is False.
    with pytest.raises(ValueError):
        Trainer(
            TestConfig(),
            num_workers=2,
            use_gpu=False,
            resources_per_worker={"GPU": 1})

    # GPUs should not be set to 0 if `use_gpu` is True.
    with pytest.raises(ValueError):
        Trainer(
            TestConfig(),
            num_workers=2,
            use_gpu=True,
            resources_per_worker={"GPU": 0})

    def get_resources():
        return os.environ["CUDA_VISIBLE_DEVICES"]

    os.environ[ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV] = "1"

    # 0 GPUs will be requested and should not raise an error.
    trainer = Trainer(TestConfig(), num_workers=2, use_gpu=False)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["", ""]
    trainer.shutdown()

    # 1 GPU will be requested and should not raise an error.
    trainer = Trainer(TestConfig(), num_workers=2, use_gpu=True)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1", "0,1"]
    trainer.shutdown()

    # Partial GPUs should not raise an error.
    trainer = Trainer(
        TestConfig(),
        num_workers=2,
        use_gpu=True,
        resources_per_worker={"GPU": 0.1})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0", "0"]
    trainer.shutdown()

    # Multiple GPUs should not raise an error.
    trainer = Trainer(
        TestConfig(),
        num_workers=2,
        use_gpu=True,
        resources_per_worker={"GPU": 2})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1,2,3", "0,1,2,3"]
    trainer.shutdown()
Example #13
def test_checkpoint_torch_model_with_amp(ray_start_4_cpus_2_gpus):
    """Test that model with AMP is serializable."""
    def train_func():
        train.torch.accelerate(amp=True)

        model = torchvision.models.resnet101()
        model = train.torch.prepare_model(model)

        train.save_checkpoint(model=model)

    trainer = Trainer("torch", num_workers=1, use_gpu=True)
    trainer.start()
    trainer.run(train_func)
    trainer.shutdown()
Example #14
def test_torch_amp(ray_start_2_cpus):
    def train_fn():
        train.torch.accelerate(amp=True)
        model = torch.nn.Linear(1, 1)
        model = train.torch.prepare_model(model)

        # Make sure model is serializable even with amp enabled.
        return model.module

    num_workers = 2
    trainer = Trainer("torch", num_workers)
    trainer.start()

    trainer.run(train_fn)
    trainer.shutdown()
Example #15
def test_mlflow(ray_start_4_cpus, tmp_path):
    config = TestConfig()

    params = {"p1": "p1"}

    temp_dir = tmp_path
    num_workers = 4

    def train_func(config):
        train.report(episode_reward_mean=4)
        train.report(episode_reward_mean=5)
        train.report(episode_reward_mean=6)
        return 1

    callback = MLflowLoggerCallback(experiment_name="test_exp",
                                    logdir=temp_dir)
    trainer = Trainer(config, num_workers=num_workers)
    trainer.start()
    trainer.run(train_func, config=params, callbacks=[callback])

    from mlflow.tracking import MlflowClient

    client = MlflowClient(
        tracking_uri=callback.mlflow_util._mlflow.get_tracking_uri())

    experiment_id = client.get_experiment_by_name("test_exp").experiment_id
    all_runs = callback.mlflow_util._mlflow.search_runs(
        experiment_ids=[experiment_id])
    assert len(all_runs) == 1
    # all_runs is a pandas dataframe.
    all_runs = all_runs.to_dict(orient="records")
    run_id = all_runs[0]["run_id"]
    run = client.get_run(run_id)

    assert run.data.params == params
    assert ("episode_reward_mean" in run.data.metrics
            and run.data.metrics["episode_reward_mean"] == 6.0)
    assert (TRAINING_ITERATION in run.data.metrics
            and run.data.metrics[TRAINING_ITERATION] == 3.0)

    metric_history = client.get_metric_history(run_id=run_id,
                                               key="episode_reward_mean")

    assert len(metric_history) == 3
    iterations = [metric.step for metric in metric_history]
    assert iterations == [1, 2, 3]
    rewards = [metric.value for metric in metric_history]
    assert rewards == [4, 5, 6]
Example #16
def test_dataset_pipeline(ray_start_4_cpus):
    """Checks that Pipeline is correctly sharded even with multiple epochs."""
    num_epochs = 2
    num_data = 10

    dataset = ray.data.range(num_data).repeat()

    def get_dataset():
        pipeline_iterator = train.get_dataset_shard().iter_epochs()
        data_all_epochs = []
        for _ in range(num_epochs):
            dataset_this_epoch = next(pipeline_iterator)
            data_this_epoch = []
            for batch in dataset_this_epoch.iter_batches(
                    batch_format="native"):
                data_this_epoch.extend(batch)
            data_all_epochs.append(data_this_epoch)
        return data_all_epochs

    config = TestConfig()

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    results = trainer.run(get_dataset, dataset=dataset)
    check_dataset_output(num_data, num_epochs, results)
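
check_dataset_output is shared with Example #18 but not shown. A sketch of what it plausibly verifies, assuming each worker returns one list of records per epoch and the worker shards together cover the full range:

def check_dataset_output(num_data, num_epochs, worker_outputs):
    # Hypothetical checker: every worker produced one list per epoch...
    for worker_data in worker_outputs:
        assert len(worker_data) == num_epochs
    # ...and, per epoch, the shards together cover each record exactly once.
    for epoch in range(num_epochs):
        epoch_data = []
        for worker_data in worker_outputs:
            epoch_data.extend(worker_data[epoch])
        assert sorted(epoch_data) == list(range(num_data))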
Example #17
def test_torch_get_device_dist(ray_2_node_4_gpu, num_gpus_per_worker):
    @patch("torch.cuda.is_available", lambda: True)
    def train_fn():
        return train.torch.get_device().index

    trainer = Trainer(
        TorchConfig(backend="gloo"),
        num_workers=int(8 / num_gpus_per_worker),
        use_gpu=True,
        resources_per_worker={"GPU": num_gpus_per_worker},
    )
    trainer.start()
    devices = trainer.run(train_fn)
    trainer.shutdown()

    count = Counter(devices)
    if num_gpus_per_worker == 0.5:
        for i in range(4):
            assert count[i] == 4
    elif num_gpus_per_worker == 1:
        for i in range(4):
            assert count[i] == 2
    elif num_gpus_per_worker == 2:
        for i in range(2):
            assert count[2 * i] == 2
    else:
        raise RuntimeError(
            "New parameter for this test has been added without checking that the "
            "correct devices have been returned.")
Example #18
def test_dataset_pipeline_shuffle(ray_start_4_cpus):
    num_epochs = 2
    num_data = 20

    dataset = ray.data.range(num_data).repeat().random_shuffle_each_window()

    def get_dataset():
        pipeline_iterator = train.get_dataset_shard().iter_epochs()
        data_all_epochs = []
        for _ in range(num_epochs):
            dataset_this_epoch = next(pipeline_iterator)
            data_this_epoch = []
            for batch in dataset_this_epoch.iter_batches(
                    batch_format="native"):
                data_this_epoch.extend(batch)

            if len(data_all_epochs) > 0:
                # Make sure data is shuffled per epoch.
                assert data_this_epoch != data_all_epochs[-1]

            data_all_epochs.append(data_this_epoch)
        return data_all_epochs

    config = TestConfig()

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    results = trainer.run(get_dataset, dataset=dataset)
    check_dataset_output(num_data, num_epochs, results)
Example #19
def test_run_after_user_error(ray_start_2_cpus):
    config = TestConfig()

    def fail_train():
        raise NotImplementedError

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    with pytest.raises(NotImplementedError):
        trainer.run(fail_train)

    def train_func():
        return 1

    output = trainer.run(train_func)
    assert output == [1, 1]
Example #20
def train_tensorflow_mnist(num_workers=2, use_gpu=False, epochs=4):
    trainer = Trainer(backend="tensorflow", num_workers=num_workers, use_gpu=use_gpu)
    trainer.start()
    results = trainer.run(
        train_func=train_func, config={"lr": 1e-3, "batch_size": 64, "epochs": epochs}
    )
    trainer.shutdown()
    print(f"Results: {results[0]}")
Example #21
def test_load_checkpoint_from_path(ray_start_2_cpus, tmpdir):
    config = TestConfig()

    checkpoint_strategy = CheckpointStrategy(checkpoint_score_attribute="loss",
                                             checkpoint_score_order="min")

    def train_func_checkpoint():
        train.save_checkpoint(loss=3)
        train.save_checkpoint(loss=7)

    trainer = Trainer(config, num_workers=2, logdir=tmpdir)
    trainer.start()
    trainer.run(train_func_checkpoint, checkpoint_strategy=checkpoint_strategy)

    assert trainer.best_checkpoint["loss"] == 3
    assert (Trainer.load_checkpoint_from_path(
        trainer.best_checkpoint_path) == trainer.best_checkpoint)
Example #22
def test_multiple_run(ray_start_2_cpus):
    config = TestConfig()

    def train_1():
        return 1

    trainer = Trainer(config, num_workers=2)
    trainer.start()

    output_1 = trainer.run(train_1)
    assert output_1 == [1, 1]

    def train_2():
        return 2

    output_2 = trainer.run(train_2)
    assert output_2 == [2, 2]
Example #23
def main():
    args = parse_args()
    config = {"args": args}

    if args.start_local or args.address or args.num_workers > 1 or args.use_gpu:
        if args.start_local:
            # Start a local Ray runtime.
            ray.init(num_cpus=args.num_workers)
        else:
            # Connect to a Ray cluster for distributed training.
            ray.init(address=args.address)
        trainer = Trainer("torch", num_workers=args.num_workers, use_gpu=args.use_gpu)
        trainer.start()
        trainer.run(train_func, config)
    else:
        # Run training locally.
        train_func(config)
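
parse_args is not shown; based on the attributes referenced above (start_local, address, num_workers, use_gpu), a hypothetical implementation could look like:

import argparse


def parse_args():
    # Hypothetical flags inferred from the attributes used in main().
    parser = argparse.ArgumentParser()
    parser.add_argument("--address", type=str, default=None,
                        help="Address of an existing Ray cluster to connect to.")
    parser.add_argument("--num-workers", type=int, default=1)
    parser.add_argument("--use-gpu", action="store_true")
    parser.add_argument("--start-local", action="store_true",
                        help="Start a local Ray runtime instead of connecting.")
    return parser.parse_args()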
Example #24
def test_mismatch_checkpoint(ray_start_2_cpus):
    test_config = TestConfig()

    def train_func():
        for i in range(2):
            train.save_checkpoint(epoch=i)

    def train_mismatch():
        train.save_checkpoint(epoch=0)

    new_backend_executor_cls = gen_new_backend_executor(train_mismatch)

    with patch.object(ray.train.trainer, "BackendExecutor", new_backend_executor_cls):
        trainer = Trainer(test_config, num_workers=2)
        trainer.start()
        with pytest.raises(RuntimeError):
            trainer.run(train_func)
Example #25
def test_run_failure(ray_start_2_cpus):
    test_config = TestConfig()

    def train_invalid_signature(a, b):
        pass

    trainer = Trainer(test_config, num_workers=2)

    # Raise RuntimeError when trainer has not been started yet.
    with pytest.raises(RuntimeError):
        trainer.run(lambda: 1)

    trainer.start()

    with pytest.raises(ValueError):
        trainer.run(train_invalid_signature)

    trainer.shutdown()
Example #26
def test_TBX(ray_start_4_cpus, tmp_path):
    config = TestConfig()

    temp_dir = tmp_path
    num_workers = 4

    def train_func():
        train.report(episode_reward_mean=4)
        train.report(episode_reward_mean=5)
        train.report(episode_reward_mean=6, score=[1, 2, 3], hello={"world": 1})
        return 1

    callback = TBXLoggerCallback(temp_dir)
    trainer = Trainer(config, num_workers=num_workers)
    trainer.start()
    trainer.run(train_func, callbacks=[callback])

    _validate_tbx_result(temp_dir)
Example #27
def test_torch_prepare_model(ray_start_4_cpus_2_gpus):
    """Tests if ``prepare_model`` correctly wraps in DDP."""
    def train_fn():
        model = torch.nn.Linear(1, 1)

        # Wrap in DDP.
        model = train.torch.prepare_model(model)

        # Make sure model is wrapped in DDP.
        assert isinstance(model, DistributedDataParallel)

        # Make sure model is on cuda.
        assert next(model.parameters()).is_cuda

    trainer = Trainer("torch", num_workers=2, use_gpu=True)
    trainer.start()
    trainer.run(train_fn)
    trainer.shutdown()
Example #28
def test_print(ray_start_4_cpus):
    num_workers = 4

    def train_func():
        train.report(rank=train.world_rank())

    stream = io.StringIO()
    with redirect_stdout(stream):
        trainer = Trainer(TestConfig(), num_workers=num_workers)
        trainer.start()
        trainer.run(train_func, callbacks=[PrintCallback()])
        trainer.shutdown()

    output = stream.getvalue()
    results = json.loads(output)

    assert len(results) == num_workers
    for i, result in enumerate(results):
        assert set(result.keys()) == (BASIC_AUTOFILLED_KEYS | {"rank"})
        assert result["rank"] == i
Example #29
def test_world_rank(ray_start_2_cpus):
    config = TestConfig()

    def train_func():
        return train.world_rank()

    trainer = Trainer(config, num_workers=2)
    trainer.start()
    results = trainer.run(train_func)

    assert set(results) == {0, 1}
Example #30
def test_env_var(ray_start_2_cpus):
    """Tests if Train env vars are propagated to the BackendExecutor."""
    config = TestConfig()

    os.environ[TRAIN_ENABLE_WORKER_SPREAD_ENV] = "1"

    class EnvBackendExecutor(BackendExecutor):
        def __init__(self, *args, **kwargs):
            assert TRAIN_ENABLE_WORKER_SPREAD_ENV in os.environ and \
                   os.environ[TRAIN_ENABLE_WORKER_SPREAD_ENV] == "1"
            super().__init__(*args, **kwargs)

    with patch.object(ray.train.trainer, "BackendExecutor",
                      EnvBackendExecutor):
        trainer = Trainer(config, num_workers=1)
        trainer.start()
        trainer.run(lambda: 1)
        trainer.shutdown()

    del os.environ[TRAIN_ENABLE_WORKER_SPREAD_ENV]