import numpy as np
from tensorflow import keras
from tensorflow.keras.datasets import mnist
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.tune.integration.keras import TuneReportCallback

# Fixes applied to the original script:
# - `keras` was used (Sequential, layers, optimizers) but never imported.
# - `trainer_config` and `max_epochs` were referenced but never defined.
# - tune.choice(...) search-space objects were passed directly into
#   Adam(lr=...) and batch_size; they must live in the Tune `config` dict
#   and be resolved to concrete values per trial.
# - ray.train.Trainer does not take model/train_data/scheduler kwargs; the
#   Keras + Tune hyperparameter-search workflow is a trainable function
#   passed to tune.run.

MAX_EPOCHS = 10  # upper bound per trial; also the ASHA max_t budget


def train_mnist(config):
    """Per-trial trainable: build, compile and fit the MNIST classifier.

    Args:
        config: dict with per-trial hyperparameters — "lr" (float learning
            rate) and "batch_size" (int) — sampled by Tune from the search
            space below.
    """
    (x_train, y_train), (x_test, y_test) = mnist.load_data()

    model = keras.Sequential([
        keras.layers.Flatten(input_shape=(28, 28)),
        keras.layers.Dense(128, activation="relu"),
        keras.layers.Dense(10, activation="softmax"),
    ])
    model.compile(
        # `learning_rate` is the current kwarg name; `lr` is deprecated.
        optimizer=keras.optimizers.Adam(learning_rate=config["lr"]),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    model.fit(
        # expand_dims adds the trailing channel axis: (N, 28, 28) -> (N, 28, 28, 1)
        np.expand_dims(x_train, axis=-1),
        y_train,
        validation_data=(np.expand_dims(x_test, axis=-1), y_test),
        batch_size=config["batch_size"],
        epochs=MAX_EPOCHS,
        verbose=0,
        # Reports val_loss to Tune as "loss" after each validation pass,
        # which is what ASHA uses to stop underperforming trials early.
        callbacks=[TuneReportCallback({"loss": "val_loss"}, on="validation_end")],
    )


# Launch the hyperparameter search: one trial per sampled (lr, batch_size).
analysis = tune.run(
    train_mnist,
    config={
        "lr": tune.choice([1e-2, 1e-3, 1e-4]),
        "batch_size": tune.choice([32, 64, 128]),
    },
    metric="loss",
    mode="min",
    scheduler=ASHAScheduler(max_t=MAX_EPOCHS, grace_period=2, reduction_factor=2),
    # Replaces the original num_workers=2 / use_gpu=True intent.
    resources_per_trial={"cpu": 2, "gpu": 1},
)
import ray
import ray.rllib.agents.ppo as ppo
from ray.tune.registry import register_env

# Fixes applied to the original script:
# - Removed unused `from ray.train import Trainer` (PPOTrainer comes from
#   ray.rllib, not ray.train).
# - `create_env` reads env_config["env_id"], but the PPO config never
#   supplied an "env_config" dict -> KeyError at env creation. Fixed below.
# - Added ray.init() before constructing the trainer.
# - Moved the prose that was fused onto the end of the code (a syntax
#   error) into the trailing comment.


def create_env(env_config):
    """Factory for the Gym environment.

    Args:
        env_config: dict passed by RLlib from config["env_config"]; must
            contain "env_id", the registered Gym environment id.
    """
    import gym
    return gym.make(env_config["env_id"])


# Register the factory under a name RLlib can resolve via config["env"].
register_env("CustomEnv-v0", create_env)

config = {
    "env": "CustomEnv-v0",
    # create_env requires "env_id"; the original config omitted env_config
    # entirely. CartPole-v1 is a placeholder — substitute the real env id.
    "env_config": {"env_id": "CartPole-v1"},
    "lr": 1e-4,
    "num_workers": 2,
}

ray.init()
trainer = ppo.PPOTrainer(config=config)

# Train the agent for 1000 iterations; each train() call returns a result
# dict with episode/reward statistics.
for i in range(1000):
    result = trainer.train()

# NOTE: This example trains a reinforcement-learning agent with the PPO
# algorithm on an OpenAI Gym environment. It uses the ray.rllib package to
# define and configure the PPO agent, and ray.tune.registry to register the
# Gym environment with Ray. Package Library: Ray