Example #1
import json
import os
import time

import ray
from ray.train import Trainer
from ray.train.examples.horovod.horovod_example import (
    train_func as horovod_torch_train_func,
)

if __name__ == "__main__":
    ray.init(address=os.environ.get("RAY_ADDRESS", "auto"))
    start_time = time.time()

    num_workers = 8
    num_epochs = 10
    trainer = Trainer("horovod", num_workers)
    trainer.start()
    results = trainer.run(horovod_torch_train_func,
                          config={
                              "num_epochs": num_epochs,
                              "lr": 1e-3
                          })
    trainer.shutdown()

    assert len(results) == num_workers
    for worker_result in results:
        assert len(worker_result) == num_epochs
        assert worker_result[num_epochs - 1] < worker_result[0]

    delta = time.time() - start_time
    with open(os.environ["TEST_OUTPUT_JSON"], "w") as f:
Example #2
def test_json(ray_start_4_cpus, make_temp_dir, workers_to_log, detailed,
              filename):
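    """Checks JsonLoggerCallback output: one entry per reported iteration, one
    record per logged worker, and the expected auto-filled keys per record."""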
    if detailed:
        os.environ[ENABLE_DETAILED_AUTOFILLED_METRICS_ENV] = "1"

    config = TestConfig()

    num_iters = 5
    num_workers = 4

    if workers_to_log is None:
        num_workers_to_log = num_workers
    elif isinstance(workers_to_log, int):
        num_workers_to_log = 1
    else:
        num_workers_to_log = len(workers_to_log)

    def train_func():
        for i in range(num_iters):
            train.report(index=i)
        return 1

    if filename is None:
        # if None, use default value
        callback = JsonLoggerCallback(workers_to_log=workers_to_log)
    else:
        callback = JsonLoggerCallback(filename=filename,
                                      workers_to_log=workers_to_log)
    trainer = Trainer(config, num_workers=num_workers, logdir=make_temp_dir)
    trainer.start()
    trainer.run(train_func, callbacks=[callback])
    if filename is None:
        assert str(
            callback.log_path.name) == JsonLoggerCallback._default_filename
    else:
        assert str(callback.log_path.name) == filename

    with open(callback.log_path, "r") as f:
        log = json.load(f)
    print(log)
    assert len(log) == num_iters
    assert len(log[0]) == num_workers_to_log
    assert all(len(element) == len(log[0]) for element in log)
    assert all(
        all(worker["index"] == worker[TRAINING_ITERATION] - 1
            for worker in element) for element in log)
    assert all(
        all(
            all(key in worker for key in BASIC_AUTOFILLED_KEYS)
            for worker in element) for element in log)
    if detailed:
        assert all(
            all(
                all(key in worker for key in DETAILED_AUTOFILLED_KEYS)
                for worker in element) for element in log)
    else:
        assert all(
            all(not any(key in worker for key in DETAILED_AUTOFILLED_KEYS)
                for worker in element) for element in log)

    os.environ.pop(ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, 0)
    assert ENABLE_DETAILED_AUTOFILLED_METRICS_ENV not in os.environ
Example #3
    # TODO: TBXLoggerCallback should create nonexistent logdir
    #       and should also create 1 directory per file.
    tbx_logdir = "./runs"
    os.makedirs(tbx_logdir, exist_ok=True)
    callbacks = [
        TBXLoggerCallback(logdir=tbx_logdir),
        MLflowLoggerCallback(experiment_name="cuj-big-data-training",
                             save_artifact=True),
    ]

    # Remove CPU resource so Datasets can be scheduled.
    resources_per_worker = {"CPU": 0, "GPU": 1} if use_gpu else None

    trainer = Trainer(
        backend="torch",
        num_workers=num_workers,
        use_gpu=use_gpu,
        resources_per_worker=resources_per_worker,
    )
    trainer.start()
    results = trainer.run(train_func=train_func,
                          config=config,
                          callbacks=callbacks,
                          dataset=datasets)
    model = results[0]
    trainer.shutdown()

    if args.mlflow_register_model:
        mlflow.pytorch.log_model(model,
                                 artifact_path="models",
                                 registered_model_name="torch_model")
Example #4
def test_torch_auto_gpu_to_cpu(ray_start_4_cpus_2_gpus):
    """Tests if GPU tensors are auto converted to CPU on driver."""

    # Disable GPU on the driver.
    os.environ["CUDA_VISIBLE_DEVICES"] = ""

    num_workers = 2

    class ValidateCPUCallback(TrainingCallback):
        def handle_result(self, results, **info):
            for result in results:
                model = result["model"]
                assert not next(model.parameters()).is_cuda

    def train_func():
        model = torch.nn.Linear(1, 1)

        # Move to GPU device.
        model = ray.train.torch.prepare_model(model)

        assert next(model.parameters()).is_cuda

        ray.train.save_checkpoint(model=model)
        ray.train.report(model=model)

    trainer = Trainer("torch", num_workers=num_workers, use_gpu=True)
    trainer.start()
    trainer.run(train_func, callbacks=[ValidateCPUCallback()])
    model = trainer.latest_checkpoint["model"]
    assert not next(model.parameters()).is_cuda
    trainer.shutdown()

    # Test the same thing for state dict.

    class ValidateCPUStateDictCallback(TrainingCallback):
        def handle_result(self, results, **info):
            for result in results:
                state_dict = result["state_dict"]
                for tensor in state_dict.values():
                    assert not tensor.is_cuda

    def train_func():
        model = torch.nn.Linear(1, 1)

        # Move to GPU device.
        model = ray.train.torch.prepare_model(model)

        assert next(model.parameters()).is_cuda

        state_dict = model.state_dict()

        for tensor in state_dict.values():
            assert tensor.is_cuda

        ray.train.save_checkpoint(state_dict=state_dict)
        ray.train.report(state_dict=state_dict)

    trainer = Trainer("torch", num_workers=num_workers, use_gpu=True)
    trainer.start()
    trainer.run(train_func, callbacks=[ValidateCPUStateDictCallback()])

    state_dict = trainer.latest_checkpoint["state_dict"]
    for tensor in state_dict.values():
        assert not tensor.is_cuda
    trainer.shutdown()

    # Reset the env var.
    os.environ.pop("CUDA_VISIBLE_DEVICES")
        action="store_true",
        default=False,
        help="Finish quickly for testing.")
    parser.add_argument(
        "--use-gpu",
        action="store_true",
        default=False,
        help="Enables GPU training")

    args, _ = parser.parse_known_args()
    if args.smoke_test:
        ray.init(num_cpus=4)
    else:
        ray.init(address=args.address)

    trainer = Trainer(
        "torch", num_workers=args.num_workers, use_gpu=args.use_gpu)
    Trainable = trainer.to_tune_trainable(train_func)
    pbt_scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        metric="loss",
        mode="min",
        perturbation_interval=1,
        hyperparam_mutations={
            # distribution for resampling
            "lr": lambda: np.random.uniform(0.001, 1),
            # allow perturbations within this set of categorical values
            "momentum": [0.8, 0.9, 0.99],
        })

    reporter = CLIReporter()
    reporter.add_metric_column("loss", "loss")
Example #6
parser = argparse.ArgumentParser()
parser.add_argument(
    "--smoke-test",
    action="store_true",
    default=False,
    help="Finish quickly for training.",
)
args = parser.parse_args()

ray.init(address="auto" if not args.smoke_test else None, log_to_driver=True)
num_training_workers = 1 if args.smoke_test else 3

trainer = Trainer(
    num_workers=num_training_workers,
    use_gpu=not args.smoke_test,
    backend=TorchConfig(backend="gloo"),
)
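# Convert the Trainer into a Tune Trainable so its hyperparameters can be
# searched with Population Based Training below.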
TorchTrainable = trainer.to_tune_trainable(train_func=train_func)

pbt_scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    metric="loss",
    mode="min",
    perturbation_interval=1,
    hyperparam_mutations={
        # distribution for resampling
        "lr": lambda: np.random.uniform(0.001, 1),
        # allow perturbations within this set of categorical values
        "momentum": [0.8, 0.9, 0.99],
    },
)
Example #7
    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=0.1)

    for epoch in range(num_epochs):
        output = model(input)
        loss = loss_fn(output, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(f"epoch: {epoch}, loss: {loss.item()}")

# __torch_distributed_end__


if __name__ == "__main__":
    # __torch_single_run_begin__

    train_func()

    # __torch_single_run_end__

    # __torch_trainer_begin__

    from ray.train import Trainer

    trainer = Trainer(backend="torch", num_workers=4)
    trainer.start()
    results = trainer.run(train_func_distributed)
    trainer.shutdown()

    # __torch_trainer_end__
Example #8
def test_worker_kill(ray_start_2_cpus, backend):
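    """Training should restart and finish after a worker is killed mid-run,
    and the Trainer should remain usable afterwards."""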
    if backend == "test":
        test_config = TestConfig()
    elif backend == "torch":
        test_config = TorchConfig()
    elif backend == "tf":
        test_config = TensorflowConfig()
    elif backend == "horovod":
        test_config = HorovodConfig()

    trainer = Trainer(test_config, num_workers=2)

    def train_func():
        for i in range(2):
            train.report(loss=1, iter=i)

    trainer.start()
    kill_callback = KillCallback(fail_on=0, trainer=trainer)
    trainer.run(train_func, callbacks=[kill_callback])
    # Run 1: iter=0, counter=1, Successful
    # Run 2: iter=1, counter=1, Unsuccessful, starts training from beginning
    # Run 3: iter=0, counter=2, Successful
    # Run 4: iter=1, counter=3, Successful
    assert kill_callback.counter == 3

    trainer.shutdown()
    trainer.start()

    kill_callback = KillCallback(fail_on=1, trainer=trainer)
    trainer.run(train_func, callbacks=[kill_callback])
    # Run 1: iter=0, counter=1, Successful
    # Run 2: iter=1, counter=2, Successful
    # Run 3: None, counter=2, Unsuccessful, starts training from beginning.
    # Run 4: iter=0, counter=3, Successful
    # Run 5: iter=1, counter=4, Successful
    assert kill_callback.counter == 4

    def train_func():
        return 1

    # Make sure Trainer is usable even after failure handling.
    trainer.run(train_func)
Example #9
def test_worker_kill_checkpoint(ray_start_2_cpus):
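    """Training should recover after a worker is killed, restarting from the
    beginning or resuming from the latest checkpoint as appropriate."""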
    test_config = TestConfig()

    def train_func():
        checkpoint = train.load_checkpoint()
        if checkpoint:
            epoch = checkpoint["epoch"]
        else:
            epoch = 0
        print("Epoch: ", epoch)
        for i in range(epoch, 2):
            train.report(loss=1, iter=i)
            train.save_checkpoint(epoch=i + 1)

    trainer = Trainer(test_config, num_workers=2)
    trainer.start()
    kill_callback = KillCallback(fail_on=0, trainer=trainer)

    trainer.run(train_func, callbacks=[kill_callback])

    # Run 1: epoch=0, counter=1, Successful
    # *Checkpoint is saved.*
    # *Worker is killed*
    # *Getting checkpoint fails. Workers are restarted from beginning*
    # Run 2: epoch=0, counter=2, Successful
    # Run 3: epoch=1, counter=3, Successful
    assert kill_callback.counter == 3
    assert trainer.latest_checkpoint["epoch"] == 2

    trainer.shutdown()
    trainer.start()

    kill_callback = KillCallback(fail_on=1, trainer=trainer)
    trainer.run(train_func, callbacks=[kill_callback])
    # Run 1: epoch=0, counter=1, Successful
    # *Checkpoint saved*
    # *Latest checkpoint updated, epoch=1*
    # Run 2: epoch=1, counter=2, Successful
    # *Checkpoint saved*
    # *Worker is killed*
    # *Getting checkpoint fails. Workers are restarted from last checkpoint.*
    # Run 3: epoch=1, counter=3, Successful.
    assert kill_callback.counter == 3
    assert trainer.latest_checkpoint["epoch"] == 2

    def train_func():
        return 1

    # Make sure Trainer is usable even after failure handling.
    trainer.run(train_func)
Example #10
def test_init_failure(ray_start_2_cpus):
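    """Invalid `backend` arguments are rejected when constructing the Trainer."""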
    with pytest.raises(TypeError):
        Trainer(5, num_workers=2)

    with pytest.raises(ValueError):
        Trainer("invalid", num_workers=2)
Example #11
def test_start_failure(ray_start_2_cpus):
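    """A Trainer with zero workers is invalid and raises a ValueError."""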
    with pytest.raises(ValueError):
        trainer = Trainer("torch", num_workers=0)
        trainer.start()
Example #12
def test_gpu_requests(ray_start_4_cpus_4_gpus_4_extra):
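    """Checks the CUDA_VISIBLE_DEVICES string each worker sees for various GPU
    requests when the backend sets `share_cuda_visible_devices`."""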
    class CudaTestBackend(TestBackend):
        share_cuda_visible_devices = True

    class CudaTestConfig(TestConfig):
        @property
        def backend_cls(self):
            return CudaTestBackend

    # GPUs should not be requested if `use_gpu` is False.
    with pytest.raises(ValueError):
        Trainer(CudaTestConfig(),
                num_workers=2,
                use_gpu=False,
                resources_per_worker={"GPU": 1})

    # GPUs should not be set to 0 if `use_gpu` is True.
    with pytest.raises(ValueError):
        Trainer(CudaTestConfig(),
                num_workers=2,
                use_gpu=True,
                resources_per_worker={"GPU": 0})

    def get_resources():
        return os.environ["CUDA_VISIBLE_DEVICES"]

    # 0 GPUs will be requested and should not raise an error.
    trainer = Trainer(CudaTestConfig(), num_workers=2, use_gpu=False)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["", ""]
    trainer.shutdown()

    # 1 GPU will be requested and should not raise an error.
    trainer = Trainer(CudaTestConfig(), num_workers=2, use_gpu=True)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1", "0,1"]
    trainer.shutdown()

    # Partial GPUs should not raise an error.
    trainer = Trainer(CudaTestConfig(),
                      num_workers=2,
                      use_gpu=True,
                      resources_per_worker={"GPU": 0.1})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0", "0"]
    trainer.shutdown()

    # Multiple GPUs should not raise an error.
    trainer = Trainer(CudaTestConfig(),
                      num_workers=2,
                      use_gpu=True,
                      resources_per_worker={"GPU": 2})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1,2,3", "0,1,2,3"]
    trainer.shutdown()
Example #13
def main(num_workers, use_gpu, kwargs):
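    """Runs the Horovod training function across the workers and prints the
    losses returned by each worker."""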
    trainer = Trainer("horovod", use_gpu=use_gpu, num_workers=num_workers)
    trainer.start()
    loss_per_epoch = trainer.run(train_func, config=kwargs)
    trainer.shutdown()
    print(loss_per_epoch)
Example #14
def test_gpu_requests(ray_start_4_cpus_4_gpus_4_extra):
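    """GPU-request checks with CUDA_VISIBLE_DEVICES sharing enabled via the
    ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV environment variable."""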
    # GPUs should not be requested if `use_gpu` is False.
    with pytest.raises(ValueError):
        Trainer(
            TestConfig(),
            num_workers=2,
            use_gpu=False,
            resources_per_worker={"GPU": 1})

    # GPUs should not be set to 0 if `use_gpu` is True.
    with pytest.raises(ValueError):
        Trainer(
            TestConfig(),
            num_workers=2,
            use_gpu=True,
            resources_per_worker={"GPU": 0})

    def get_resources():
        return os.environ["CUDA_VISIBLE_DEVICES"]

    os.environ[ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV] = "1"

    # 0 GPUs will be requested and should not raise an error.
    trainer = Trainer(TestConfig(), num_workers=2, use_gpu=False)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["", ""]
    trainer.shutdown()

    # 1 GPU will be requested and should not raise an error.
    trainer = Trainer(TestConfig(), num_workers=2, use_gpu=True)
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1", "0,1"]
    trainer.shutdown()

    # Partial GPUs should not raise an error.
    trainer = Trainer(
        TestConfig(),
        num_workers=2,
        use_gpu=True,
        resources_per_worker={"GPU": 0.1})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0", "0"]
    trainer.shutdown()

    # Multiple GPUs should not raise an error.
    trainer = Trainer(
        TestConfig(),
        num_workers=2,
        use_gpu=True,
        resources_per_worker={"GPU": 2})
    trainer.start()
    result = trainer.run(get_resources)
    assert result == ["0,1,2,3", "0,1,2,3"]
    trainer.shutdown()
Example #15
from ray import train
from ray.train import Trainer
from ray.train.callbacks import MLflowLoggerCallback, TBXLoggerCallback


def train_func():
    for i in range(3):
        train.report(epoch=i)


trainer = Trainer(backend="torch", num_workers=2)
trainer.start()

# Run the training function, logging all the intermediate results
# to MLflow and Tensorboard.
result = trainer.run(
    train_func,
    callbacks=[
        MLflowLoggerCallback(experiment_name="train_experiment"),
        TBXLoggerCallback(),
    ],
)

# Print the latest run directory and keep note of it.
# For example: /home/ray_results/train_2021-09-01_12-00-00/run_001
print("Run directory:", trainer.latest_run_dir)

trainer.shutdown()
Example #16
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--address",
                        required=False,
                        type=str,
                        help="the address to use for Ray")
    parser.add_argument(
        "--num-workers",
        "-n",
        type=int,
        default=2,
        help="Sets number of workers for training.",
    )
    parser.add_argument("--use-gpu",
                        action="store_true",
                        default=False,
                        help="Enables GPU training")

    args = parser.parse_args()

    ray.init(address=args.address)

    callbacks = [TorchTensorboardProfilerCallback(), TBXLoggerCallback()]
    trainer = Trainer(backend="torch",
                      num_workers=args.num_workers,
                      use_gpu=args.use_gpu)
    trainer.start()
    trainer.run(train_func, callbacks=callbacks)
    trainer.shutdown()