Example 1
def test_restore_checkpoint(ray_start_2_cpus):  # noqa: F811
    trainable_cls = DistributedTrainableCreator(_train_simple, num_workers=2)
    trainer = trainable_cls(config={"epochs": 1})
    trainer.train()
    checkpoint_dir = trainer.save()
    trainer.restore(checkpoint_dir)
    trainer.stop()
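
These tests rely on a _train_simple helper defined elsewhere in the test module. A minimal sketch of what such a trainable could look like, modeled on the checkpointing pattern used in the later examples (the model, optimizer, and reported metric are placeholders, not the actual helper):

import os

import torch
from ray import tune
from ray.tune.integration.torch import distributed_checkpoint_dir


def _train_simple(config, checkpoint_dir=None):
    # Placeholder model and optimizer standing in for the real test helper.
    model = torch.nn.Linear(1, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

    # Restore from a previous checkpoint if Tune passes one in.
    if checkpoint_dir:
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"))
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    for epoch in range(config.get("epochs", 10)):
        # ... training step elided ...
        with distributed_checkpoint_dir(step=epoch) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save((model.state_dict(), optimizer.state_dict()), path)
        tune.report(mean_loss=0.0)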
Example 2
def test_simple_tune(ray_start_4_cpus, enabled_checkpoint):
    trainable_cls = DistributedTrainableCreator(_train_simple, num_workers=2)
    analysis = tune.run(trainable_cls,
                        config={"enable_checkpoint": enabled_checkpoint},
                        num_samples=2,
                        stop={"training_iteration": 2})
    assert analysis.trials[0].last_result["training_iteration"] == 2
    assert analysis.trials[0].has_checkpoint() == enabled_checkpoint
Example 3
def test_save_checkpoint(ray_start_2_cpus):  # noqa: F811
    trainable_cls = DistributedTrainableCreator(_train_simple, num_workers=2)
    trainer = trainable_cls(config={"epochs": 1})
    trainer.train()
    checkpoint_dir = trainer.save()
    # Verify that the saved checkpoint file can be loaded back.
    model_state_dict, opt_state_dict = torch.load(
        os.path.join(checkpoint_dir, "checkpoint"))
    trainer.stop()
Example 4
def test_colocated(ray_4_node):  # noqa: F811
    assert ray.available_resources()["CPU"] == 4
    trainable_cls = DistributedTrainableCreator(_train_check_global,
                                                num_workers=4,
                                                num_workers_per_host=1)
    trainable = trainable_cls()
    assert ray.available_resources().get("CPU", 0) == 0
    trainable.train()
    trainable.stop()
Example 5
def test_colocated_gpu_double(ray_4_node_gpu):  # noqa: F811
    assert ray.available_resources()["GPU"] == 8
    trainable_cls = DistributedTrainableCreator(_train_check_global,
                                                num_workers=8,
                                                num_gpus_per_worker=1,
                                                num_workers_per_host=2,
                                                timeout_s=30)
    trainable = trainable_cls()
    print("?????")
    print(ray.available_resources().get("GPU"))
    assert ray.available_resources().get("GPU", 0) == 0
    trainable.train()
    trainable.stop()
Example 6
def run_ddp_tune(num_workers, num_gpus_per_worker, workers_per_node=None):
    trainable_cls = DistributedTrainableCreator(
        train_mnist,
        num_workers=num_workers,
        num_gpus_per_worker=num_gpus_per_worker,
        num_workers_per_host=workers_per_node)

    analysis = tune.run(trainable_cls,
                        num_samples=4,
                        stop={"training_iteration": 10},
                        metric="mean_accuracy",
                        mode="max")

    print("Best hyperparameters found were: ", analysis.best_config)
Example 7
                        help="enables CUDA training")
    parser.add_argument("--cluster",
                        action="store_true",
                        default=False,
                        help="enables multi-node tuning")
    parser.add_argument(
        "--workers-per-node",
        type=int,
        help="Forces workers to be colocated on machines if set.")

    args = parser.parse_args()

    if args.cluster:
        options = dict(address="auto")
    else:
        options = dict(num_cpus=2)
    ray.init(**options)
    trainable_cls = DistributedTrainableCreator(
        train_mnist,
        num_workers=args.num_workers,
        num_gpus_per_worker=args.num_gpus_per_worker,
        num_workers_per_host=args.workers_per_node)

    analysis = tune.run(trainable_cls,
                        num_samples=4,
                        stop={"training_iteration": 10},
                        metric="mean_accuracy",
                        mode="max")

    print("Best hyperparameters found were: ", analysis.best_config)
Example 8
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    model = DistributedDataParallel(model)

    for epoch in range(400):
        train(model, optimizer, train_loader, device)
        acc = test(model, test_loader, device)

        if epoch % 3 == 0:
            with distributed_checkpoint_dir(step=epoch) as checkpoint_dir:
                path = os.path.join(checkpoint_dir, "checkpoint")
                torch.save((model.state_dict(), optimizer.state_dict()), path)
        tune.report(mean_accuracy=acc)


if __name__ == "__main__":
    training_iter = 10
    n_samples = 4

    if os.environ.get("ANYSCALE_HOST", None):
        print("Running on Anyscale")
        training_iter = 20
        n_samples = 10

    ray.init()
    trainable_cls = DistributedTrainableCreator(train_mnist)
    tune.run(trainable_cls,
             num_samples=n_samples,
             stop={"training_iteration": training_iter})
Example 9
                        "-n",
                        type=int,
                        default=2,
                        help="Sets number of workers for training.")
    parser.add_argument("--use-gpu",
                        action="store_true",
                        default=False,
                        help="enables CUDA training")
    parser.add_argument("--cluster",
                        action="store_true",
                        default=False,
                        help="enables multi-node tuning")

    args = parser.parse_args()

    if args.cluster:
        options = dict(address="auto")
    else:
        options = dict(num_cpus=2)
    ray.init(**options)
    trainable_cls = DistributedTrainableCreator(train_mnist,
                                                num_workers=args.num_workers,
                                                use_gpu=args.use_gpu)
    analysis = tune.run(trainable_cls,
                        num_samples=4,
                        stop={"training_iteration": 10},
                        metric="mean_accuracy",
                        mode="max")

    print("Best hyperparameters found were: ", analysis.best_config)
Example 10
def test_validation(ray_start_2_cpus):  # noqa: F811
    def bad_func(a, b, c):
        return 1

    with pytest.raises(ValueError):
        DistributedTrainableCreator(bad_func, num_workers=2)
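
For contrast, a function that accepts config and an optional checkpoint_dir should pass this validation. A minimal sketch (good_func is a hypothetical name):

def good_func(config, checkpoint_dir=None):
    return 1


# Constructing the trainable class should not raise here.
DistributedTrainableCreator(good_func, num_workers=2)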
Example 11
def test_set_global(ray_start_2_cpus):  # noqa: F811
    trainable_cls = DistributedTrainableCreator(
        _train_check_global, num_workers=2)
    trainable = trainable_cls()
    result = trainable.train()
    assert result["is_distributed"]
Example 12
def test_single_step(ray_start_2_cpus):  # noqa: F811
    trainable_cls = DistributedTrainableCreator(_train_simple, num_workers=2)
    trainer = trainable_cls()
    trainer.train()
    trainer.stop()
Example 13
def test_step_after_completion(ray_start_2_cpus):  # noqa: F811
    trainable_cls = DistributedTrainableCreator(_train_simple, num_workers=2)
    trainer = trainable_cls(config={"epochs": 1})
    with pytest.raises(RuntimeError):
        for i in range(10):
            trainer.train()
Example 14
def test_validate_session(ray_start_2_cpus):
    trainable_cls = DistributedTrainableCreator(_train_validate_session)
    tune.run(trainable_cls)
Example 15
    loss = torch.Tensor([0.0])
    for epoch in adl.remaining_epochs_until(config.get("epochs", 10)):
        for (x, y) in dataloader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            output = model(x)
            loss = loss_fn(output, y)
            loss.backward()
            optimizer.step()

        tune.report(mean_loss=loss.item())


ray.init(address="auto", _tracing_startup_hook=None)

trainable_cls = DistributedTrainableCreator(_train_simple)

config_0 = {"epochs": 50}
config_1 = {
    "epochs": 60,
    "H": tune.choice([8, 12]),
    "N": tune.grid_search(list(range(32, 64, 8)))
}

analysis = tune.run(
    trainable_cls,
    num_samples=1,  # total trials will be num_samples x points on the grid
    scheduler=AdaptDLScheduler(),
    config=config_1,
    metric="mean_loss",
    mode="min")
Example 16
            with distributed_checkpoint_dir(step=epoch) as checkpoint_dir:
                path = os.path.join(checkpoint_dir, "checkpoint")
                torch.save((model.state_dict(), optimizer.state_dict()), path)
        tune.report(mean_accuracy=acc)


if __name__ == "__main__":

    training_iter = 50
    n_samples = 30

    # Running ray.init() directly from an Anyscale project does not appear to
    # work with incoming connections on Anyscale, just like locally.
    # ray.init()
    # The following attaches to an existing Ray instance on a laptop; it has
    # also worked when running pushed code in an Anyscale session.
    ray.init(address='auto', _redis_password='******')
    # The following is for Anyscale Connect verification; issues with the
    # current nightly are preventing it from working at the moment.
    # import anyscale
    # anyscale\
    #     .require("./requirements.txt")\
    #     .session(session_name="new-sbox")\
    #     .connect()
    print("Starting Training")

    trainable_cls = DistributedTrainableCreator(train_mnist, num_workers=2)
    tune.run(trainable_cls,
             num_samples=n_samples,
             stop={"training_iteration": training_iter})
Example 17
        acc = test(model, test_loader, device)

        if epoch % 3 == 0:
            with distributed_checkpoint_dir(step=epoch) as checkpoint_dir:
                path = os.path.join(checkpoint_dir, "checkpoint")
                torch.save((model.state_dict(), optimizer.state_dict()), path)
        tune.report(mean_accuracy=acc)


if __name__ == "__main__":
    training_iter = 10
    n_samples = 4
    n_workers = 0
    if os.environ.get("ANYSCALE_HOST", None):
        training_iter = 20
        n_samples = 10
        n_workers = 1

    ray.init()
    gpu_resources = 0
    if "GPU" in ray.available_resources():
        print("GPU ENABLED")
        gpu_resources = 0.1

    trainable_cls = DistributedTrainableCreator(train_mnist, n_workers, 1,
                                                gpu_resources)
    tune.run(
        trainable_cls,
        num_samples=n_samples,
        stop={"training_iteration": training_iter},
    )
Example 18
import ray
from ray import tune
from ray.tune.integration.torch import DistributedTrainableCreator
from ray.util.sgd.torch import is_distributed_trainable

ray.init()


def my_trainable(config, checkpoint_dir=None):
    if is_distributed_trainable():
        pass


trainable = DistributedTrainableCreator(
    my_trainable,
    use_gpu=True,
    num_workers=4,
    num_cpus_per_worker=1,
)

config = {}

tune.run(
    trainable,
    resources_per_trial=None,
    config=config,
)