Example #1
def test_scheduler_validate(ray_start_2_cpus):  # noqa: F811
    from torch.optim.lr_scheduler import ReduceLROnPlateau

    TestOperator = TrainingOperator.from_creators(
        model_creator,
        optimizer_creator,
        data_creator,
        scheduler_creator=lambda optimizer, cfg: ReduceLROnPlateau(optimizer),
        loss_creator=lambda config: nn.MSELoss())
    TestOperator = get_test_operator(TestOperator)
    trainer = TorchTrainer(scheduler_step_freq="manual",
                           training_operator_cls=TestOperator)
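    # With scheduler_step_freq="manual", schedulers are never stepped automatically;
    # each update_scheduler() call below steps ReduceLROnPlateau once, so after two
    # calls last_epoch should be 2.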
    trainer.update_scheduler(0.5)
    trainer.update_scheduler(0.5)
    assert all(
        trainer.apply_all_operators(
            lambda op: op._schedulers[0].last_epoch == 2))
    trainer.shutdown()
Example #2
def test_resize(ray_start_2_cpus, use_local):  # noqa: F811
    if not dist.is_available():
        return

    def single_loader(config):
        dataset = LinearDataset(2, 5, size=1000000)
        return DataLoader(dataset, batch_size=config.get("batch_size", 32))

    start_with_fail = gen_start_with_fail(1)

    TestOperator = TrainingOperator.from_creators(
        model_creator,
        optimizer_creator,
        single_loader,
        loss_creator=lambda config: nn.MSELoss())
    with patch.object(TorchTrainer, "_start_workers", start_with_fail):
        trainer1 = TorchTrainer(training_operator_cls=TestOperator,
                                config={"batch_size": 100000},
                                use_local=use_local,
                                num_workers=2)

        @ray.remote(num_cpus=1)
        class DummyActor:
            def get(self):
                return 1

        dummy_handler = DummyActor.remote()
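        # The dummy actor grabs the CPU freed when a worker fails, so the trainer
        # can only recover with a single worker until the actor is killed below.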
        trainer1.train(max_retries=1)
        assert trainer1.worker_group.num_workers == 1
        assert trainer1._num_failures == 1

        ray.get(dummy_handler.get.remote())
        ray.kill(dummy_handler)
        time.sleep(1)
        # trigger scale up
        trainer1.train()
        assert trainer1.worker_group.num_workers == 2

        trainer1.shutdown(force=True)
Example #3
    def testMultiLoaders(self):
        def three_data_loader(config):
            return (
                LinearDataset(2, 5),
                LinearDataset(2, 5, size=400),
                LinearDataset(2, 5, size=400),
            )

        ThreeOperator = TrainingOperator.from_creators(
            model_creator,
            optimizer_creator,
            three_data_loader,
            loss_creator=loss_creator,
        )

        runner = TorchRunner(training_operator_cls=ThreeOperator)
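        # A data creator may return at most two loaders (train and validation);
        # returning three should raise a ValueError when the operator is set up.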
        with self.assertRaises(ValueError):
            runner.setup_operator()

        runner2 = TorchRunner(training_operator_cls=ThreeOperator)
        with self.assertRaises(ValueError):
            runner2.setup_operator()
Example #4
def test_fail_twice(ray_start_2_cpus):  # noqa: F811
    if not dist.is_available():
        return

    def single_loader(config):
        dataset = LinearDataset(2, 5, size=1000000)
        return DataLoader(dataset, batch_size=config.get("batch_size", 32))

    step_with_fail = gen_step_with_fail(2)

    TestOperator = TrainingOperator.from_creators(
        model_creator,
        optimizer_creator,
        single_loader,
        loss_creator=lambda config: nn.MSELoss())

    with patch.object(TorchTrainer, "_train_epoch", step_with_fail):
        trainer1 = TorchTrainer(training_operator_cls=TestOperator,
                                config={"batch_size": 100000},
                                num_workers=2)

        # MAX RETRIES SHOULD BE ON BY DEFAULT
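        # Two injected epoch failures fit within the default retry budget,
        # so train() completes without raising.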
        trainer1.train()
        trainer1.shutdown()
Example #5
def test_fail_with_recover(ray_start_2_cpus):  # noqa: F811
    if not dist.is_available():
        return

    def single_loader(config):
        dataset = LinearDataset(2, 5, size=1000000)
        return DataLoader(dataset, batch_size=config.get("batch_size", 32))

    step_with_fail = gen_step_with_fail(3)

    TestOperator = TrainingOperator.from_creators(
        model_creator,
        optimizer_creator,
        single_loader,
        loss_creator=lambda config: nn.MSELoss())
    with patch.object(TorchTrainer, "_train_epoch", step_with_fail):
        trainer1 = TorchTrainer(training_operator_cls=TestOperator,
                                config={"batch_size": 100000},
                                num_workers=2)

        with pytest.raises(RuntimeError):
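            # Three injected failures exceed max_retries=1, so training
            # surfaces a RuntimeError instead of recovering.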
            trainer1.train(max_retries=1)

        trainer1.shutdown(force=True)
Example #6
import time
from unittest.mock import patch

import pytest
import torch.nn as nn
import torch.distributed as dist
from torch.utils.data import DataLoader

import ray
from ray.util.sgd.torch import TorchTrainer
from ray.util.sgd.torch.worker_group import RemoteWorkerGroup
from ray.util.sgd.torch.training_operator import TrainingOperator

from ray.util.sgd.torch.examples.train_example import (model_creator,
                                                       optimizer_creator,
                                                       data_creator,
                                                       LinearDataset)

Operator = TrainingOperator.from_creators(model_creator,
                                          optimizer_creator,
                                          data_creator,
                                          loss_creator=nn.MSELoss)


@pytest.fixture
def ray_start_2_cpus():
    address_info = ray.init(num_cpus=2)
    yield address_info
    # The code after the yield will run as teardown code.
    ray.shutdown()
    # Tear down the process group so one failing test doesn't cascade into the rest.
    if dist.is_initialized():
        dist.destroy_process_group()


Example #7
def test_num_steps(ray_start_2_cpus, use_local):
    """Tests if num_steps continues training from the subsampled dataset."""

    def data_creator(config):
        train_dataset = [0] * 5 + [1] * 5
        val_dataset = [0] * 5 + [1] * 5
        return DataLoader(train_dataset, batch_size=config["batch_size"]), \
            DataLoader(val_dataset, batch_size=config["batch_size"])

    batch_size = 1
    Operator = TrainingOperator.from_creators(model_creator, optimizer_creator,
                                              data_creator)

    def train_func(self, iterator, info=None):
        total_sum = 0
        num_items = 0
        for e in iterator:
            total_sum += e
            num_items += 1
        return {"average": total_sum.item() / num_items}

    TestOperator = get_test_operator(Operator)
    trainer = TorchTrainer(
        training_operator_cls=TestOperator,
        num_workers=2,
        use_local=use_local,
        add_dist_sampler=False,
        config={
            "batch_size": batch_size,
            "custom_func": train_func
        })
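    # custom_func replaces the test operator's train/validate step and returns the
    # mean of the items it saw, so the averages below show which slice of the
    # 0/1 dataset each call consumed.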

    # If num_steps not passed, should do one full epoch.
    result = trainer.train()
    # Average of 5 0s and 5 1s
    assert result["average"] == 0.5
    assert result["epoch"] == 1
    val_result = trainer.validate()
    assert val_result["average"] == 0.5

    # Train again with num_steps.
    result = trainer.train(num_steps=5)
    # 5 zeros
    assert result["average"] == 0
    assert result["epoch"] == 2
    val_result = trainer.validate(num_steps=5)
    assert val_result["average"] == 0

    # Should continue where last train run left off.
    result = trainer.train(num_steps=3)
    # 3 ones.
    assert result["average"] == 1
    assert result["epoch"] == 2
    val_result = trainer.validate(num_steps=3)
    assert val_result["average"] == 1

    # Should continue from last train run, and cycle to beginning.
    result = trainer.train(num_steps=5)
    # 2 ones and 3 zeros.
    assert result["average"] == 0.4
    assert result["epoch"] == 3
    val_result = trainer.validate(num_steps=5)
    assert val_result["average"] == 0.4

    # Should continue, and since num_steps not passed in, just finishes epoch.
    result = trainer.train()
    # 2 zeros and 5 ones.
    assert result["average"] == 5 / 7
    assert result["epoch"] == 3
    val_result = trainer.validate()
    assert val_result["average"] == 5 / 7

    trainer.shutdown()
Example #8
    def setUp(self):
        self.Operator = TrainingOperator.from_creators(
            model_creator,
            optimizer_creator,
            create_dataloaders,
            loss_creator=loss_creator)