Example #1
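Fault-tolerance test: `PyTorchTrainer._train_epoch` is patched so that the first worker is killed until two failures have occurred, and `train(max_retries=2)` has to recover from both failures to complete.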
def test_fail_twice(ray_start_2_cpus):  # noqa: F811
    if not dist.is_available():
        return

    def single_loader(config):
        return LinearDataset(2, 5, size=1000000)

    def step_with_fail(self, *args, **kwargs):
        # Launch a training epoch on every worker, then kill the first
        # worker until two failures have been injected, so the trainer's
        # retry logic has to recover twice.
        worker_stats = [
            w.train_epoch.remote(*args, **kwargs) for w in self.workers
        ]
        if self._num_failures < 2:
            time.sleep(1)
            self.workers[0].__ray_kill__()
        success = check_for_failure(worker_stats)
        return success, worker_stats

    with patch.object(PyTorchTrainer, "_train_epoch", step_with_fail):
        trainer1 = PyTorchTrainer(model_creator,
                                  single_loader,
                                  optimizer_creator,
                                  batch_size=100000,
                                  loss_creator=lambda config: nn.MSELoss(),
                                  num_replicas=2)

        trainer1.train(max_retries=2)
Example #2
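Runs exactly one training step and one validation step on a single replica and checks that one batch was processed in each case.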
def test_single_step(ray_start_2_cpus):  # noqa: F811
    trainer = PyTorchTrainer(model_creator,
                             data_creator,
                             optimizer_creator,
                             loss_creator=lambda config: nn.MSELoss(),
                             num_replicas=1)
    metrics = trainer.train(num_steps=1)
    assert metrics[BATCH_COUNT] == 1

    val_metrics = trainer.validate(num_steps=1)
    assert val_metrics[BATCH_COUNT] == 1
Example #3
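A complete training script: builds a `PyTorchTrainer` for ResNet18 on CIFAR with an LR scheduler, optional GPU, NCCL, and fp16 support, trains for `num_epochs`, validates, and shuts down.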
def train_example(num_replicas=1,
                  num_epochs=5,
                  use_gpu=False,
                  use_fp16=False,
                  test_mode=False):
    trainer1 = PyTorchTrainer(
        ResNet18,
        cifar_creator,
        optimizer_creator,
        nn.CrossEntropyLoss,
        scheduler_creator=scheduler_creator,
        initialization_hook=initialization_hook,
        num_replicas=num_replicas,
        config={
            "lr": 0.01,
            "test_mode": test_mode
        },
        use_gpu=use_gpu,
        batch_size=16 if test_mode else 512,
        backend="nccl" if use_gpu else "gloo",
        scheduler_step_freq="epoch",
        use_fp16=use_fp16)
    for i in range(num_epochs):
        # Increase `max_retries` to turn on fault tolerance.
        stats = trainer1.train(max_retries=0)
        print(stats)

    print(trainer1.validate())
    trainer1.shutdown()
    print("success!")
Example #4
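Sweeps every combination of one or two models, optimizers, and schedulers and asserts inside a custom `train_epoch` that the training operator receives the expected number of each.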
def test_multi_model_matrix(ray_start_2_cpus, num_replicas):  # noqa: F811
    def train_epoch(self, iterator, info):
        if self.config.get("models", 1) > 1:
            assert len(self.models) == self.config["models"], self.config

        if self.config.get("optimizers", 1) > 1:
            assert len(
                self.optimizers) == self.config["optimizers"], self.config

        if self.config.get("schedulers", 1) > 1:
            assert len(
                self.schedulers) == self.config["schedulers"], self.config
        return {"done": 1}

    def multi_model_creator(config):
        models = []
        for i in range(config.get("models", 1)):
            models += [nn.Linear(1, 1)]
        return models[0] if len(models) == 1 else models

    def multi_optimizer_creator(models, config):
        optimizers = []
        main_model = models[0] if type(models) is list else models
        for i in range(config.get("optimizers", 1)):
            optimizers += [torch.optim.SGD(main_model.parameters(), lr=0.0001)]
        return optimizers[0] if len(optimizers) == 1 else optimizers

    def multi_scheduler_creator(optimizer, config):
        schedulers = []
        main_opt = optimizer[0] if type(optimizer) is list else optimizer
        for i in range(config.get("schedulers", 1)):
            schedulers += [
                torch.optim.lr_scheduler.StepLR(main_opt,
                                                step_size=30,
                                                gamma=0.1)
            ]
        return schedulers[0] if len(schedulers) == 1 else schedulers

    for model_count in range(1, 3):
        for optimizer_count in range(1, 3):
            for scheduler_count in range(1, 3):
                trainer = PyTorchTrainer(
                    multi_model_creator,
                    data_creator,
                    multi_optimizer_creator,
                    loss_creator=nn.MSELoss,
                    scheduler_creator=multi_scheduler_creator,
                    training_operator_cls=_TestingOperator,
                    num_replicas=num_replicas,
                    config={
                        "models": model_count,
                        "optimizers": optimizer_count,
                        "schedulers": scheduler_count,
                        "custom_func": train_epoch
                    })
                trainer.train()
                trainer.shutdown()
Example #5
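Calls `update_scheduler` twice with a validation metric and verifies via `apply_all_operators` that each operator's `ReduceLROnPlateau` scheduler reports `last_epoch == 2`.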
def test_scheduler_validate(ray_start_2_cpus):  # noqa: F811
    from torch.optim.lr_scheduler import ReduceLROnPlateau

    trainer = PyTorchTrainer(
        model_creator,
        data_creator,
        optimizer_creator,
        loss_creator=lambda config: nn.MSELoss(),
        scheduler_creator=lambda optimizer, cfg: ReduceLROnPlateau(optimizer),
        training_operator_cls=_TestingOperator)
    trainer.update_scheduler(0.5)
    trainer.update_scheduler(0.5)
    assert all(
        trainer.apply_all_operators(
            lambda op: op.schedulers[0].last_epoch == 2))
    trainer.shutdown()
Example #6
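Trains for three epochs, validates, trains for three more, and asserts that neither the training loss nor the validation loss has increased.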
def test_train(ray_start_2_cpus, num_replicas):  # noqa: F811
    trainer = PyTorchTrainer(model_creator,
                             data_creator,
                             optimizer_creator,
                             loss_creator=lambda config: nn.MSELoss(),
                             num_replicas=num_replicas)
    for i in range(3):
        train_loss1 = trainer.train()["mean_train_loss"]
    validation_loss1 = trainer.validate()["mean_validation_loss"]

    for i in range(3):
        train_loss2 = trainer.train()["mean_train_loss"]
    validation_loss2 = trainer.validate()["mean_validation_loss"]

    assert train_loss2 <= train_loss1, (train_loss2, train_loss1)
    assert validation_loss2 <= validation_loss1, (validation_loss2,
                                                  validation_loss1)
Example #7
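Checks that the `scheduler_step_freq` passed to the trainer is forwarded to the training operator through the `info` dict under the `SCHEDULER_STEP` key.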
def test_scheduler_freq(ray_start_2_cpus, scheduler_freq):  # noqa: F811
    def train_epoch(self, iterator, info):
        assert info[SCHEDULER_STEP] == scheduler_freq
        return {"done": 1}

    def scheduler_creator(optimizer, config):
        return torch.optim.lr_scheduler.StepLR(optimizer,
                                               step_size=30,
                                               gamma=0.1)

    trainer = PyTorchTrainer(model_creator,
                             data_creator,
                             optimizer_creator,
                             loss_creator=lambda config: nn.MSELoss(),
                             config={"custom_func": train_epoch},
                             training_operator_cls=_TestingOperator,
                             scheduler_creator=scheduler_creator,
                             scheduler_step_freq=scheduler_freq)

    for i in range(3):
        trainer.train()
    trainer.shutdown()
Example #8
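Trains two models with separate optimizers through a custom training function, saves a checkpoint, restores it into a second trainer, and verifies that both restored models have identical state dicts to the originals.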
def test_multi_model(ray_start_2_cpus, num_replicas):
    def train(*, model=None, criterion=None, optimizer=None, dataloader=None):
        model.train()
        train_loss = 0
        correct = 0
        total = 0
        for batch_idx, (inputs, targets) in enumerate(dataloader):
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
        return {
            "accuracy": correct / total,
            "train_loss": train_loss / (batch_idx + 1)
        }

    def train_epoch(self, iterator, info):
        result = {}
        for i, (model,
                optimizer) in enumerate(zip(self.models, self.optimizers)):
            result["model_{}".format(i)] = train(model=model,
                                                 criterion=self.criterion,
                                                 optimizer=optimizer,
                                                 dataloader=iterator)
        return result

    def multi_model_creator(config):
        return nn.Linear(1, 1), nn.Linear(1, 1)

    def multi_optimizer_creator(models, config):
        opts = [
            torch.optim.SGD(model.parameters(), lr=0.0001) for model in models
        ]
        return opts[0], opts[1]

    trainer1 = PyTorchTrainer(multi_model_creator,
                              data_creator,
                              multi_optimizer_creator,
                              loss_creator=lambda config: nn.MSELoss(),
                              config={"custom_func": train_epoch},
                              training_operator_cls=_TestingOperator,
                              num_replicas=num_replicas)
    trainer1.train()

    filename = os.path.join(tempfile.mkdtemp(), "checkpoint")
    trainer1.save(filename)

    models1 = trainer1.get_model()

    trainer1.shutdown()

    trainer2 = PyTorchTrainer(multi_model_creator,
                              data_creator,
                              multi_optimizer_creator,
                              loss_creator=lambda config: nn.MSELoss(),
                              config={"custom_func": train_epoch},
                              training_operator_cls=_TestingOperator,
                              num_replicas=num_replicas)
    trainer2.restore(filename)

    os.remove(filename)

    models2 = trainer2.get_model()

    # Every restored model must match its saved counterpart
    # parameter-by-parameter.
    for model_1, model_2 in zip(models1, models2):
        model1_state_dict = model_1.state_dict()
        model2_state_dict = model_2.state_dict()

        assert set(model1_state_dict.keys()) == set(model2_state_dict.keys())

        for k in model1_state_dict:
            assert torch.equal(model1_state_dict[k], model2_state_dict[k])

    trainer2.shutdown()
Example #9
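Single-model save/restore test: saves a checkpoint after training, restores it into a fresh trainer, and checks that the two models' state dicts match key by key.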
def test_save_and_restore(ray_start_2_cpus, num_replicas):  # noqa: F811
    trainer1 = PyTorchTrainer(model_creator,
                              data_creator,
                              optimizer_creator,
                              loss_creator=lambda config: nn.MSELoss(),
                              num_replicas=num_replicas)
    trainer1.train()

    filename = os.path.join(tempfile.mkdtemp(), "checkpoint")
    trainer1.save(filename)

    model1 = trainer1.get_model()

    trainer1.shutdown()

    trainer2 = PyTorchTrainer(model_creator,
                              data_creator,
                              optimizer_creator,
                              loss_creator=lambda config: nn.MSELoss(),
                              num_replicas=num_replicas)
    trainer2.restore(filename)

    os.remove(filename)

    model2 = trainer2.get_model()

    model1_state_dict = model1.state_dict()
    model2_state_dict = model2.state_dict()

    assert set(model1_state_dict.keys()) == set(model2_state_dict.keys())

    for k in model1_state_dict:
        assert torch.equal(model1_state_dict[k], model2_state_dict[k])
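All of these examples reference helpers defined elsewhere in their test module: the `ray_start_2_cpus` pytest fixture, the `model_creator`, `data_creator`, and `optimizer_creator` functions, and the `LinearDataset` toy dataset (Examples #3 and #4 use additional helpers such as `cifar_creator` and `_TestingOperator` that are not sketched here). The sketch below shows one plausible set of definitions; the exact sizes, learning rates, and fixture details are assumptions for illustration, not the originals.

# Minimal sketch of the shared fixtures and creator functions assumed above.
import numpy as np
import pytest
import ray
import torch
import torch.nn as nn
from torch.utils.data import Dataset


@pytest.fixture
def ray_start_2_cpus():
    # Start a local Ray instance with two CPUs for the duration of a test.
    address_info = ray.init(num_cpus=2)
    yield address_info
    ray.shutdown()


class LinearDataset(Dataset):
    """Points sampled from y = a * x + b, used as toy regression data."""

    def __init__(self, a, b, size=1000):
        x = np.arange(0, 10, 10 / size, dtype=np.float32)
        self.x = torch.from_numpy(x).unsqueeze(1)
        self.y = torch.from_numpy(a * x + b).unsqueeze(1)

    def __getitem__(self, index):
        return self.x[index], self.y[index]

    def __len__(self):
        return len(self.x)


def model_creator(config):
    # A single linear layer is enough for the toy regression task.
    return nn.Linear(1, 1)


def data_creator(config):
    # Return a (train, validation) pair of datasets.
    return LinearDataset(2, 5), LinearDataset(2, 5, size=400)


def optimizer_creator(model, config):
    return torch.optim.SGD(model.parameters(), lr=config.get("lr", 1e-2))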