Example #1
def test_save_and_restore(ray_start_2_cpus, num_replicas):  # noqa: F811
    trainer1 = PyTorchTrainer(
        model_creator,
        data_creator,
        optimizer_creator,
        loss_creator=lambda config: nn.MSELoss(),
        num_replicas=num_replicas)
    trainer1.train()

    filename = os.path.join(tempfile.mkdtemp(), "checkpoint")
    trainer1.save(filename)

    model1 = trainer1.get_model()

    trainer1.shutdown()

    trainer2 = PyTorchTrainer(
        model_creator,
        data_creator,
        optimizer_creator,
        loss_creator=lambda config: nn.MSELoss(),
        num_replicas=num_replicas)
    trainer2.restore(filename)

    os.remove(filename)

    model2 = trainer2.get_model()

    model1_state_dict = model1.state_dict()
    model2_state_dict = model2.state_dict()

    assert set(model1_state_dict.keys()) == set(model2_state_dict.keys())

    for k in model1_state_dict:
        assert torch.equal(model1_state_dict[k], model2_state_dict[k])

    trainer2.shutdown()
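These tests rely on model_creator, data_creator, and optimizer_creator helpers defined elsewhere in the test module. A minimal sketch of what such creators could look like (the bodies below are assumptions for illustration; only the signatures are implied by the calls in these examples):

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

def model_creator(config):
    # A single linear layer matching the 1-in/1-out synthetic data below.
    return nn.Linear(1, 1)

def data_creator(config):
    # Tiny synthetic regression set: y = 2x plus noise; returns
    # (train_loader, validation_loader).
    x = torch.randn(256, 1)
    y = 2 * x + 0.1 * torch.randn(256, 1)
    dataset = TensorDataset(x, y)
    return DataLoader(dataset, batch_size=32), DataLoader(dataset, batch_size=32)

def optimizer_creator(model, config):
    # Same SGD setup as multi_optimizer_creator in Example #3.
    return torch.optim.SGD(model.parameters(), lr=0.0001)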
Example #2
def train_example(num_replicas=1,
                  num_epochs=5,
                  use_gpu=False,
                  use_fp16=False,
                  test_mode=False):
    config = {TEST_MODE: test_mode}
    trainer1 = PyTorchTrainer(ResNet18,
                              cifar_creator,
                              optimizer_creator,
                              nn.CrossEntropyLoss,
                              scheduler_creator=scheduler_creator,
                              initialization_hook=initialization_hook,
                              num_replicas=num_replicas,
                              config=config,
                              use_gpu=use_gpu,
                              batch_size=16 if test_mode else 512,
                              backend="nccl" if use_gpu else "gloo",
                              scheduler_step_freq="epoch",
                              use_fp16=use_fp16)
    for i in range(num_epochs):
        # Increase `max_retries` to turn on fault tolerance.
        stats = trainer1.train(max_retries=0)
        print(stats)

    print(trainer1.validate())
    trainer1.shutdown()
    print("success!")
Example #3
def test_multi_model(ray_start_2_cpus, num_replicas):  # noqa: F811
    def custom_train(config, models, dataloader, criterion, optimizers,
                     **kwargs):
        result = {}
        for i, (model, optimizer) in enumerate(zip(models, optimizers)):
            result["model_{}".format(i)] = train(config, model, dataloader,
                                                 criterion, optimizer)
        return result

    def multi_model_creator(config):
        return nn.Linear(1, 1), nn.Linear(1, 1)

    def multi_optimizer_creator(models, config):
        opts = [
            torch.optim.SGD(model.parameters(), lr=0.0001) for model in models
        ]
        return opts[0], opts[1]

    trainer1 = PyTorchTrainer(
        multi_model_creator,
        data_creator,
        multi_optimizer_creator,
        loss_creator=lambda config: nn.MSELoss(),
        train_function=custom_train,
        num_replicas=num_replicas)
    trainer1.train()

    filename = os.path.join(tempfile.mkdtemp(), "checkpoint")
    trainer1.save(filename)

    models1 = trainer1.get_model()

    trainer1.shutdown()

    trainer2 = PyTorchTrainer(
        multi_model_creator,
        data_creator,
        multi_optimizer_creator,
        loss_creator=lambda config: nn.MSELoss(),
        num_replicas=num_replicas)
    trainer2.restore(filename)

    os.remove(filename)

    models2 = trainer2.get_model()

    for model_1, model_2 in zip(models1, models2):
        model1_state_dict = model_1.state_dict()
        model2_state_dict = model_2.state_dict()

        assert set(model1_state_dict.keys()) == set(model2_state_dict.keys())

        for k in model1_state_dict:
            assert torch.equal(model1_state_dict[k], model2_state_dict[k])

    trainer2.shutdown()
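The custom_train function above delegates each (model, optimizer) pair to a train helper defined elsewhere in the module. An illustrative sketch of a per-epoch helper with that signature (an assumption, not the library's actual implementation):

def train(config, model, dataloader, criterion, optimizer):
    # One pass over the dataloader; reports the mean training loss.
    model.train()
    total_loss = 0.0
    for features, target in dataloader:
        optimizer.zero_grad()
        loss = criterion(model(features), target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return {"train_loss": total_loss / len(dataloader)}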
Example #4
def train_example(num_replicas=1, use_gpu=False):
    trainer1 = PyTorchTrainer(model_creator,
                              data_creator,
                              optimizer_creator,
                              num_replicas=num_replicas,
                              use_gpu=use_gpu,
                              backend="gloo")
    trainer1.train()
    trainer1.shutdown()
    print("success!")
Example #5
def test_multi_model_matrix(ray_start_2_cpus, num_replicas):  # noqa: F811
    def custom_train(config, model, dataloader, criterion, optimizer,
                     scheduler):
        if config.get("models", 1) > 1:
            assert len(model) == config["models"], config

        if config.get("optimizers", 1) > 1:
            assert len(optimizer) == config["optimizers"], config

        if config.get("schedulers", 1) > 1:
            assert len(scheduler) == config["schedulers"], config
        return {"done": 1}

    def multi_model_creator(config):
        models = []
        for i in range(config.get("models", 1)):
            models += [nn.Linear(1, 1)]
        return models[0] if len(models) == 1 else models

    def multi_optimizer_creator(models, config):
        optimizers = []
        main_model = models[0] if isinstance(models, list) else models
        for i in range(config.get("optimizers", 1)):
            optimizers += [torch.optim.SGD(main_model.parameters(), lr=0.0001)]
        return optimizers[0] if len(optimizers) == 1 else optimizers

    def multi_scheduler_creator(optimizer, config):
        schedulers = []
        main_opt = optimizer[0] if isinstance(optimizer, list) else optimizer
        for i in range(config.get("schedulers", 1)):
            schedulers += [
                torch.optim.lr_scheduler.StepLR(
                    main_opt, step_size=30, gamma=0.1)
            ]
        return schedulers[0] if len(schedulers) == 1 else schedulers

    for model_count in range(1, 3):
        for optimizer_count in range(1, 3):
            for scheduler_count in range(1, 3):
                trainer = PyTorchTrainer(
                    multi_model_creator,
                    data_creator,
                    multi_optimizer_creator,
                    loss_creator=nn.MSELoss,
                    scheduler_creator=multi_scheduler_creator,
                    train_function=custom_train,
                    num_replicas=num_replicas,
                    config={
                        "models": model_count,
                        "optimizers": optimizer_count,
                        "schedulers": scheduler_count
                    })
                trainer.train()
                trainer.shutdown()
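Note the single-versus-list convention the creators above follow: one instance is returned bare, several as a list, and the matrix test exercises every combination. A standalone illustration of that convention:

import torch.nn as nn

def multi_model_creator(config):
    models = [nn.Linear(1, 1) for _ in range(config.get("models", 1))]
    return models[0] if len(models) == 1 else models

assert isinstance(multi_model_creator({}), nn.Module)  # one model: bare module
assert len(multi_model_creator({"models": 2})) == 2  # several: a list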
Example #6
def train_example(num_replicas=1, use_gpu=False):
    trainer1 = PyTorchTrainer(model_creator,
                              data_creator,
                              optimizer_creator,
                              num_replicas=num_replicas,
                              resources_per_replica=Resources(
                                  num_cpus=1,
                                  num_gpus=int(use_gpu),
                                  resources={}))
    trainer1.train()
    trainer1.shutdown()
Example #7
def test_scheduler_validate(ray_start_2_cpus):  # noqa: F811
    from torch.optim.lr_scheduler import ReduceLROnPlateau

    trainer = PyTorchTrainer(
        model_creator,
        data_creator,
        optimizer_creator,
        loss_creator=lambda config: nn.MSELoss(),
        scheduler_creator=lambda optimizer, cfg: ReduceLROnPlateau(optimizer))
    trainer.update_scheduler(0.5)
    trainer.update_scheduler(0.5)
    assert all(
        trainer.apply_all_workers(lambda r: r.schedulers[0].last_epoch == 2))
    trainer.shutdown()
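For reference, the assertion above checks plain PyTorch scheduler semantics: each ReduceLROnPlateau.step(metric) call advances last_epoch by one, so two update_scheduler(0.5) calls leave last_epoch at 2 on every worker. A local single-process equivalent:

import torch
import torch.nn as nn
from torch.optim.lr_scheduler import ReduceLROnPlateau

optimizer = torch.optim.SGD(nn.Linear(1, 1).parameters(), lr=0.1)
scheduler = ReduceLROnPlateau(optimizer)
scheduler.step(0.5)
scheduler.step(0.5)
assert scheduler.last_epoch == 2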
Example #8
def test_scheduler_freq(ray_start_2_cpus, scheduler_freq):  # noqa: F811
    def custom_train(config, model, dataloader, criterion, optimizer,
                     scheduler):
        assert config[SCHEDULER_STEP] == scheduler_freq
        return {"done": 1}

    def scheduler_creator(optimizer, config):
        return torch.optim.lr_scheduler.StepLR(
            optimizer, step_size=30, gamma=0.1)

    trainer = PyTorchTrainer(
        model_creator,
        data_creator,
        optimizer_creator,
        loss_creator=lambda config: nn.MSELoss(),
        scheduler_creator=scheduler_creator,
        scheduler_step_freq=scheduler_freq,
        train_function=custom_train)

    for i in range(3):
        trainer.train()
    trainer.shutdown()
Example #9
def train_example(num_replicas=1, use_gpu=False, test_mode=False):
    config = {"test_mode": test_mode}
    trainer1 = PyTorchTrainer(ResNet18,
                              cifar_creator,
                              optimizer_creator,
                              nn.CrossEntropyLoss,
                              initialization_hook=initialization_hook,
                              train_function=train,
                              validation_function=validate,
                              num_replicas=num_replicas,
                              config=config,
                              use_gpu=use_gpu,
                              batch_size=16 if test_mode else 512,
                              backend="nccl" if use_gpu else "gloo")
    for i in range(5):
        stats = trainer1.train()
        print(stats)

    print(trainer1.validate())
    trainer1.shutdown()
    print("success!")