Example #1
def train_example(num_workers=1, use_gpu=False):
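    """Train for five epochs with Ray SGD's TorchTrainer, then validate
    and print the learned weight and bias."""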
    CustomTrainingOperator = TrainingOperator.from_creators(
        model_creator=model_creator, optimizer_creator=optimizer_creator,
        data_creator=data_creator, scheduler_creator=scheduler_creator,
        loss_creator=nn.MSELoss)
    trainer1 = TorchTrainer(
        training_operator_cls=CustomTrainingOperator,
        num_workers=num_workers,
        use_gpu=use_gpu,
        config={
            "lr": 1e-2,  # used in optimizer_creator
            "hidden_size": 1,  # used in model_creator
            "batch_size": 4,  # used in data_creator
        },
        backend="gloo",
        scheduler_step_freq="epoch")
    for i in range(5):
        stats = trainer1.train()
        print(stats)

    print(trainer1.validate())
    m = trainer1.get_model()
    print("trained weight: % .2f, bias: % .2f" % (
        m.weight.item(), m.bias.item()))
    trainer1.shutdown()
    print("success!")
Example #2
def main():
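    """Connect to Ray and train with optional fp16 (apex AMP) and a tqdm
    progress bar."""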
    setup_default_logging()

    args, args_text = parse_args()
    if args.smoke_test:
        ray.init(num_cpus=int(args.ray_num_workers))
    else:
        ray.init(address=args.ray_address)

    CustomTrainingOperator = TrainingOperator.from_creators(
        model_creator=model_creator,
        optimizer_creator=optimizer_creator,
        data_creator=data_creator,
        loss_creator=loss_creator)
    trainer = TorchTrainer(training_operator_cls=CustomTrainingOperator,
                           use_tqdm=True,
                           use_fp16=args.amp,
                           apex_args={"opt_level": "O1"},
                           config={
                               "args": args,
                               BATCH_SIZE: args.batch_size
                           },
                           num_workers=args.ray_num_workers)

    if args.smoke_test:
        args.epochs = 1

    pbar = trange(args.epochs, unit="epoch")
    for i in pbar:
        trainer.train(num_steps=1 if args.smoke_test else None)

        val_stats = trainer.validate(num_steps=1 if args.smoke_test else None)
        pbar.set_postfix(dict(acc=val_stats["val_accuracy"]))

    trainer.shutdown()
Example #3
def test_correctness(ray_start_2_cpus, num_workers, use_local):
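    """The PTL-based and hand-written operators should produce identical
    stats when trained on the same layer and data."""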
    layer = nn.Linear(1, 1)
    ptl_op = TrainingOperator.from_ptl(PTL_Module)
    trainer1 = TorchTrainer(training_operator_cls=ptl_op,
                            config={
                                "layer": layer,
                                "data_size": 3,
                                "batch_size": 1
                            },
                            num_workers=num_workers,
                            use_local=use_local)
    train1_stats = trainer1.train()
    val1_stats = trainer1.validate()
    trainer1.shutdown()

    trainer2 = TorchTrainer(training_operator_cls=CorrectnessOperator,
                            scheduler_step_freq="manual",
                            config={
                                "layer": layer,
                                "data_size": 3,
                                "batch_size": 1
                            },
                            num_workers=num_workers,
                            use_local=use_local)
    train2_stats = trainer2.train()
    val2_stats = trainer2.validate()
    trainer2.shutdown()

    assert train1_stats["train_loss"] == train2_stats["train_loss"]
    assert val1_stats["val_loss"] == val2_stats["val_loss"]
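    # The two operators report accuracy under different keys
    # ("val_acc" vs. "val_accuracy").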
    assert val1_stats["val_acc"] == val2_stats["val_accuracy"]
Example #4
def train_mnist(num_workers=1, use_gpu=False, num_epochs=5):
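    """Train the LitMNIST Lightning module and save a checkpoint."""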
    Operator = TrainingOperator.from_ptl(LitMNIST)
    trainer = TorchTrainer(
        training_operator_cls=Operator,
        num_workers=num_workers,
        config={"lr": 1e-3, "batch_size": 64},
        use_gpu=use_gpu,
        use_tqdm=True,
    )
    for i in range(num_epochs):
        stats = trainer.train()
        print(stats)

    print(trainer.validate())
    print("Saving model checkpoint to ./model.pt")
    trainer.save("./model.pt")
    print("Model Checkpointed!")
    trainer.shutdown()
    print("success!")
Example #5
    return train_loader, validation_loader


def optimizer_creator(model, config):
    """Returns optimizer"""
    return torch.optim.SGD(model.parameters(),
                           lr=config.get("lr", 0.1),
                           momentum=config.get("momentum", 0.9))


ray.init(address="auto" if not args.smoke_test else None, log_to_driver=True)
num_training_workers = 1 if args.smoke_test else 3

CustomTrainingOperator = TrainingOperator.from_creators(
    model_creator=ResNet18,
    optimizer_creator=optimizer_creator,
    data_creator=cifar_creator,
    loss_creator=nn.CrossEntropyLoss)

TorchTrainable = TorchTrainer.as_trainable(
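    # as_trainable produces a Tune-compatible Trainable class for tune.run.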
    training_operator_cls=CustomTrainingOperator,
    initialization_hook=initialization_hook,
    num_workers=num_training_workers,
    config={
        "test_mode": args.smoke_test,
        BATCH_SIZE: 128 * num_training_workers,
    },
    use_gpu=not args.smoke_test,
    backend="gloo",  # This should also work with NCCL
)
Example #6
def get_custom_training_operator(lr_reduce_on_plateau=False):
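    """Build a TrainingOperator, optionally attaching an LR scheduler."""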
    return TrainingOperator.from_creators(
        model_creator=model_creator, optimizer_creator=optimizer_creator,
        data_creator=data_creator, loss_creator=nn.MSELoss,
        scheduler_creator=scheduler_creator if lr_reduce_on_plateau
        else None)
Example #7
    return torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.9)

# __torch_scheduler_end__

# __torch_ray_start__
import ray

ray.init()
# or ray.init(address="auto") to connect to a running cluster.
# __torch_ray_end__

# __backwards_compat_start__
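# Backwards compatibility: wrap the old creator-function API in a TrainingOperator.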
from ray.util.sgd import TorchTrainer

MyTrainingOperator = TrainingOperator.from_creators(
    model_creator=model_creator, optimizer_creator=optimizer_creator,
    loss_creator=loss_creator, scheduler_creator=scheduler_creator,
    data_creator=data_creator)

trainer = TorchTrainer(
    training_operator_cls=MyTrainingOperator,
    scheduler_step_freq="epoch",  # if scheduler_creator is passed in
    config={"lr": 0.001, "batch_size": 64})

# __backwards_compat_end__

trainer.shutdown()

# __torch_trainer_start__
from ray.util.sgd import TorchTrainer

trainer = TorchTrainer(
    # NOTE: the source snippet is truncated at this call; the arguments
    # below are an illustrative completion, not necessarily the originals.
    training_operator_cls=MyTrainingOperator,
    num_workers=2,
    use_gpu=False,
    config={"lr": 0.001, "batch_size": 64})
Example #8
        self.loss = nn.MSELoss()

    def train_dataloader(self):
        return self.train_loader

    def val_dataloader(self):
        return self.val_loader

    def on_save_checkpoint(self, checkpoint):
        checkpoint["int"] = self.rand_int

    def on_load_checkpoint(self, checkpoint):
        self.rand_int = checkpoint["int"]


Operator = TrainingOperator.from_ptl(PTL_Module)


@pytest.mark.parametrize("use_local", [True, False])
def test_single_step(ray_start_2_cpus, use_local):  # noqa: F811
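    """A single train or validation step should process exactly one batch."""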
    trainer = TorchTrainer(training_operator_cls=Operator,
                           num_workers=1,
                           use_local=use_local,
                           use_gpu=False)
    metrics = trainer.train(num_steps=1)
    assert metrics[BATCH_COUNT] == 1

    val_metrics = trainer.validate(num_steps=1)
    assert val_metrics[BATCH_COUNT] == 1
    trainer.shutdown()
Example #9
    parser.add_argument(
        "--use-gpu",
        action="store_true",
        default=False,
        help="Enables GPU training")
    parser.add_argument(
        "--lr-reduce-on-plateau",
        action="store_true",
        default=False,
        help="If enabled, use a ReduceLROnPlateau scheduler. If not set, "
             "no scheduler is used."
    )

    args, _ = parser.parse_known_args()

    if args.smoke_test:
        ray.init(num_cpus=3)
    else:
        ray.init(address=args.address)
    CustomTrainingOperator = TrainingOperator.from_creators(
        model_creator=model_creator, optimizer_creator=optimizer_creator,
        data_creator=data_creator, loss_creator=nn.MSELoss,
        scheduler_creator=scheduler_creator if args.lr_reduce_on_plateau
        else None)
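    # ReduceLROnPlateau must be stepped manually with a metric, so the
    # manual Tune variant is used when that scheduler is enabled.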
    if not args.lr_reduce_on_plateau:
        tune_example(CustomTrainingOperator, num_workers=args.num_workers,
                     use_gpu=args.use_gpu)
    else:
        tune_example_manual(CustomTrainingOperator,
                            num_workers=args.num_workers, use_gpu=args.use_gpu)
Example #10
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--smoke-test", action="store_true", help="Finish quickly for testing")
    parser.add_argument(
        "--address",
        type=str,
        help="the address to use for Ray")
    parser.add_argument(
        "--num-workers",
        "-n",
        type=int,
        default=1,
        help="Sets number of workers for training.")
    parser.add_argument(
        "--use-gpu",
        action="store_true",
        default=False,
        help="Enables GPU training")

    args, _ = parser.parse_known_args()

    if args.smoke_test:
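        # Use a small local Ray instance for the smoke test;
        # otherwise connect to the given cluster address.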
        ray.init(num_cpus=2)
    else:
        ray.init(address=args.address)
    CustomTrainingOperator = TrainingOperator.from_creators(
        model_creator=model_creator, optimizer_creator=optimizer_creator,
        data_creator=data_creator, loss_creator=nn.MSELoss)
    tune_example(CustomTrainingOperator, num_workers=args.num_workers,
                 use_gpu=args.use_gpu)
Example #11
                              batch_size=config["batch"])
    validation_loader = DataLoader(CIFAR10(root="~/data",
                                           download=True,
                                           transform=tfms),
                                   batch_size=config["batch"])
    return train_loader, validation_loader


def optimizer_creator(model, config):
    """Returns an optimizer (or multiple)"""
    return torch.optim.SGD(model.parameters(), lr=config["lr"])


CustomTrainingOperator = TrainingOperator.from_creators(
    model_creator=ResNet18,  # A function that returns a nn.Module
    optimizer_creator=optimizer_creator,  # A function that returns an optimizer
    data_creator=cifar_creator,  # A function that returns dataloaders
    loss_creator=torch.nn.CrossEntropyLoss  # A loss function
)

ray.init()
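# Use ray.init(address="auto") instead to connect to an existing cluster.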

trainer = TorchTrainer(
    training_operator_cls=CustomTrainingOperator,
    config={
        "lr": 0.01,  # used in optimizer_creator
        "batch": 64  # used in data_creator
    },
    num_workers=2,  # amount of parallelism
    use_gpu=torch.cuda.is_available(),
    use_tqdm=True)