Example #1
0
def train_example(num_workers=1, use_gpu=False):
    """Run a short regression-training demo with Ray SGD.

    Args:
        num_workers (int): number of parallel training workers.
        use_gpu (bool): whether each worker trains on a GPU.
    """
    # Assemble a TrainingOperator from the creator functions defined
    # elsewhere in this module.
    operator_cls = TrainingOperator.from_creators(
        model_creator=model_creator,
        optimizer_creator=optimizer_creator,
        data_creator=data_creator,
        scheduler_creator=scheduler_creator,
        loss_creator=nn.MSELoss)
    hyperparams = {
        "lr": 1e-2,  # used in optimizer_creator
        "hidden_size": 1,  # used in model_creator
        "batch_size": 4,  # used in data_creator
    }
    trainer1 = TorchTrainer(
        training_operator_cls=operator_cls,
        num_workers=num_workers,
        use_gpu=use_gpu,
        config=hyperparams,
        backend="gloo",
        scheduler_step_freq="epoch")
    # Train for five epochs, echoing the per-epoch stats.
    for _ in range(5):
        print(trainer1.train())

    print(trainer1.validate())
    m = trainer1.get_model()
    print("trained weight: % .2f, bias: % .2f" % (
        m.weight.item(), m.bias.item()))
    trainer1.shutdown()
    print("success!")
Example #2
0
def main():
    """Entry point: parse CLI args, start Ray, and run the training loop."""
    setup_default_logging()

    args, args_text = parse_args()

    # Smoke tests spin up a small local Ray instance; real runs attach
    # to an existing cluster at the given address.
    if args.smoke_test:
        ray.init(num_cpus=int(args.ray_num_workers))
    else:
        ray.init(address=args.ray_address)

    operator_cls = TrainingOperator.from_creators(
        model_creator=model_creator,
        optimizer_creator=optimizer_creator,
        data_creator=data_creator,
        loss_creator=loss_creator)
    trainer = TorchTrainer(
        training_operator_cls=operator_cls,
        use_tqdm=True,
        use_fp16=args.amp,
        apex_args={"opt_level": "O1"},
        config={
            "args": args,
            BATCH_SIZE: args.batch_size
        },
        num_workers=args.ray_num_workers)

    if args.smoke_test:
        args.epochs = 1

    # Smoke tests cap each phase at one step so the run finishes fast.
    step_cap = 1 if args.smoke_test else None
    pbar = trange(args.epochs, unit="epoch")
    for _ in pbar:
        trainer.train(num_steps=step_cap)
        val_stats = trainer.validate(num_steps=step_cap)
        pbar.set_postfix(dict(acc=val_stats["val_accuracy"]))

    trainer.shutdown()
Example #3
0
    return train_loader, validation_loader


def optimizer_creator(model, config):
    """Build the SGD optimizer for ``model``.

    Learning rate and momentum come from ``config``, defaulting to
    0.1 and 0.9 respectively when the keys are absent.
    """
    lr = config.get("lr", 0.1)
    momentum = config.get("momentum", 0.9)
    return torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum)


# Attach to a running cluster unless this is a local smoke test.
ray.init(address=None if args.smoke_test else "auto", log_to_driver=True)
# Smoke tests train on a single worker to stay lightweight.
num_training_workers = 1 if args.smoke_test else 3

CustomTrainingOperator = TrainingOperator.from_creators(
    model_creator=ResNet18,
    optimizer_creator=optimizer_creator,
    data_creator=cifar_creator,
    loss_creator=nn.CrossEntropyLoss)

# Scale the global batch size with the worker count, i.e. 128 per worker.
trainable_config = {
    "test_mode": args.smoke_test,
    BATCH_SIZE: 128 * num_training_workers,
}
TorchTrainable = TorchTrainer.as_trainable(
    training_operator_cls=CustomTrainingOperator,
    initialization_hook=initialization_hook,
    num_workers=num_training_workers,
    config=trainable_config,
    use_gpu=not args.smoke_test,
    backend="gloo",  # This should also work with NCCL
)
Example #4
0
def get_custom_training_operator(lr_reduce_on_plateau=False):
    """Assemble a TrainingOperator from this module's creator functions.

    Args:
        lr_reduce_on_plateau (bool): when True, pass ``scheduler_creator``
            through so a scheduler is attached; otherwise no scheduler.
    """
    scheduler = scheduler_creator if lr_reduce_on_plateau else None
    return TrainingOperator.from_creators(
        model_creator=model_creator,
        optimizer_creator=optimizer_creator,
        data_creator=data_creator,
        loss_creator=nn.MSELoss,
        scheduler_creator=scheduler)
Example #5
0
    return torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.9)

# __torch_scheduler_end__

# __torch_ray_start__
import ray

# Start a local Ray instance for this example.
ray.init()
# or ray.init(address="auto") to connect to a running cluster.
# __torch_ray_end__

# __backwards_compat_start__
from ray.util.sgd import TorchTrainer

# Build a TrainingOperator from the creator-function (legacy) API.
MyTrainingOperator = TrainingOperator.from_creators(
    model_creator=model_creator, optimizer_creator=optimizer_creator,
    loss_creator=loss_creator, scheduler_creator=scheduler_creator,
    data_creator=data_creator)

# "lr" and "batch_size" are forwarded to the creator functions via config.
trainer = TorchTrainer(
    training_operator_cls=MyTrainingOperator,
    scheduler_step_freq="epoch",  # if scheduler_creator is passed in
    config={"lr": 0.001, "batch_size": 64})

# __backwards_compat_end__

# Release the trainer's workers and resources.
trainer.shutdown()

# __torch_trainer_start__
from ray.util.sgd import TorchTrainer

trainer = TorchTrainer(
Example #6
0
    # CLI flags controlling hardware use and LR scheduling.
    parser.add_argument(
        "--use-gpu",
        action="store_true",
        default=False,
        help="Enables GPU training")
    parser.add_argument(
        "--lr-reduce-on-plateau",
        action="store_true",
        default=False,
        help="If enabled, use a ReduceLROnPlateau scheduler. If not set, "
             "no scheduler is used."
    )

    # parse_known_args tolerates extra flags instead of erroring out.
    args, _ = parser.parse_known_args()

    # Smoke tests run on a small local Ray instance; otherwise connect
    # to the cluster at the provided address.
    if args.smoke_test:
        ray.init(num_cpus=3)
    else:
        ray.init(address=args.address)
    # Attach the scheduler only when the plateau flag was set.
    CustomTrainingOperator = TrainingOperator.from_creators(
        model_creator=model_creator, optimizer_creator=optimizer_creator,
        data_creator=data_creator, loss_creator=nn.MSELoss,
        scheduler_creator=scheduler_creator if args.lr_reduce_on_plateau
        else None)
    # Dispatch to the manual-scheduler variant when a scheduler is present.
    if not args.lr_reduce_on_plateau:
        tune_example(CustomTrainingOperator, num_workers=args.num_workers,
                     use_gpu=args.use_gpu)
    else:
        tune_example_manual(CustomTrainingOperator,
                            num_workers=args.num_workers, use_gpu=args.use_gpu)
Example #7
0
    # CLI flags for this example.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--smoke-test", action="store_true", help="Finish quickly for testing")
    parser.add_argument(
        "--address",
        type=str,
        help="the address to use for Ray")
    parser.add_argument(
        "--num-workers",
        "-n",
        type=int,
        default=1,
        help="Sets number of workers for training.")
    parser.add_argument(
        "--use-gpu",
        action="store_true",
        default=False,
        help="Enables GPU training")

    # parse_known_args tolerates extra flags instead of erroring out.
    args, _ = parser.parse_known_args()

    # Smoke tests run on a small local Ray instance; otherwise connect
    # to the cluster at the provided address.
    if args.smoke_test:
        ray.init(num_cpus=2)
    else:
        ray.init(address=args.address)
    # Build the operator from the module's creator functions and hand it
    # to the tuning example.
    CustomTrainingOperator = TrainingOperator.from_creators(
        model_creator=model_creator, optimizer_creator=optimizer_creator,
        data_creator=data_creator, loss_creator=nn.MSELoss)
    tune_example(CustomTrainingOperator, num_workers=args.num_workers,
                 use_gpu=args.use_gpu)
Example #8
0
                              batch_size=config["batch"])
    validation_loader = DataLoader(CIFAR10(root="~/data",
                                           download=True,
                                           transform=tfms),
                                   batch_size=config["batch"])
    return train_loader, validation_loader


def optimizer_creator(model, config):
    """Return an SGD optimizer over ``model``'s parameters.

    The learning rate is read from the required ``config["lr"]`` entry.
    """
    learning_rate = config["lr"]
    return torch.optim.SGD(model.parameters(), lr=learning_rate)


# Compose the operator from plain creator functions.
CustomTrainingOperator = TrainingOperator.from_creators(
    model_creator=ResNet18,  # A function that returns a nn.Module
    optimizer_creator=optimizer_creator,  # A function that returns an optimizer
    data_creator=cifar_creator,  # A function that returns dataloaders
    loss_creator=torch.nn.CrossEntropyLoss  # A loss function
)

ray.init()

# Hyperparameters forwarded to the creator functions via config.
hyperparameters = {
    "lr": 0.01,  # used in optimizer_creator
    "batch": 64  # used in data_creator
}
trainer = TorchTrainer(
    training_operator_cls=CustomTrainingOperator,
    config=hyperparameters,
    num_workers=2,  # amount of parallelism
    use_gpu=torch.cuda.is_available(),
    use_tqdm=True)