def train_example(num_workers=1, use_gpu=False):
    """Train a small regression model with Ray SGD and print the results.

    Args:
        num_workers (int): number of parallel training workers.
        use_gpu (bool): whether the workers should train on GPUs.
    """
    operator_cls = TrainingOperator.from_creators(
        model_creator=model_creator,
        optimizer_creator=optimizer_creator,
        data_creator=data_creator,
        scheduler_creator=scheduler_creator,
        loss_creator=nn.MSELoss)
    trainer = TorchTrainer(
        training_operator_cls=operator_cls,
        num_workers=num_workers,
        use_gpu=use_gpu,
        config={
            "lr": 1e-2,  # used in optimizer_creator
            "hidden_size": 1,  # used in model_creator
            "batch_size": 4,  # used in data_creator
        },
        backend="gloo",
        scheduler_step_freq="epoch")

    # Five epochs of training, echoing the per-epoch stats as we go.
    for _ in range(5):
        print(trainer.train())
    print(trainer.validate())

    # The fitted model is a single-linear-layer module here (it exposes
    # .weight and .bias directly) — presumably nn.Linear; confirm against
    # model_creator.
    trained_model = trainer.get_model()
    print("trained weight: % .2f, bias: % .2f" % (
        trained_model.weight.item(), trained_model.bias.item()))
    trainer.shutdown()
    print("success!")
def main():
    """Entry point: parse CLI args, start Ray, and run distributed training.

    Trains for ``args.epochs`` epochs (forced to 1 under ``--smoke-test``)
    with a tqdm progress bar, showing validation accuracy after each epoch.
    """
    setup_default_logging()
    args, args_text = parse_args()

    # Smoke tests spin up a small local Ray instance; otherwise attach to
    # the cluster at the supplied address.
    if args.smoke_test:
        ray.init(num_cpus=int(args.ray_num_workers))
    else:
        ray.init(address=args.ray_address)

    operator_cls = TrainingOperator.from_creators(
        model_creator=model_creator,
        optimizer_creator=optimizer_creator,
        data_creator=data_creator,
        loss_creator=loss_creator)
    trainer = TorchTrainer(
        training_operator_cls=operator_cls,
        use_tqdm=True,
        use_fp16=args.amp,
        apex_args={"opt_level": "O1"},
        config={
            "args": args,
            BATCH_SIZE: args.batch_size
        },
        num_workers=args.ray_num_workers)

    if args.smoke_test:
        args.epochs = 1

    # Under --smoke-test only one step per phase is executed.
    step_limit = 1 if args.smoke_test else None
    progress = trange(args.epochs, unit="epoch")
    for _epoch in progress:
        trainer.train(num_steps=step_limit)
        metrics = trainer.validate(num_steps=step_limit)
        progress.set_postfix({"acc": metrics["val_accuracy"]})
    trainer.shutdown()
    # Tail of the data-creator function whose `def` line is above this
    # chunk: hand both loaders back to the training operator.
    return train_loader, validation_loader


def optimizer_creator(model, config):
    """Returns optimizer

    Builds SGD over the model's parameters; ``lr`` and ``momentum`` come
    from the trainer config, with defaults of 0.1 and 0.9.
    """
    return torch.optim.SGD(
        model.parameters(),
        lr=config.get("lr", 0.1),
        momentum=config.get("momentum", 0.9))


# Attach to a running cluster unless this is a smoke test, in which case a
# fresh local instance is started (address=None).
ray.init(address="auto" if not args.smoke_test else None, log_to_driver=True)
num_training_workers = 1 if args.smoke_test else 3
CustomTrainingOperator = TrainingOperator.from_creators(
    model_creator=ResNet18,
    optimizer_creator=optimizer_creator,
    data_creator=cifar_creator,
    loss_creator=nn.CrossEntropyLoss)
# Expose the trainer as a Tune-compatible Trainable class.  The global
# batch size scales with the worker count (128 per worker — presumably so
# each worker keeps a fixed local batch; confirm against cifar_creator).
TorchTrainable = TorchTrainer.as_trainable(
    training_operator_cls=CustomTrainingOperator,
    initialization_hook=initialization_hook,
    num_workers=num_training_workers,
    config={
        "test_mode": args.smoke_test,
        BATCH_SIZE: 128 * num_training_workers,
    },
    use_gpu=not args.smoke_test,
    backend="gloo",  # This should also work with NCCL
)
def get_custom_training_operator(lr_reduce_on_plateau=False):
    """Build a TrainingOperator from the module-level creator functions.

    Args:
        lr_reduce_on_plateau (bool): when True, wire in the module-level
            ``scheduler_creator``; when False, no scheduler is attached.
    """
    if lr_reduce_on_plateau:
        scheduler = scheduler_creator
    else:
        scheduler = None
    return TrainingOperator.from_creators(
        model_creator=model_creator,
        optimizer_creator=optimizer_creator,
        data_creator=data_creator,
        loss_creator=nn.MSELoss,
        scheduler_creator=scheduler)
    # Tail of the scheduler-creator function defined above this chunk:
    # multiply the learning rate by 0.9 every 5 scheduler steps.
    return torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.9)
# __torch_scheduler_end__

# NOTE(review): the __..._start__/__..._end__ comment pairs look like
# documentation literalinclude anchors — do not remove or rename them.
# __torch_ray_start__
import ray

ray.init()
# or ray.init(address="auto") to connect to a running cluster.
# __torch_ray_end__

# __backwards_compat_start__
from ray.util.sgd import TorchTrainer

MyTrainingOperator = TrainingOperator.from_creators(
    model_creator=model_creator,
    optimizer_creator=optimizer_creator,
    loss_creator=loss_creator,
    scheduler_creator=scheduler_creator,
    data_creator=data_creator)
trainer = TorchTrainer(
    training_operator_cls=MyTrainingOperator,
    scheduler_step_freq="epoch",  # if scheduler_creator is passed in
    config={"lr": 0.001, "batch_size": 64})
# __backwards_compat_end__
trainer.shutdown()

# __torch_trainer_start__
from ray.util.sgd import TorchTrainer

# NOTE(review): this call is truncated here — its remaining keyword
# arguments continue past the end of this chunk.
trainer = TorchTrainer(
# CLI flags appended to the parser defined above this chunk.
parser.add_argument(
    "--use-gpu",
    action="store_true",
    default=False,
    help="Enables GPU training")
parser.add_argument(
    "--lr-reduce-on-plateau",
    action="store_true",
    default=False,
    help="If enabled, use a ReduceLROnPlateau scheduler. If not set, "
    "no scheduler is used.")
args, _ = parser.parse_known_args()

# A small local Ray instance suffices for the smoke test; otherwise attach
# to the cluster at the supplied address.
init_kwargs = {"num_cpus": 3} if args.smoke_test else {"address": args.address}
ray.init(**init_kwargs)

# Only attach a scheduler when the plateau flag is set.
scheduler = scheduler_creator if args.lr_reduce_on_plateau else None
CustomTrainingOperator = TrainingOperator.from_creators(
    model_creator=model_creator,
    optimizer_creator=optimizer_creator,
    data_creator=data_creator,
    loss_creator=nn.MSELoss,
    scheduler_creator=scheduler)

# Dispatch: the *_manual variant runs when the plateau scheduler is
# enabled, the plain variant otherwise.
run_example = tune_example_manual if args.lr_reduce_on_plateau else tune_example
run_example(CustomTrainingOperator,
            num_workers=args.num_workers,
            use_gpu=args.use_gpu)
# Script entry: parse CLI flags, start Ray, and launch the tuning example
# with a creator-based TrainingOperator.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--smoke-test",
    action="store_true",
    help="Finish quickly for testing")
parser.add_argument(
    "--address",
    type=str,
    help="the address to use for Ray")
parser.add_argument(
    "--num-workers",
    "-n",
    type=int,
    default=1,
    help="Sets number of workers for training.")
parser.add_argument(
    "--use-gpu",
    action="store_true",
    default=False,
    help="Enables GPU training")
args, _ = parser.parse_known_args()

# Two local CPUs are enough for the smoke test; otherwise attach to the
# cluster at the supplied address.
init_kwargs = {"num_cpus": 2} if args.smoke_test else {"address": args.address}
ray.init(**init_kwargs)

CustomTrainingOperator = TrainingOperator.from_creators(
    model_creator=model_creator,
    optimizer_creator=optimizer_creator,
    data_creator=data_creator,
    loss_creator=nn.MSELoss)
tune_example(CustomTrainingOperator,
             num_workers=args.num_workers,
             use_gpu=args.use_gpu)
                             batch_size=config["batch"])
    # Tail of the data-creator function whose `def` line is above this
    # chunk: build the CIFAR10 validation loader and return both loaders.
    validation_loader = DataLoader(
        CIFAR10(root="~/data", download=True, transform=tfms),
        batch_size=config["batch"])
    return train_loader, validation_loader


def optimizer_creator(model, config):
    """Returns an optimizer (or multiple)

    Plain SGD over the model's parameters; the learning rate comes from
    the trainer config ("lr", required).
    """
    return torch.optim.SGD(model.parameters(), lr=config["lr"])


CustomTrainingOperator = TrainingOperator.from_creators(
    model_creator=ResNet18,  # A function that returns a nn.Module
    optimizer_creator=optimizer_creator,  # A function that returns an optimizer
    data_creator=cifar_creator,  # A function that returns dataloaders
    loss_creator=torch.nn.CrossEntropyLoss  # A loss function
)

ray.init()

trainer = TorchTrainer(
    training_operator_cls=CustomTrainingOperator,
    config={
        "lr": 0.01,  # used in optimizer_creator
        "batch": 64  # used in data_creator
    },
    num_workers=2,  # amount of parallelism
    use_gpu=torch.cuda.is_available(),
    use_tqdm=True)