def main():
    setup_default_logging()
    args, args_text = parse_args()

    if args.smoke_test:
        ray.init(num_cpus=int(args.ray_num_workers))
    else:
        ray.init(address=args.ray_address)

    CustomTrainingOperator = TrainingOperator.from_creators(
        model_creator=model_creator,
        optimizer_creator=optimizer_creator,
        data_creator=data_creator,
        loss_creator=loss_creator)
    trainer = TorchTrainer(
        training_operator_cls=CustomTrainingOperator,
        use_tqdm=True,
        use_fp16=args.amp,
        apex_args={"opt_level": "O1"},
        config={
            "args": args,
            BATCH_SIZE: args.batch_size
        },
        num_workers=args.ray_num_workers)

    if args.smoke_test:
        args.epochs = 1

    pbar = trange(args.epochs, unit="epoch")
    for i in pbar:
        # Run one epoch of training followed by validation; a smoke test
        # only runs a single step of each.
        trainer.train(num_steps=1 if args.smoke_test else None)
        val_stats = trainer.validate(num_steps=1 if args.smoke_test else None)
        pbar.set_postfix(dict(acc=val_stats["val_accuracy"]))

    trainer.shutdown()
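# The main() function above assumes a parse_args() helper exposing the flags
# it references (smoke_test, ray_num_workers, ray_address, amp, batch_size,
# epochs). A minimal sketch of such a helper is shown below; the flag names
# and defaults here are illustrative assumptions, and the real script's
# parser may define many more options.
import argparse


def parse_args():
    parser = argparse.ArgumentParser(description="Ray SGD training example")
    parser.add_argument("--smoke-test", action="store_true",
                        help="Run a single abbreviated epoch for testing.")
    parser.add_argument("--ray-num-workers", type=int, default=1,
                        help="Number of Ray training workers.")
    parser.add_argument("--ray-address", type=str, default=None,
                        help="Address of an existing Ray cluster.")
    parser.add_argument("--amp", action="store_true",
                        help="Enable mixed-precision (fp16) training.")
    parser.add_argument("--batch-size", type=int, default=32)
    parser.add_argument("--epochs", type=int, default=10)
    args = parser.parse_args()
    # Return the parsed namespace plus a plain-text dump of the arguments.
    return args, str(vars(args))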
def train_example(num_workers=1, use_gpu=False):
    trainer1 = TorchTrainer(
        model_creator=model_creator,
        data_creator=data_creator,
        optimizer_creator=optimizer_creator,
        loss_creator=nn.MSELoss,
        scheduler_creator=scheduler_creator,
        num_workers=num_workers,
        use_gpu=use_gpu,
        config={
            "lr": 1e-2,  # used in optimizer_creator
            "hidden_size": 1,  # used in model_creator
            "batch_size": 4,  # used in data_creator
        },
        backend="gloo",
        scheduler_step_freq="epoch")
    for i in range(5):
        stats = trainer1.train()
        print(stats)

    print(trainer1.validate())
    m = trainer1.get_model()
    print("trained weight: % .2f, bias: % .2f" % (
        m.weight.item(), m.bias.item()))
    trainer1.shutdown()
    print("success!")
def test_save_and_restore(ray_start_2_cpus, num_workers, use_local,
                          tmp_path):  # noqa: F811
    trainer1 = TorchTrainer(
        training_operator_cls=Operator,
        num_workers=num_workers,
        use_local=use_local)
    trainer1.train()
    checkpoint_path = os.path.join(tmp_path, "checkpoint")
    trainer1.save(checkpoint_path)

    model1 = trainer1.get_model()
    ints1 = trainer1.apply_all_operators(
        lambda op: op.get_model().rand_int)[0]

    trainer1.shutdown()

    trainer2 = TorchTrainer(
        training_operator_cls=Operator,
        num_workers=num_workers,
        use_local=use_local)
    trainer2.load(checkpoint_path)

    model2 = trainer2.get_model()
    ints2 = trainer2.apply_all_operators(
        lambda op: op.get_model().rand_int)

    model1_state_dict = model1.state_dict()
    model2_state_dict = model2.state_dict()

    assert set(model1_state_dict.keys()) == set(model2_state_dict.keys())

    for k in model1_state_dict:
        assert torch.equal(model1_state_dict[k], model2_state_dict[k])

    for i in ints2:
        assert i == ints1

    trainer2.shutdown()
def test_correctness(ray_start_2_cpus, num_workers, use_local):
    layer = nn.Linear(1, 1)
    ptl_op = TrainingOperator.from_ptl(PTL_Module)
    trainer1 = TorchTrainer(
        training_operator_cls=ptl_op,
        config={
            "layer": layer,
            "data_size": 3,
            "batch_size": 1
        },
        num_workers=num_workers,
        use_local=use_local)
    train1_stats = trainer1.train()
    val1_stats = trainer1.validate()
    trainer1.shutdown()

    trainer2 = TorchTrainer(
        training_operator_cls=CorrectnessOperator,
        scheduler_step_freq="manual",
        config={
            "layer": layer,
            "data_size": 3,
            "batch_size": 1
        },
        num_workers=num_workers,
        use_local=use_local)
    train2_stats = trainer2.train()
    val2_stats = trainer2.validate()
    trainer2.shutdown()

    assert train1_stats["train_loss"] == train2_stats["train_loss"]
    assert val1_stats["val_loss"] == val2_stats["val_loss"]
    assert val1_stats["val_acc"] == val2_stats["val_accuracy"]
def train_example(num_workers=1, use_gpu=False):
    CustomTrainingOperator = TrainingOperator.from_creators(
        model_creator=model_creator,
        optimizer_creator=optimizer_creator,
        data_creator=data_creator,
        scheduler_creator=scheduler_creator,
        loss_creator=nn.MSELoss)
    trainer1 = TorchTrainer(
        training_operator_cls=CustomTrainingOperator,
        num_workers=num_workers,
        use_gpu=use_gpu,
        config={
            "lr": 1e-2,  # used in optimizer_creator
            "hidden_size": 1,  # used in model_creator
            "batch_size": 4,  # used in data_creator
        },
        backend="gloo",
        scheduler_step_freq="epoch")
    for i in range(5):
        stats = trainer1.train()
        print(stats)

    print(trainer1.validate())

    # If using Ray Client, make sure to force model onto CPU.
    import ray
    m = trainer1.get_model(to_cpu=ray.util.client.ray.is_connected())
    print("trained weight: % .2f, bias: % .2f" % (
        m.weight.item(), m.bias.item()))
    trainer1.shutdown()
    print("success!")
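# The train_example functions above assume creator functions along these
# lines. This is a minimal sketch for a toy linear-regression task; the
# dataset, layer sizes, and scheduler choice are illustrative assumptions,
# not the examples' actual definitions.
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset


def model_creator(config):
    # "hidden_size" comes from the trainer config above.
    return nn.Linear(1, config.get("hidden_size", 1))


def optimizer_creator(model, config):
    # "lr" comes from the trainer config above.
    return torch.optim.SGD(model.parameters(), lr=config.get("lr", 1e-2))


def data_creator(config):
    # A tiny synthetic y = 2x + 5 regression dataset, returned as
    # (train_loader, validation_loader).
    x = torch.randn(1000, 1)
    y = 2 * x + 5
    dataset = TensorDataset(x, y)
    batch_size = config.get("batch_size", 4)
    return (DataLoader(dataset, batch_size=batch_size),
            DataLoader(dataset, batch_size=batch_size))


def scheduler_creator(optimizer, config):
    # Stepped once per epoch because scheduler_step_freq="epoch" is passed.
    return torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.9)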
def run(num_workers, use_gpu, num_epochs, lr, batch_size, n_hidden, n_layers,
        n_heads, fan_out, feat_drop, attn_drop, negative_slope,
        sampling_num_workers):
    trainer = TorchTrainer(
        training_operator_cls=CustomTrainingOperator,
        num_workers=num_workers,
        use_gpu=use_gpu,
        backend="nccl",
        config={
            "lr": lr,
            "batch_size": batch_size,
            "n_hidden": n_hidden,
            "n_layers": n_layers,
            "n_heads": n_heads,
            "fan_out": fan_out,
            "feat_drop": feat_drop,
            "attn_drop": attn_drop,
            "negative_slope": negative_slope,
            "sampling_num_workers": sampling_num_workers
        })
    for i in range(num_epochs):
        trainer.train()
    validation_results = trainer.validate()
    trainer.shutdown()
    print(validation_results)
    print("success!")
def test_single_step(ray_start_2_cpus, use_local):  # noqa: F811
    trainer = TorchTrainer(
        training_operator_cls=Operator,
        num_workers=1,
        use_local=use_local,
        use_gpu=False)
    metrics = trainer.train(num_steps=1)
    assert metrics[BATCH_COUNT] == 1

    val_metrics = trainer.validate(num_steps=1)
    assert val_metrics[BATCH_COUNT] == 1
    trainer.shutdown()
def test_train(ray_start_2_cpus, num_workers, use_local):  # noqa: F811
    trainer = TorchTrainer(
        training_operator_cls=Operator,
        num_workers=num_workers,
        use_local=use_local,
        use_gpu=False)
    for i in range(3):
        train_loss1 = trainer.train()["train_loss"]
    validation_loss1 = trainer.validate()["val_loss"]

    for i in range(3):
        train_loss2 = trainer.train()["train_loss"]
    validation_loss2 = trainer.validate()["val_loss"]

    assert train_loss2 <= train_loss1, (train_loss2, train_loss1)
    assert validation_loss2 <= validation_loss1, (validation_loss2,
                                                  validation_loss1)
    trainer.shutdown()
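# The tests above reference an `Operator` training operator defined by the
# test harness. A minimal sketch of what such an operator might look like,
# assuming a toy linear-regression setup (the real harness may differ, and
# the save/restore test additionally relies on a `rand_int` attribute on the
# model that is not reproduced here):
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from ray.util.sgd.torch import TrainingOperator


class Operator(TrainingOperator):
    def setup(self, config):
        model = nn.Linear(1, 1)
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
        # Small synthetic regression dataset shared by train and validation.
        x = torch.randn(64, 1)
        dataset = TensorDataset(x, 2 * x + 1)
        loader = DataLoader(dataset, batch_size=config.get("batch_size", 4))
        self.model, self.optimizer, self.criterion = self.register(
            models=model, optimizers=optimizer, criterion=nn.MSELoss())
        self.register_data(train_loader=loader, validation_loader=loader)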
def train_mnist(num_workers=1, use_gpu=False, num_epochs=5):
    Operator = TrainingOperator.from_ptl(LitMNIST)
    trainer = TorchTrainer(
        training_operator_cls=Operator,
        num_workers=num_workers,
        config={"lr": 1e-3, "batch_size": 64},
        use_gpu=use_gpu,
        use_tqdm=True,
    )
    for i in range(num_epochs):
        stats = trainer.train()
        print(stats)

    print(trainer.validate())
    print("Saving model checkpoint to ./model.pt")
    trainer.save("./model.pt")
    print("Model Checkpointed!")
    trainer.shutdown()
    print("success!")
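# A minimal driver for the Lightning-based example above, assuming it lives
# in a script that is run directly. The flag names here are illustrative
# assumptions, not the example's actual command-line interface.
if __name__ == "__main__":
    import argparse
    import ray

    parser = argparse.ArgumentParser()
    parser.add_argument("--num-workers", type=int, default=1)
    parser.add_argument("--use-gpu", action="store_true")
    parser.add_argument("--num-epochs", type=int, default=5)
    args = parser.parse_args()

    ray.init()
    train_mnist(
        num_workers=args.num_workers,
        use_gpu=args.use_gpu,
        num_epochs=args.num_epochs)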
# __torch_ray_end__

# __backwards_compat_start__
from ray.util.sgd import TorchTrainer

MyTrainingOperator = TrainingOperator.from_creators(
    model_creator=model_creator,
    optimizer_creator=optimizer_creator,
    loss_creator=loss_creator,
    scheduler_creator=scheduler_creator,
    data_creator=data_creator)

trainer = TorchTrainer(
    training_operator_cls=MyTrainingOperator,
    scheduler_step_freq="epoch",  # if scheduler_creator is passed in
    config={"lr": 0.001, "batch_size": 64})
# __backwards_compat_end__

trainer.shutdown()

# __torch_trainer_start__
from ray.util.sgd import TorchTrainer

trainer = TorchTrainer(
    training_operator_cls=MyTrainingOperator,
    scheduler_step_freq="epoch",  # if scheduler is used
    config={"lr": 0.001, "batch_size": 64})
# __torch_trainer_end__

trainer.shutdown()
# The registration calls below run inside a custom TrainingOperator's
# setup() method; the enclosing class definition and the data-loading code
# that builds train_loader/val_loader are not shown in this excerpt.

# Create optimizer.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
# Create loss.
loss = torch.nn.MSELoss()
# Register model, optimizer, and loss.
self.model, self.optimizer, self.criterion = self.register(
    models=model, optimizers=optimizer, criterion=loss)
# Register data loaders.
self.register_data(
    train_loader=train_loader, validation_loader=val_loader)

ray.init(address='auto')

trainer1 = TorchTrainer(
    training_operator_cls=CustomTrainingOperator,
    num_workers=2,
    use_gpu=False,
    config={"batch_size": 64})

stats = trainer1.train()
print(stats)
trainer1.shutdown()
print("success!")
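# A fuller sketch of the operator class the fragment above belongs to,
# assuming a toy linear model and a synthetic y = 2x + 5 dataset (both are
# illustrative assumptions; only the registration calls come from the
# original excerpt):
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from ray.util.sgd.torch import TrainingOperator


class CustomTrainingOperator(TrainingOperator):
    def setup(self, config):
        # Create data loaders.
        x = torch.randn(256, 1)
        dataset = TensorDataset(x, 2 * x + 5)
        train_loader = DataLoader(dataset, batch_size=config["batch_size"])
        val_loader = DataLoader(dataset, batch_size=config["batch_size"])
        # Create model.
        model = nn.Linear(1, 1)
        # Create optimizer.
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
        # Create loss.
        loss = nn.MSELoss()
        # Register model, optimizer, and loss.
        self.model, self.optimizer, self.criterion = self.register(
            models=model, optimizers=optimizer, criterion=loss)
        # Register data loaders.
        self.register_data(
            train_loader=train_loader, validation_loader=val_loader)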
def main(args):
    if args.smoke_test:
        ray.init(num_cpus=4)
    else:
        ray.init(
            address=args.address,
            num_cpus=args.num_workers,
            log_to_driver=True)

    # Trainer initialization.
    trainer = TorchTrainer(
        training_operator_cls=CIFAR10Module,
        num_workers=args.num_workers,
        config={
            "lr": args.learning_rate,
            "lr_decay": args.lr_decay,
            "eps": args.eps,
            "momentum": args.momentum,
            "wd": args.wd,
            "data_dir": args.data_dir,
            "batch_size": args.batch_size,
            "num_workers": args.num_workers,
            "smoke_test": args.smoke_test
        },
        use_gpu=args.use_gpu,
        scheduler_step_freq="epoch",
        use_fp16=args.fp16,
        use_tqdm=False)

    train_loss = []
    val_loss = []
    val_acc = []

    # Create the checkpoint directory for this model/trial if needed.
    path = os.path.join("/root/volume/Paper/MLVC_Internship",
                        args.checkpoint_dir,
                        args.model_name + "_" + str(args.trial))
    if not os.path.exists(path):
        os.mkdir(path)

    pbar = trange(args.max_epochs, unit="epoch")
    for it in pbar:
        stats = trainer.train(
            max_retries=1,
            info=dict(epoch_idx=it, num_epochs=args.max_epochs))
        train_loss.append(stats["train_loss"])

        val_stats = trainer.validate()
        val_loss.append(val_stats["val_loss"])
        pbar.set_postfix(dict(acc=val_stats["val_accuracy"]))

        # Save the trainer checkpoint plus per-epoch loss/accuracy histories.
        trainer.save(
            "/root/volume/Paper/MLVC_Internship/checkpoint/{}_{}/epoch_{}.ray"
            .format(args.model_name, args.trial, it))
        torch.save(
            [train_loss, val_loss],
            "/root/volume/Paper/MLVC_Internship/checkpoint/{}_{}/epoch_{}.loss"
            .format(args.model_name, args.trial, it))
        torch.save(
            [val_acc],
            "/root/volume/Paper/MLVC_Internship/checkpoint/{}_{}/epoch_{}.acc"
            .format(args.model_name, args.trial, it))
        print(val_stats)

    trainer.shutdown()
    print("success!")