Example #1
def test_correctness(ray_start_2_cpus, num_workers, use_local):
    layer = nn.Linear(1, 1)
    ptl_op = TrainingOperator.from_ptl(PTL_Module)
    trainer1 = TorchTrainer(training_operator_cls=ptl_op,
                            config={
                                "layer": layer,
                                "data_size": 3,
                                "batch_size": 1
                            },
                            num_workers=num_workers,
                            use_local=use_local)
    train1_stats = trainer1.train()
    val1_stats = trainer1.validate()
    trainer1.shutdown()

    trainer2 = TorchTrainer(training_operator_cls=CorrectnessOperator,
                            scheduler_step_freq="manual",
                            config={
                                "layer": layer,
                                "data_size": 3,
                                "batch_size": 1
                            },
                            num_workers=num_workers,
                            use_local=use_local)
    train2_stats = trainer2.train()
    val2_stats = trainer2.validate()
    trainer2.shutdown()

    assert train1_stats["train_loss"] == train2_stats["train_loss"]
    assert val1_stats["val_loss"] == val2_stats["val_loss"]
    assert val1_stats["val_accuracy"] == val2_stats["val_accuracy"]
Example #2
def train_example(num_workers=1, use_gpu=False):
    trainer1 = TorchTrainer(
        model_creator=model_creator,
        data_creator=data_creator,
        optimizer_creator=optimizer_creator,
        loss_creator=nn.MSELoss,
        scheduler_creator=scheduler_creator,
        num_workers=num_workers,
        use_gpu=use_gpu,
        config={
            "lr": 1e-2,  # used in optimizer_creator
            "hidden_size": 1,  # used in model_creator
            "batch_size": 4,  # used in data_creator
        },
        backend="gloo",
        scheduler_step_freq="epoch")
    for i in range(5):
        stats = trainer1.train()
        print(stats)

    print(trainer1.validate())
    m = trainer1.get_model()
    print("trained weight: % .2f, bias: % .2f" % (
        m.weight.item(), m.bias.item()))
    trainer1.shutdown()
    print("success!")
Example #3
def main():
    setup_default_logging()

    args, args_text = parse_args()
    if args.smoke_test:
        ray.init(num_cpus=int(args.ray_num_workers))
    else:
        ray.init(address=args.ray_address)

    CustomTrainingOperator = TrainingOperator.from_creators(
        model_creator=model_creator,
        optimizer_creator=optimizer_creator,
        data_creator=data_creator,
        loss_creator=loss_creator)
    trainer = TorchTrainer(training_operator_cls=CustomTrainingOperator,
                           use_tqdm=True,
                           use_fp16=args.amp,
                           apex_args={"opt_level": "O1"},
                           config={
                               "args": args,
                               BATCH_SIZE: args.batch_size
                           },
                           num_workers=args.ray_num_workers)

    if args.smoke_test:
        args.epochs = 1

    pbar = trange(args.epochs, unit="epoch")
    for i in pbar:
        trainer.train(num_steps=1 if args.smoke_test else None)

        val_stats = trainer.validate(num_steps=1 if args.smoke_test else None)
        pbar.set_postfix(dict(acc=val_stats["val_accuracy"]))

    trainer.shutdown()
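
# parse_args is not shown in this example. A minimal argparse sketch covering
# only the flags read above (the flag names are inferred, not the originals;
# the second return value stands in for the args_text string):

import argparse


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--smoke-test", action="store_true")
    parser.add_argument("--ray-num-workers", type=int, default=1)
    parser.add_argument("--ray-address", type=str, default=None)
    parser.add_argument("--amp", action="store_true")
    parser.add_argument("--batch-size", type=int, default=32)
    parser.add_argument("--epochs", type=int, default=10)
    args = parser.parse_args()
    return args, str(vars(args))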
Example #4
def main(args):
    os.makedirs(args.output_dir, exist_ok=True)

    print(args)
    start_time = time.time()
    config = {"args": args, "num_workers": args.num_workers}
    trainer = TorchTrainer(training_operator_cls=SegOperator,
                           use_tqdm=True,
                           use_fp16=True,
                           num_workers=config["num_workers"],
                           config=config,
                           use_gpu=torch.cuda.is_available())

    for epoch in range(args.epochs):
        trainer.train()
        confmat = trainer.validate(reduce_results=False)[0]
        print(confmat)
        state_dict = trainer.state_dict()
        state_dict.update(epoch=epoch, args=args)
        torch.save(state_dict,
                   os.path.join(args.output_dir, f"model_{epoch}.pth"))

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print(f"Training time {total_time_str}")
Example #5
def train_example(num_workers=1, use_gpu=False):
    CustomTrainingOperator = TrainingOperator.from_creators(
        model_creator=model_creator,
        optimizer_creator=optimizer_creator,
        data_creator=data_creator,
        scheduler_creator=scheduler_creator,
        loss_creator=nn.MSELoss)
    trainer1 = TorchTrainer(
        training_operator_cls=CustomTrainingOperator,
        num_workers=num_workers,
        use_gpu=use_gpu,
        config={
            "lr": 1e-2,  # used in optimizer_creator
            "hidden_size": 1,  # used in model_creator
            "batch_size": 4,  # used in data_creator
        },
        backend="gloo",
        scheduler_step_freq="epoch")
    for i in range(5):
        stats = trainer1.train()
        print(stats)

    print(trainer1.validate())

    # If using Ray Client, make sure to force model onto CPU.
    import ray
    m = trainer1.get_model(to_cpu=ray.util.client.ray.is_connected())
    print("trained weight: % .2f, bias: % .2f" %
          (m.weight.item(), m.bias.item()))
    trainer1.shutdown()
    print("success!")
Example #6
def run(num_workers, use_gpu, num_epochs, lr, batch_size, n_hidden, n_layers,
        n_heads, fan_out, feat_drop, attn_drop, negative_slope,
        sampling_num_workers):
    trainer = TorchTrainer(training_operator_cls=CustomTrainingOperator,
                           num_workers=num_workers,
                           use_gpu=use_gpu,
                           backend="nccl",
                           config={
                               "lr": lr,
                               "batch_size": batch_size,
                               "n_hidden": n_hidden,
                               "n_layers": n_layers,
                               "n_heads": n_heads,
                               "fan_out": fan_out,
                               "feat_drop": feat_drop,
                               "attn_drop": attn_drop,
                               "negative_slope": negative_slope,
                               "sampling_num_workers": sampling_num_workers
                           })

    for i in range(num_epochs):
        trainer.train()
    validation_results = trainer.validate()
    trainer.shutdown()
    print(validation_results)
    print("success!")
Example #7
def test_train(ray_start_2_cpus, num_workers, use_local):  # noqa: F811
    trainer = TorchTrainer(training_operator_cls=Operator,
                           num_workers=num_workers,
                           use_local=use_local,
                           use_gpu=False)
    for i in range(3):
        train_loss1 = trainer.train()["train_loss"]
    validation_loss1 = trainer.validate()["val_loss"]

    for i in range(3):
        train_loss2 = trainer.train()["train_loss"]
    validation_loss2 = trainer.validate()["val_loss"]

    assert train_loss2 <= train_loss1, (train_loss2, train_loss1)
    assert validation_loss2 <= validation_loss1, (validation_loss2,
                                                  validation_loss1)
    trainer.shutdown()
Example #8
def test_single_step(ray_start_2_cpus, use_local):  # noqa: F811
    trainer = TorchTrainer(training_operator_cls=Operator,
                           num_workers=1,
                           use_local=use_local,
                           use_gpu=False)
    metrics = trainer.train(num_steps=1)
    assert metrics[BATCH_COUNT] == 1

    val_metrics = trainer.validate(num_steps=1)
    assert val_metrics[BATCH_COUNT] == 1
    trainer.shutdown()
Example #9
def train_mnist(num_workers=1, use_gpu=False, num_epochs=5):
    Operator = TrainingOperator.from_ptl(LitMNIST)
    trainer = TorchTrainer(
        training_operator_cls=Operator,
        num_workers=num_workers,
        config={"lr": 1e-3, "batch_size": 64},
        use_gpu=use_gpu,
        use_tqdm=True,
    )
    for i in range(num_epochs):
        stats = trainer.train()
        print(stats)

    print(trainer.validate())
    print("Saving model checkpoint to ./model.pt")
    trainer.save("./model.pt")
    print("Model Checkpointed!")
    trainer.shutdown()
    print("success!")
Example #10
        # Register all of these components with Ray SGD.
        # This allows Ray SGD to do framework level setup like Cuda, DDP,
        # Distributed Sampling, FP16.
        # We also assign the return values of self.register to instance
        # attributes so we can access it in our custom training/validation
        # methods.
        self.model, self.optimizer, self.criterion, self.scheduler = \
            self.register(models=model, optimizers=optimizer,
                          criterion=criterion,
                          schedulers=scheduler)


# Initialize Ray locally, or use ray.init(address="auto") to connect to a
# running cluster.
ray.init()

# Use TorchTrainer to distribute training of the operator across Ray workers.
trainer = TorchTrainer(
    training_operator_cls=MyTrainingOperator,
    scheduler_step_freq="epoch",  # if scheduler is used
    config={
        "lr": 0.001,
        "batch_size": 64
    },
    num_workers=100,
    use_gpu=True)

for i in range(10):
    metrics = trainer.train()
    val_metrics = trainer.validate()
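
# The register call shown above lives inside TrainingOperator.setup(). A
# minimal sketch of the surrounding operator (the model, data, and scheduler
# choices are illustrative, not the original MyTrainingOperator):

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from ray.util.sgd.torch import TrainingOperator


class MyTrainingOperator(TrainingOperator):
    def setup(self, config):
        model = nn.Linear(1, 1)
        optimizer = torch.optim.SGD(model.parameters(), lr=config["lr"])
        criterion = nn.MSELoss()
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)

        self.model, self.optimizer, self.criterion, self.scheduler = \
            self.register(models=model, optimizers=optimizer,
                          criterion=criterion, schedulers=scheduler)

        dataset = TensorDataset(torch.randn(256, 1), torch.randn(256, 1))
        loader = DataLoader(dataset, batch_size=config["batch_size"])
        self.register_data(train_loader=loader, validation_loader=loader)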
Example #11
def main(args):
    if args.smoke_test:
        ray.init(num_cpus=4)
    else:
        ray.init(address=args.address,
                 num_cpus=args.num_workers,
                 log_to_driver=True)

    # Trainer Initialization
    trainer = TorchTrainer(training_operator_cls=CIFAR10Module,
                           num_workers=args.num_workers,
                           config={
                               "lr": args.learning_rate,
                               "lr_decay": args.lr_decay,
                               "eps": args.eps,
                               "momentum": args.momentum,
                               "wd": args.wd,
                               "data_dir": args.data_dir,
                               "batch_size": args.batch_size,
                               "num_workers": args.num_workers,
                               "smoke_test": args.smoke_test
                           },
                           use_gpu=args.use_gpu,
                           scheduler_step_freq="epoch",
                           use_fp16=args.fp16,
                           use_tqdm=False)

    train_loss = []
    val_loss = []
    val_acc = []

    path = os.path.join("/root/volume/Paper/MLVC_Internship",
                        args.checkpoint_dir,
                        args.model_name + "_" + str(args.trial))
    os.makedirs(path, exist_ok=True)

    pbar = trange(args.max_epochs, unit="epoch")
    for it in pbar:
        stats = trainer.train(max_retries=1,
                              info=dict(epoch_idx=it,
                                        num_epochs=args.max_epochs))
        train_loss.append(stats["train_loss"])
        val_stats = trainer.validate()
        val_loss.append(val_stats["val_loss"])
        val_acc.append(val_stats["val_accuracy"])
        pbar.set_postfix(dict(acc=val_stats["val_accuracy"]))

        # Save into the checkpoint directory created above.
        trainer.save(os.path.join(path, f"epoch_{it}.ray"))
        torch.save([train_loss, val_loss],
                   os.path.join(path, f"epoch_{it}.loss"))
        torch.save([val_acc], os.path.join(path, f"epoch_{it}.acc"))

    print(val_stats)
    trainer.shutdown()
    print("success!")
Example #12
        val_loader = DataLoader(test_shard, batch_size=64)
        self.register_data(train_loader=train_loader,
                           validation_loader=val_loader)


# You can either train the model like this

trainer = TorchTrainer(training_operator_cls=CustomOperator,
                       num_workers=num_executors,
                       add_dist_sampler=False,
                       num_cpus_per_worker=1,
                       config={"lr": 0.01})
for i in range(10):
    stats = trainer.train()
    print(stats)
    val_stats = trainer.validate()
    print(val_stats)
trainer.shutdown()

# Or you can perform a hyperparameter search using Ray Tune

# TorchTrainable = TorchTrainer.as_trainable(
#                     training_operator_cls=CustomOperator,
#                     num_workers=num_executors,
#                     add_dist_sampler=False,
#                     use_gpu=False,
#                     num_cpus_per_worker=1
#                  )
# analysis = tune.run(
#                 TorchTrainable,
#                 config={"lr": tune.grid_search([0.01, 0.1])})
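# The best configuration can then be read off the returned analysis object
# (a sketch; the metric name assumes the operator reports "val_loss"):
# best_config = analysis.get_best_config(metric="val_loss", mode="min")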
Example #13
    trainer1 = TorchTrainer(
        training_operator_cls=SafetyTrainingOperator,
        num_workers=1,
        use_gpu=True,
        config={
            "lr": 1e-2,  # used in optimizer_creator
            "hidden_size": 1,  # used in model_creator
            "batch_size": 1024,  # used in data_creator
            "path": path1,  # path to load the agent nn
        },
        backend="auto",
        scheduler_step_freq="epoch")
    for i in range(100):
        stats = trainer1.train()
        print(stats)

    print(trainer1.validate())
    torch.save(trainer1.state_dict(), "checkpoint.pt")
    torch.save(trainer1.get_model().state_dict(), "invariant_checkpoint.pt")
    m = trainer1.get_model()
    print(f"trained weight: torch.tensor([[{m[0].weight.data.cpu().numpy()[0][0]},{m[0].weight.data.cpu().numpy()[0][1]}]]), bias: torch.tensor({m[0].bias.data.cpu().numpy()})")
    # trainer1.shutdown()
    print("success!")
else:
    m = torch.nn.Sequential(torch.nn.Linear(2, 50), torch.nn.ReLU(),
                            torch.nn.Linear(50, 1), torch.nn.Tanh())
    checkpoint = torch.load("invariant_checkpoint.pt", torch.device("cpu"))
    m.load_state_dict(checkpoint)
    # trained weight:  [[0.0018693  0.05228069]], bias: [-0.5533147] , train_loss = 0.0
    # trained weight:  [[-0.01369903  0.03511396]], bias: [-0.6535952] , train_loss = 0.0
    # trained weight:  [[0.00687088  0.26634103]], bias: [-0.6658108] , train_loss = 0.0
    # trained weight: torch.tensor([[0.038166143000125885,0.16197167336940765]]), bias: torch.tensor([-2.3122551])
# %%