Example #1
def main():
    setup_default_logging()

    args, args_text = parse_args()
    if args.smoke_test:
        ray.init(num_cpus=int(args.ray_num_workers))
    else:
        ray.init(address=args.ray_address)

    CustomTrainingOperator = TrainingOperator.from_creators(
        model_creator=model_creator,
        optimizer_creator=optimizer_creator,
        data_creator=data_creator,
        loss_creator=loss_creator)
    trainer = TorchTrainer(training_operator_cls=CustomTrainingOperator,
                           use_tqdm=True,
                           use_fp16=args.amp,
                           apex_args={"opt_level": "O1"},
                           config={
                               "args": args,
                               BATCH_SIZE: args.batch_size
                           },
                           num_workers=args.ray_num_workers)

    if args.smoke_test:
        args.epochs = 1

    pbar = trange(args.epochs, unit="epoch")
    for i in pbar:
        trainer.train(num_steps=1 if args.smoke_test else None)

        val_stats = trainer.validate(num_steps=1 if args.smoke_test else None)
        pbar.set_postfix(dict(acc=val_stats["val_accuracy"]))

    trainer.shutdown()
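
Example #1 depends on a parse_args() helper that is not shown. A minimal hypothetical reconstruction, covering only the attributes the example actually reads (all flag names and defaults below are assumptions), might look like:

import argparse


def parse_args():
    # Hypothetical sketch: only the flags Example #1 reads are defined.
    parser = argparse.ArgumentParser()
    parser.add_argument("--smoke-test", action="store_true")
    parser.add_argument("--ray-address", type=str, default=None)
    parser.add_argument("--ray-num-workers", type=int, default=1)
    parser.add_argument("--amp", action="store_true")
    parser.add_argument("--batch-size", type=int, default=32)
    parser.add_argument("--epochs", type=int, default=10)
    args = parser.parse_args()
    # The second return value is assumed to be a textual dump of the args.
    return args, str(vars(args))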
Example #2
def train_example(num_workers=1, use_gpu=False):
    trainer1 = TorchTrainer(
        model_creator=model_creator,
        data_creator=data_creator,
        optimizer_creator=optimizer_creator,
        loss_creator=nn.MSELoss,
        scheduler_creator=scheduler_creator,
        num_workers=num_workers,
        use_gpu=use_gpu,
        config={
            "lr": 1e-2,  # used in optimizer_creator
            "hidden_size": 1,  # used in model_creator
            "batch_size": 4,  # used in data_creator
        },
        backend="gloo",
        scheduler_step_freq="epoch")
    for i in range(5):
        stats = trainer1.train()
        print(stats)

    print(trainer1.validate())
    m = trainer1.get_model()
    print("trained weight: % .2f, bias: % .2f" % (
        m.weight.item(), m.bias.item()))
    trainer1.shutdown()
    print("success!")
Example #3
def test_save_and_restore(ray_start_2_cpus, num_workers, use_local,
                          tmp_path):  # noqa: F811
    trainer1 = TorchTrainer(training_operator_cls=Operator,
                            num_workers=num_workers,
                            use_local=use_local)
    trainer1.train()
    checkpoint_path = os.path.join(tmp_path, "checkpoint")
    trainer1.save(checkpoint_path)

    model1 = trainer1.get_model()
    ints1 = trainer1.apply_all_operators(lambda op: op.get_model().rand_int)[0]

    trainer1.shutdown()

    trainer2 = TorchTrainer(training_operator_cls=Operator,
                            num_workers=num_workers,
                            use_local=use_local)
    trainer2.load(checkpoint_path)

    model2 = trainer2.get_model()
    ints2 = trainer2.apply_all_operators(lambda op: op.get_model().rand_int)

    model1_state_dict = model1.state_dict()
    model2_state_dict = model2.state_dict()

    assert set(model1_state_dict.keys()) == set(model2_state_dict.keys())

    for k in model1_state_dict:
        assert torch.equal(model1_state_dict[k], model2_state_dict[k])
    for i in ints2:
        assert i == ints1
    trainer2.shutdown()
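
The Operator class used by these tests is a fixture from Ray's test suite and is not shown here. A hypothetical minimal version consistent with how the test uses it (each worker's model carries a random integer tag, and custom operator state hooks let save()/load() round-trip that tag) might be:

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from ray.util.sgd.torch import TrainingOperator


class Operator(TrainingOperator):
    def setup(self, config):
        model = nn.Linear(1, 1)
        # Random tag the test reads via op.get_model().rand_int.
        model.rand_int = int(np.random.randint(100))
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
        data = TensorDataset(torch.randn(8, 1), torch.randn(8, 1))
        loader = DataLoader(data, batch_size=2)
        self.model, self.optimizer, self.criterion = self.register(
            models=model, optimizers=optimizer, criterion=nn.MSELoss())
        self.register_data(train_loader=loader, validation_loader=loader)

    def state_dict(self):
        # Persist the tag so trainer2.load() restores it on every worker.
        return {"rand_int": self.model.rand_int}

    def load_state_dict(self, state_dict):
        self.model.rand_int = state_dict["rand_int"]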
Example #4
def test_correctness(ray_start_2_cpus, num_workers, use_local):
    layer = nn.Linear(1, 1)
    ptl_op = TrainingOperator.from_ptl(PTL_Module)
    trainer1 = TorchTrainer(training_operator_cls=ptl_op,
                            config={
                                "layer": layer,
                                "data_size": 3,
                                "batch_size": 1
                            },
                            num_workers=num_workers,
                            use_local=use_local)
    train1_stats = trainer1.train()
    val1_stats = trainer1.validate()
    trainer1.shutdown()

    trainer2 = TorchTrainer(training_operator_cls=CorrectnessOperator,
                            scheduler_step_freq="manual",
                            config={
                                "layer": layer,
                                "data_size": 3,
                                "batch_size": 1
                            },
                            num_workers=num_workers,
                            use_local=use_local)
    train2_stats = trainer2.train()
    val2_stats = trainer2.validate()
    trainer2.shutdown()

    assert train1_stats["train_loss"] == train2_stats["train_loss"]
    assert val1_stats["val_loss"] == val2_stats["val_loss"]
    assert val1_stats["val_acc"] == val2_stats["val_accuracy"]
Example #5
def train_example(num_workers=1, use_gpu=False):
    CustomTrainingOperator = TrainingOperator.from_creators(
        model_creator=model_creator,
        optimizer_creator=optimizer_creator,
        data_creator=data_creator,
        scheduler_creator=scheduler_creator,
        loss_creator=nn.MSELoss)
    trainer1 = TorchTrainer(
        training_operator_cls=CustomTrainingOperator,
        num_workers=num_workers,
        use_gpu=use_gpu,
        config={
            "lr": 1e-2,  # used in optimizer_creator
            "hidden_size": 1,  # used in model_creator
            "batch_size": 4,  # used in data_creator
        },
        backend="gloo",
        scheduler_step_freq="epoch")
    for i in range(5):
        stats = trainer1.train()
        print(stats)

    print(trainer1.validate())

    # If using Ray Client, make sure to force model onto CPU.
    import ray
    m = trainer1.get_model(to_cpu=ray.util.client.ray.is_connected())
    print("trained weight: % .2f, bias: % .2f" %
          (m.weight.item(), m.bias.item()))
    trainer1.shutdown()
    print("success!")
Example #6
def run(num_workers, use_gpu, num_epochs, lr, batch_size, n_hidden, n_layers,
        n_heads, fan_out, feat_drop, attn_drop, negative_slope,
        sampling_num_workers):
    trainer = TorchTrainer(training_operator_cls=CustomTrainingOperator,
                           num_workers=num_workers,
                           use_gpu=use_gpu,
                           backend="nccl",
                           config={
                               "lr": lr,
                               "batch_size": batch_size,
                               "n_hidden": n_hidden,
                               "n_layers": n_layers,
                               "n_heads": n_heads,
                               "fan_out": fan_out,
                               "feat_drop": feat_drop,
                               "attn_drop": attn_drop,
                               "negative_slope": negative_slope,
                               "sampling_num_workers": sampling_num_workers
                           })

    for i in range(num_epochs):
        trainer.train()
    validation_results = trainer.validate()
    trainer.shutdown()
    print(validation_results)
    print("success!")
Example #7
def test_single_step(ray_start_2_cpus, use_local):  # noqa: F811
    trainer = TorchTrainer(training_operator_cls=Operator,
                           num_workers=1,
                           use_local=use_local,
                           use_gpu=False)
    metrics = trainer.train(num_steps=1)
    assert metrics[BATCH_COUNT] == 1

    val_metrics = trainer.validate(num_steps=1)
    assert val_metrics[BATCH_COUNT] == 1
    trainer.shutdown()
Example #8
def test_train(ray_start_2_cpus, num_workers, use_local):  # noqa: F811
    trainer = TorchTrainer(training_operator_cls=Operator,
                           num_workers=num_workers,
                           use_local=use_local,
                           use_gpu=False)
    for i in range(3):
        train_loss1 = trainer.train()["train_loss"]
    validation_loss1 = trainer.validate()["val_loss"]

    for i in range(3):
        train_loss2 = trainer.train()["train_loss"]
    validation_loss2 = trainer.validate()["val_loss"]

    assert train_loss2 <= train_loss1, (train_loss2, train_loss1)
    assert validation_loss2 <= validation_loss1, (validation_loss2,
                                                  validation_loss1)
    trainer.shutdown()
Example #9
def train_mnist(num_workers=1, use_gpu=False, num_epochs=5):
    Operator = TrainingOperator.from_ptl(LitMNIST)
    trainer = TorchTrainer(
        training_operator_cls=Operator,
        num_workers=num_workers,
        config={"lr": 1e-3, "batch_size": 64},
        use_gpu=use_gpu,
        use_tqdm=True,
    )
    for i in range(num_epochs):
        stats = trainer.train()
        print(stats)

    print(trainer.validate())
    print("Saving model checkpoint to ./model.pt")
    trainer.save("./model.pt")
    print("Model Checkpointed!")
    trainer.shutdown()
    print("success!")
Example #10
# __torch_ray_end__

# __backwards_compat_start__
from ray.util.sgd import TorchTrainer

MyTrainingOperator = TrainingOperator.from_creators(
    model_creator=model_creator, optimizer_creator=optimizer_creator,
    loss_creator=loss_creator, scheduler_creator=scheduler_creator,
    data_creator=data_creator)

trainer = TorchTrainer(
    training_operator_cls=MyTrainingOperator,
    scheduler_step_freq="epoch",  # if scheduler_creator is passed in
    config={"lr": 0.001, "batch_size": 64})

# __backwards_compat_end__

trainer.shutdown()

# __torch_trainer_start__
from ray.util.sgd import TorchTrainer

trainer = TorchTrainer(
    training_operator_cls=MyTrainingOperator,
    scheduler_step_freq="epoch",  # if scheduler is used
    config={"lr": 0.001, "batch_size": 64})

# __torch_trainer_end__

trainer.shutdown()
Example #11
import ray
import torch
from torch.utils.data import DataLoader, TensorDataset
from ray.util.sgd import TorchTrainer
from ray.util.sgd.torch import TrainingOperator


class CustomTrainingOperator(TrainingOperator):
    def setup(self, config):
        # The start of this example was truncated in the source; a minimal
        # model and synthetic data loaders are assumed here.
        model = torch.nn.Linear(1, 1)
        data = TensorDataset(torch.randn(64, 1), torch.randn(64, 1))
        train_loader = DataLoader(data, batch_size=config["batch_size"])
        val_loader = DataLoader(data, batch_size=config["batch_size"])

        # Create optimizer.
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

        # Create loss.
        loss = torch.nn.MSELoss()

        # Register model, optimizer, and loss.
        self.model, self.optimizer, self.criterion = self.register(
            models=model,
            optimizers=optimizer,
            criterion=loss)

        # Register data loaders.
        self.register_data(train_loader=train_loader,
                           validation_loader=val_loader)


ray.init(address='auto')

trainer1 = TorchTrainer(
    training_operator_cls=CustomTrainingOperator,
    num_workers=2,
    use_gpu=False,
    config={"batch_size": 64})

stats = trainer1.train()
print(stats)
trainer1.shutdown()
print("success!")
Example #12
def main(args):
    if args.smoke_test:
        ray.init(num_cpus=4)
    else:
        ray.init(address=args.address,
                 num_cpus=args.num_workers,
                 log_to_driver=True)

    # Trainer Initialization
    trainer = TorchTrainer(training_operator_cls=CIFAR10Module,
                           num_workers=args.num_workers,
                           config={
                               "lr": args.learning_rate,
                               "lr_decay": args.lr_decay,
                               "eps": args.eps,
                               "momentum": args.momentum,
                               "wd": args.wd,
                               "data_dir": args.data_dir,
                               "batch_size": args.batch_size,
                               "num_workers": args.num_workers,
                               "smoke_test": args.smoke_test
                           },
                           use_gpu=args.use_gpu,
                           scheduler_step_freq="epoch",
                           use_fp16=args.fp16,
                           use_tqdm=False)

    train_loss = []
    val_loss = []
    val_acc = []

    path = os.path.join("/root/volume/Paper/MLVC_Internship",
                        args.checkpoint_dir,
                        args.model_name + "_" + str(args.trial))
    os.makedirs(path, exist_ok=True)

    pbar = trange(args.max_epochs, unit="epoch")
    for it in pbar:
        stats = trainer.train(max_retries=1,
                              info=dict(epoch_idx=it,
                                        num_epochs=args.max_epochs))
        train_loss.append(stats["train_loss"])
        val_stats = trainer.validate()
        val_loss.append(val_stats["val_loss"])
        val_acc.append(val_stats["val_accuracy"])
        pbar.set_postfix(dict(acc=val_stats["val_accuracy"]))

        trainer.save(os.path.join(path, "epoch_{}.ray".format(it)))
        torch.save([train_loss, val_loss],
                   os.path.join(path, "epoch_{}.loss".format(it)))
        torch.save([val_acc],
                   os.path.join(path, "epoch_{}.acc".format(it)))

    print(val_stats)
    trainer.shutdown()
    print("success!")