Example #1
import pytest
import torch.nn as nn

from ray.util.sgd import TorchTrainer
from ray.util.sgd.torch import TrainingOperator


# PTL_Module, CorrectnessOperator, and the ray_start_2_cpus fixture are
# defined elsewhere in the test module; the parametrize decorators are
# assumed here, mirroring test_single_step in Example #3.
@pytest.mark.parametrize("num_workers", [1, 2])
@pytest.mark.parametrize("use_local", [True, False])
def test_correctness(ray_start_2_cpus, num_workers, use_local):
    layer = nn.Linear(1, 1)
    ptl_op = TrainingOperator.from_ptl(PTL_Module)
    # First run: operator generated from the PyTorch Lightning module.
    trainer1 = TorchTrainer(training_operator_cls=ptl_op,
                            config={
                                "layer": layer,
                                "data_size": 3,
                                "batch_size": 1
                            },
                            num_workers=num_workers,
                            use_local=use_local)
    train1_stats = trainer1.train()
    val1_stats = trainer1.validate()
    trainer1.shutdown()

    # Second run: CorrectnessOperator, a hand-written reference
    # implementation of the same training and validation loop.
    trainer2 = TorchTrainer(training_operator_cls=CorrectnessOperator,
                            scheduler_step_freq="manual",
                            config={
                                "layer": layer,
                                "data_size": 3,
                                "batch_size": 1
                            },
                            num_workers=num_workers,
                            use_local=use_local)
    train2_stats = trainer2.train()
    val2_stats = trainer2.validate()
    trainer2.shutdown()

    # Both runs should produce numerically identical results.
    assert train1_stats["train_loss"] == train2_stats["train_loss"]
    assert val1_stats["val_loss"] == val2_stats["val_loss"]
    # The PTL-derived operator reports accuracy under "val_acc", while
    # CorrectnessOperator uses the key "val_accuracy".
    assert val1_stats["val_acc"] == val2_stats["val_accuracy"]
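
Examples #1 and #3 both rely on a LightningModule named PTL_Module whose definition is not shown here. A minimal sketch of what such a module could look like, assuming from_ptl forwards the trainer's config dict to the module constructor (the body below is illustrative, not the original definition):

import torch
import torch.nn as nn
import pytorch_lightning as ptl
from torch.utils.data import DataLoader, TensorDataset


class PTL_Module(ptl.LightningModule):
    def __init__(self, config):
        super().__init__()
        self.layer = config["layer"]  # e.g. nn.Linear(1, 1) as in Example #1
        self.loss = nn.MSELoss()
        # A tiny identity-regression dataset sized by the config.
        x = torch.randn(config["data_size"], 1)
        self.dataset = TensorDataset(x, x)
        self.batch_size = config["batch_size"]

    def forward(self, x):
        return self.layer(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return {"loss": self.loss(self(x), y)}

    def validation_step(self, batch, batch_idx):
        x, y = batch
        return {"val_loss": self.loss(self(x), y)}

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=1e-2)

    def train_dataloader(self):
        return DataLoader(self.dataset, batch_size=self.batch_size)

    def val_dataloader(self):
        return DataLoader(self.dataset, batch_size=self.batch_size)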
Example #2
from ray.util.sgd import TorchTrainer
from ray.util.sgd.torch import TrainingOperator


def train_mnist(num_workers=1, use_gpu=False, num_epochs=5):
    # LitMNIST is a pytorch_lightning.LightningModule defined elsewhere.
    Operator = TrainingOperator.from_ptl(LitMNIST)
    trainer = TorchTrainer(
        training_operator_cls=Operator,
        num_workers=num_workers,
        config={"lr": 1e-3, "batch_size": 64},
        use_gpu=use_gpu,
        use_tqdm=True,
    )
    # Each call to train() runs a single epoch across all workers.
    for i in range(num_epochs):
        stats = trainer.train()
        print(stats)

    print(trainer.validate())
    print("Saving model checkpoint to ./model.pt")
    trainer.save("./model.pt")
    print("Model Checkpointed!")
    trainer.shutdown()
    print("success!")
Example #3
        # (tail of PTL_Module.__init__; the preceding lines, not shown,
        # build self.train_loader, self.val_loader, and self.rand_int)
        self.loss = nn.MSELoss()

    def train_dataloader(self):
        return self.train_loader

    def val_dataloader(self):
        return self.val_loader

    def on_save_checkpoint(self, checkpoint):
        # Stash extra state in the checkpoint dict alongside the weights.
        checkpoint["int"] = self.rand_int

    def on_load_checkpoint(self, checkpoint):
        # Restore that extra state when the checkpoint is loaded.
        self.rand_int = checkpoint["int"]


# Generate a Ray SGD TrainingOperator class from the Lightning module.
Operator = TrainingOperator.from_ptl(PTL_Module)


@pytest.mark.parametrize("use_local", [True, False])
def test_single_step(ray_start_2_cpus, use_local):  # noqa: F811
    trainer = TorchTrainer(training_operator_cls=Operator,
                           num_workers=1,
                           use_local=use_local,
                           use_gpu=False)
    # num_steps=1 limits the run to a single batch; BATCH_COUNT is the
    # metrics key defined in ray.util.sgd.utils.
    metrics = trainer.train(num_steps=1)
    assert metrics[BATCH_COUNT] == 1

    val_metrics = trainer.validate(num_steps=1)
    assert val_metrics[BATCH_COUNT] == 1
    trainer.shutdown()
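
The on_save_checkpoint/on_load_checkpoint hooks above only take effect once a checkpoint is actually written and read back. A sketch of such a round-trip via TorchTrainer.save and TorchTrainer.load, assuming (this is not shown in the source) that the PTL-derived operator routes these calls through the Lightning checkpoint hooks:

def checkpoint_roundtrip(path="./ptl_checkpoint.pt"):
    trainer = TorchTrainer(training_operator_cls=Operator,
                           num_workers=1,
                           use_gpu=False)
    trainer.train(num_steps=1)
    trainer.save(path)      # expected to invoke on_save_checkpoint
    trainer.shutdown()

    restored = TorchTrainer(training_operator_cls=Operator,
                            num_workers=1,
                            use_gpu=False)
    restored.load(path)     # expected to invoke on_load_checkpoint
    restored.shutdown()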