Example #1
    def run(self):
        tensorboard_dir = os.path.join(
            os.getenv("ADAPTDL_TENSORBOARD_LOGDIR", "/tmp"),
            adaptdl.env.job_id())
        with SummaryWriter(tensorboard_dir) as writer:
            for epoch in adl.remaining_epochs_until(args.epochs):
                epoch_start_time = time.time()
                self.train(train_txt, epoch, writer)
                val_loss = self.evaluate(self.model, val_txt, epoch, writer)

                print('-' * 89)
                print(f'| end of epoch {epoch:3d} '
                      f'| time: {(time.time() - epoch_start_time):5.2f}s '
                      f'| valid loss {val_loss:5.2f} '
                      f'| valid ppl {np.exp(val_loss):8.2f}')
                print('-' * 89)

                if val_loss < self.best_val_loss:
                    self.best_val_loss = val_loss
                    self.best_model = self.model

                self.scheduler.step()

        test_loss = self.evaluate(self.best_model, test_txt)
        print('=' * 89)
        print(f'| End of training | test loss {test_loss:5.2f} | '
              f'test ppl {np.exp(test_loss):8.2f}')
        print('=' * 89)
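The snippet above assumes imports and attributes set up elsewhere in the class (`args`, the model, the scheduler, and the train/val/test corpora). A minimal sketch of the imports it relies on, assuming the TensorBoard writer comes from torch.utils.tensorboard:

import os
import time

import numpy as np
import adaptdl.env
import adaptdl.torch as adl
from torch.utils.tensorboard import SummaryWriter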
Example #2
def test_single_replica_parallel():
    adl.init_process_group("gloo")
    true_values = np.asarray([3.0, 4.0])
    dataset = LRIterableDataset(1000, true_values, 1.0)
    dataloader = adl.AdaptiveDataLoader(dataset,
                                        batch_size=32,
                                        shuffle=False,
                                        num_workers=1)
    model = torch.nn.Linear(1, 1, bias=True)
    params = [model.bias, model.weight]
    sgd = torch.optim.SGD([{"params": [param]} for param in params], lr=0.01)
    schedule = torch.optim.lr_scheduler.MultiStepLR(sgd, [50])
    model = adl.AdaptiveDataParallel(model, sgd, schedule)
    loss = torch.nn.MSELoss()
    for epoch in adl.remaining_epochs_until(100):
        for inputs, targets in dataloader:
            inputs = inputs.float()
            targets = targets.float()
            sgd.zero_grad()
            output = model(torch.reshape(inputs, (-1, 1)))
            targets = torch.reshape(targets, (-1, 1))
            loss_value = loss(output, targets)
            loss_value.backward()
            sgd.step()
        schedule.step()
    params = np.asarray([param.item() for param in params])
    assert(np.all(np.isclose(params, true_values, atol=0.1))), \
        (params, true_values)
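`LRIterableDataset` is not defined in this snippet. A minimal sketch of a compatible dataset, assuming the constructor takes a sample count, the true (bias, weight) pair, and a noise scale, and yields scalar (x, y) pairs with y = weight * x + bias + noise:

import numpy as np
import torch


class LRIterableDataset(torch.utils.data.IterableDataset):
    """Hypothetical linear-regression dataset used by the test above."""

    def __init__(self, size, true_values, noise_scale):
        self.size = size
        self.bias, self.weight = true_values  # matches params = [bias, weight]
        self.noise_scale = noise_scale

    def __iter__(self):
        rng = np.random.default_rng()
        for _ in range(self.size):
            x = rng.uniform(-1.0, 1.0)
            y = self.weight * x + self.bias + rng.normal(0.0, self.noise_scale)
            yield x, y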
Example #3
def train_mnist(config: Dict, checkpoint_dir: Optional[str] = None):
    # Data Setup
    mnist_transforms = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307, ), (0.3081, ))])

    train_loader = adl.AdaptiveDataLoader(datasets.MNIST(
        "~/data", train=True, download=True, transform=mnist_transforms),
                                          batch_size=64,
                                          shuffle=True)

    # Autoscale batch size
    train_loader.autoscale_batch_size(4096, local_bsz_bounds=(16, 1024))

    test_loader = adl.AdaptiveDataLoader(datasets.MNIST(
        "~/data", train=False, transform=mnist_transforms),
                                         batch_size=64,
                                         shuffle=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = ConvNet()
    optimizer = optim.SGD(model.parameters(),
                          lr=config.get("lr", 0.01),
                          momentum=config.get("momentum", 0.79))

    model.to(device)
    model = adl.AdaptiveDataParallel(model, optimizer)

    for epoch in adl.remaining_epochs_until(config.get("epochs", 10)):
        train(model, optimizer, train_loader)
        acc = test(model, test_loader)
        # Send the current training result back to Tune
        tune.report(mean_accuracy=acc)
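The `train` and `test` helpers called in the loop are not part of this snippet. A minimal sketch, assuming `ConvNet` returns log-probabilities so that `F.nll_loss` applies:

import torch
import torch.nn.functional as F


def train(model, optimizer, train_loader):
    model.train()
    device = next(model.parameters()).device
    for data, target in train_loader:
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        loss = F.nll_loss(model(data), target)
        loss.backward()
        optimizer.step()


def test(model, test_loader):
    model.eval()
    device = next(model.parameters()).device
    correct, total = 0, 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            pred = model(data).argmax(dim=1)
            correct += (pred == target).sum().item()
            total += target.size(0)
    return correct / total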
Example #4
def _train_simple(config: Dict, checkpoint_dir: Optional[str] = None):
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    H = config.get("H", 16)
    N = config.get("N", 16)

    # `dataset`, `D_in`, and `D_out` are defined outside this snippet
    # (Example #5 shows the full, self-contained version).
    dataloader = adl.AdaptiveDataLoader(dataset, batch_size=N)
    dataloader.autoscale_batch_size(4096, local_bsz_bounds=(16, 1024))

    loss_fn = nn.MSELoss()

    # Use the nn package to define our model and loss function.
    model = torch.nn.Sequential(
        torch.nn.Linear(D_in, H),
        torch.nn.ReLU(),
        torch.nn.Linear(H, D_out),
    )
    optimizer = optim.SGD(model.parameters(), lr=0.1)

    model = model.to(device)
    model = adl.AdaptiveDataParallel(model, optimizer)

    loss = torch.Tensor([0.0])
    for epoch in adl.remaining_epochs_until(config.get("epochs", 10)):
        for (x, y) in dataloader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            output = model(x)
            loss = loss_fn(output, y)
            loss.backward()
            optimizer.step()

        tune.report(mean_loss=loss.item())
Example #5
def _train_simple(config: Dict, checkpoint_dir: Optional[str] = None):
    import torch
    import torch.nn as nn
    import torch.optim as optim
    import adaptdl.torch as adl
    from ray import tune

    class MyDataset:
        def __init__(self, xs, ys):
            self.xs = xs
            self.ys = ys

        def __getitem__(self, i):
            return self.xs[i], self.ys[i]

        def __len__(self):
            return len(self.xs)

    # N is batch size; D_in is input dimension;
    # H is hidden dimension; D_out is output dimension.
    N, D_in, H, D_out = 64, 5, 5, 5
    dataset = MyDataset(torch.randn(N, D_in), torch.randn(N, D_out))

    # The hidden size and batch size may be overridden by the Tune config.
    H = config.get("H", 16)
    N = config.get("N", 16)

    # Wrap the random dataset in an adaptive data loader
    dataloader = adl.AdaptiveDataLoader(dataset, batch_size=N)
    dataloader.autoscale_batch_size(4096, local_bsz_bounds=(16, 1024))

    loss_fn = nn.MSELoss()

    # Use the nn package to define our model and loss function.
    model = torch.nn.Sequential(
        torch.nn.Linear(D_in, H),
        torch.nn.ReLU(),
        torch.nn.Linear(H, D_out),
    )
    optimizer = optim.SGD(model.parameters(), lr=0.1)

    model = adl.AdaptiveDataParallel(model, optimizer)

    loss = torch.Tensor([0.0])
    for epoch in adl.remaining_epochs_until(config.get("epochs", 10)):
        for (x, y) in dataloader:
            optimizer.zero_grad()
            output = model(x)
            loss = loss_fn(output, y)
            loss.backward()
            optimizer.step()

        tune.report(mean_loss=loss.item())
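This trainable uses the function-based Tune API (`tune.report`), so it can be handed to `tune.run`. A plain launch, purely as an illustration; the AdaptDL-specific elastic trial scheduling used in the full example is not shown here:

from ray import tune

analysis = tune.run(
    _train_simple,
    config={"H": tune.choice([8, 16, 32]),
            "N": tune.choice([16, 32, 64]),
            "epochs": 10},
    num_samples=4)
print(analysis.get_best_config(metric="mean_loss", mode="min"))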
Example #6
class SimpleDataset(torch.utils.data.Dataset):
    def __init__(self, size):
        # x and y are constructed here in the full example (not shown in this snippet)
        self.data = list(zip(x, y))

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.data)


dataset = SimpleDataset(10000)
dataloader = adl.AdaptiveDataLoader(dataset,
                                    batch_size=args.bs,
                                    shuffle=True,
                                    num_workers=2,
                                    drop_last=True)

optimizer = optim.SGD(net.parameters(),
                      lr=args.lr,
                      momentum=0.9,
                      weight_decay=5e-4)
lr_scheduler = MultiStepLR(optimizer, [30, 45], 0.1)

net = adl.AdaptiveDataParallel(net, optimizer, lr_scheduler)
trainer = Trainer(net, optimizer, lr_scheduler)

for epoch in adl.remaining_epochs_until(args.epochs):

    for inputs, targets in dataloader:
        batch_stat = trainer.train(inputs, targets)
        print(batch_stat)
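The `Trainer` helper and the `net` model come from earlier in the script and are not shown. A minimal sketch of a compatible `Trainer`, assuming a classification loss and that device placement is handled elsewhere:

import torch.nn as nn


class Trainer:
    """Hypothetical wrapper around a single optimization step."""

    def __init__(self, net, optimizer, lr_scheduler):
        self.net = net
        self.optimizer = optimizer
        self.lr_scheduler = lr_scheduler
        self.criterion = nn.CrossEntropyLoss()

    def train(self, inputs, targets):
        self.net.train()
        self.optimizer.zero_grad()
        outputs = self.net(inputs)
        loss = self.criterion(outputs, targets)
        loss.backward()
        self.optimizer.step()
        return {"loss": loss.item(), "batch_size": targets.size(0)}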
Example #7
        # Save Losses for plotting later
        G_losses.append(errG.item())
        D_losses.append(errD.item())

        stats["g_loss_sum"] += errG.item()
        stats["d_loss_sum"] += errD.item()
    stats["norm"] += metrics._metrics_state().grad_params[0]
    stats["var"] += metrics._metrics_state().grad_params[1]
    stats["replicas"] += 1.0
    scheduleD.step()
    scheduleG.step()

    with stats.synchronized():
        with SummaryWriter(adaptdl.get_tensorboard_dir()) as writer:
            writer.add_scalar("Loss/G",
                              stats["g_loss_sum"] / stats["replicas"], epoch)
            writer.add_scalar("Loss/D",
                              stats["d_loss_sum"] / stats["replicas"], epoch)
            writer.add_scalar("Performance/GlobalBatchsize",
                              b_size * stats["replicas"], epoch)
            writer.add_scalar("Performance/Replicas", stats["replicas"], epoch)
            writer.add_scalar("Stats/Variance",
                              stats["norm"] / stats["replicas"], epoch)
            writer.add_scalar("Stats/Norm", stats["var"] / stats["replicas"],
                              epoch)


for epoch in adl.remaining_epochs_until(num_epochs):
    train(epoch)
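The dict-like `stats` object indexed inside `train` is presumably an adaptdl.torch.Accumulator, whose entries are summed locally with `+=` and aggregated across replicas when read inside a `synchronized()` block. Assuming that is what the full example uses, it would be created once before the training loop:

import adaptdl.torch as adl

# Dict-like accumulator; += updates stay local, and entries are
# all-reduced across replicas when read inside stats.synchronized().
stats = adl.Accumulator()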