Example #1
def test(epoch, num_epochs):
    losses = []
    n_right, n_total = 0, 0
    clf.eval()

    for i, (X_batch, y_cls) in enumerate(test_dataloader):
        with torch.no_grad():
            y = y_cls.cuda()
            X_batch = X_batch.cuda()

            y_pred = clf(X_batch)
            loss = criterion(y_pred, y)
            losses.append(loss.item())
            _, y_pred_cls = y_pred.max(1)
            n_right += (y_pred_cls == y).sum().item()
            n_total += len(X_batch)

    val_acc = n_right / n_total
    val_loss = np.mean(losses)

    send_metric("val_loss", val_loss)
    send_metric("val_acc", val_acc)
    wandb.log({"val_loss": val_loss, "val_acc": val_acc})
    print(
        f'Finished epoch {epoch}/{num_epochs} avg val loss: {val_loss:.3f}; median val loss: {np.median(losses):.3f}; '
        f'val acc: {val_acc:.3f}.')
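Example #1 leans on objects defined elsewhere in the script (clf, criterion, test_dataloader, and a configured wandb run). A minimal sketch of the imports it assumes, inferred from how the names are used rather than copied from the original:

# Imports inferred from usage in Example #1 (an assumption, not part of the original snippet)
import numpy as np
import torch
import wandb
from spell.metrics import send_metric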
Example #2
def train():
    NUM_EPOCHS = 10
    for epoch in range(1, NUM_EPOCHS + 1):
        losses = []

        for i, (X_batch, y) in enumerate(train_dataloader):
            optimizer.zero_grad()

            if IS_GPU:
                y = y.cuda()
                X_batch = X_batch.cuda()

            y_pred = clf(X_batch)
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()

            curr_loss = loss.item()
            if i % 200 == 0:
                print(
                    f"Finished epoch {epoch}/{NUM_EPOCHS}, batch {i}. Loss: {curr_loss:.3f}."
                )
                send_metric("loss", curr_loss)

            losses.append(curr_loss)

        print(f"Finished epoch {epoch}. "
              f"avg loss: {np.mean(losses)}; median loss: {np.median(losses)}")

        torch.save(clf.state_dict(), f"{CWD}/checkpoints/epoch_{epoch}.pth")
    torch.save(clf.state_dict(), f"{CWD}/checkpoints/model_final.pth")
Example #3
def handleEpochEnd(epoch, logs):
    global reward_avg_100
    global reward_avg_1000
    global epochs
    reward_avg_100 += logs['episode_reward']
    reward_avg_1000 += logs['episode_reward']
    epochs += 1
    if epochs % 100 == 0:
        metrics.send_metric("reward_last_100", reward_avg_100 / 100)
        reward_avg_100 = 0
    if epochs % 1000 == 0:
        metrics.send_metric("reward_last_1000", reward_avg_1000 / 1000)
        reward_avg_1000 = 0
Example #4
def train():
    torch.cuda.set_device(hvd.local_rank())
    torch.set_num_threads(1)
    clf.train()

    NUM_EPOCHS = args.epochs

    for epoch in range(start_epoch, NUM_EPOCHS + 1):
        train_sampler.set_epoch(epoch)
        test_sampler.set_epoch(epoch)

        losses = []

        for i, (X_batch, y_cls) in enumerate(train_dataloader):
            optimizer.zero_grad()

            y = y_cls.cuda()
            X_batch = X_batch.cuda()

            y_pred = clf(X_batch)
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()

            train_loss = loss.item()
            if hvd.rank() == 0:
                if i % 100 == 0:
                    print(
                        f'Finished epoch {epoch}/{NUM_EPOCHS}, batch {i}. loss: {train_loss:.3f}.'
                    )
                    send_metric("train_loss", train_loss)
                    writer.add_scalar(
                        "train_loss", train_loss,
                        (len(train_dataloader) // 100 + 1) * epoch +
                        (i // 100))
            losses.append(train_loss)

        if hvd.rank() == 0:
            print(
                f'Finished epoch {epoch}. '
                f'avg loss: {np.mean(losses)}; median loss: {np.median(losses)}'
            )
            test(epoch, NUM_EPOCHS)
            if epoch % 5 == 0:
                torch.save(clf.state_dict(),
                           f"/spell/checkpoints/epoch_{epoch}.pth")

    if hvd.rank() == 0:
        torch.save(clf.state_dict(),
                   f"/spell/checkpoints/epoch_{NUM_EPOCHS}.pth")
Example #5
def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()

        # added a log line here
        metrics.send_metric("train_nll_loss", loss.item())

        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
Example #6
def train():
    clf.train()
    NUM_EPOCHS = args.epochs

    wandb.init()
    wandb.config.update(args)
    wandb.watch(clf)

    for epoch in range(start_epoch, NUM_EPOCHS + 1):
        losses = []

        for i, (X_batch, y_cls) in enumerate(train_dataloader):
            optimizer.zero_grad()

            y = y_cls.cuda()
            X_batch = X_batch.cuda()

            y_pred = clf(X_batch)
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()

            train_loss = loss.item()
            if i % 200 == 0:
                print(
                    f'Finished epoch {epoch}/{NUM_EPOCHS}, batch {i}. loss: {train_loss:.3f}.'
                )
                send_metric("train_loss", train_loss)
                wandb.log({"train_loss": train_loss})
            losses.append(train_loss)

        print(f'Finished epoch {epoch}. '
              f'avg loss: {np.mean(losses)}; median loss: {np.median(losses)}')
        test(epoch, NUM_EPOCHS)
        if epoch % 5 == 0:
            torch.save(clf.state_dict(),
                       f"{CWD}/checkpoints/epoch_{epoch}.pth")

    torch.save(clf.state_dict(), f"{CWD}/checkpoints/model_final.pth")
Example #7
def test(model, device, test_loader):
    model.eval()
    with torch.no_grad():
        for data, target in test_loader:
            vid_len = target.size()[1]
            data = data.unsqueeze(2).type(torch.FloatTensor).to(device)
            target = target.type(torch.LongTensor).to(device)
            output = model(data)
            pred = output.argmax(
                dim=1,
                keepdim=True)  # get the index of the max log-probability
            correct = pred.eq(target.view_as(pred)).sum().item()
            print("Video length: ", vid_len, " Correct predictions: ", correct,
                  " Percent: ", str(correct / vid_len))
            metrics.send_metric("Val_correctpct", correct / vid_len)
Example #8
def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(
                output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(
                dim=1,
                keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    # added a log line here
    metrics.send_metric("test_avg_nll_loss", test_loss)

    print(
        '\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
            test_loss, correct, len(test_loader.dataset),
            100. * correct / len(test_loader.dataset)))
Example #9
import spell.metrics as metrics
import time
import random

if __name__ == "__main__":
    positive = 0
    num = 0
    step = 0
    for i in range(30):
        add = random.choice([-1, 1])  # take a random +1/-1 step
        num += add
        positive += add
        positive = max(0, positive)
        print("Step " + str(step) + ": " + str(add) + " - num: " + str(num) +
              " - pos: " + str(positive))
        metrics.send_metric("pos_walk", positive)
        metrics.send_metric("walk", num)
        metrics.send_metric("text", "Hi! Number is " + str(num))
        step += 1
        time.sleep(1)
Example #10
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size',
                        type=int,
                        default=64,
                        metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size',
                        type=int,
                        default=1000,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs',
                        type=int,
                        default=14,
                        metavar='N',
                        help='number of epochs to train (default: 14)')
    parser.add_argument('--lr',
                        type=float,
                        default=1.0,
                        metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma',
                        type=float,
                        default=0.7,
                        metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval',
        type=int,
        default=10,
        metavar='N',
        help='how many batches to wait before logging training status')

    parser.add_argument('--save-model',
                        action='store_true',
                        default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_loader = torch.utils.data.DataLoader(datasets.MNIST(
        '../data',
        train=True,
        download=True,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])),
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               **kwargs)
    test_loader = torch.utils.data.DataLoader(datasets.MNIST(
        '../data',
        train=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])),
                                              batch_size=args.test_batch_size,
                                              shuffle=True,
                                              **kwargs)

    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)

    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(model, device, test_loader)

        # added a log line here; get_last_lr() returns one value per param group
        metrics.send_metric("scheduler_lr", scheduler.get_last_lr()[0])

        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
Example #11
import spell.metrics as metrics
import time
import argparse

# Runs for --steps seconds and sends --steps spell metrics with the key 'value'
# and a numeric value starting at --start and incrementing by --stepsize
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--start", type=float, help="Value to start at")
    parser.add_argument("--steps", type=int, help="Number of metrics to send")
    parser.add_argument("--stepsize", type=float, help="Size of step to take")
    args = parser.parse_args()

    value = args.start
    for i in range(args.steps):
        print("Sending metric {}".format(value))
        metrics.send_metric("value", value)
        value += args.stepsize
        time.sleep(1)
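All three flags are declared without defaults, so the script expects each of them on every run. A typical invocation (the filename here is assumed) would be

    python send_metric_walk.py --start 0 --steps 10 --stepsize 0.5

which prints and sends ten values one second apart, counting up from 0 in increments of 0.5.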