def test(epoch, num_epochs):
    losses = []
    n_right, n_total = 0, 0
    clf.eval()
    for i, (X_batch, y_cls) in enumerate(test_dataloader):
        with torch.no_grad():
            y = y_cls.cuda()
            X_batch = X_batch.cuda()
            y_pred = clf(X_batch)
            loss = criterion(y_pred, y)
            losses.append(loss.item())
            _, y_pred_cls = y_pred.max(1)
            n_right += (y_pred_cls == y).sum().item()
            n_total += len(X_batch)
    val_acc = n_right / n_total
    val_loss = np.mean(losses)
    send_metric("val_loss", val_loss)
    send_metric("val_acc", val_acc)
    wandb.log({"val_loss": val_loss, "val_acc": val_acc})
    print(f'Finished epoch {epoch}/{num_epochs} avg val loss: {val_loss:.3f}; '
          f'median val loss: {np.median(losses):.3f}; val acc: {val_acc:.3f}.')

def train():
    NUM_EPOCHS = 10
    for epoch in range(1, NUM_EPOCHS + 1):
        losses = []
        for i, (X_batch, y) in enumerate(train_dataloader):
            optimizer.zero_grad()
            if IS_GPU:
                y = y.cuda()
                X_batch = X_batch.cuda()
            y_pred = clf(X_batch)
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()
            curr_loss = loss.item()
            if i % 200 == 0:
                print(f"Finished epoch {epoch}/{NUM_EPOCHS}, batch {i}. Loss: {curr_loss:.3f}.")
                send_metric("loss", curr_loss)
            losses.append(curr_loss)
        print(f"Finished epoch {epoch}. "
              f"avg loss: {np.mean(losses)}; median loss: {np.median(losses)}")
        torch.save(clf.state_dict(), f"{CWD}/checkpoints/epoch_{epoch}.pth")
    torch.save(clf.state_dict(), f"{CWD}/checkpoints/model_final.pth")
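
# The test()/train() pair above references module-level names rather than
# taking them as arguments. A minimal sketch of the setup they assume; the
# model, data, and hyperparameters here are placeholders inferred from usage,
# not the original values:
import os
import numpy as np
import torch
from torch import nn, optim
import wandb
from spell.metrics import send_metric

CWD = os.getcwd()
IS_GPU = torch.cuda.is_available()
clf = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 10))  # placeholder model
if IS_GPU:
    clf = clf.cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(clf.parameters())
# train_dataloader and test_dataloader are assumed to yield (X_batch, y) batches.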

def handleEpochEnd(epoch, logs):
    global reward_avg_100
    global reward_avg_1000
    global epochs
    reward_avg_100 += logs['episode_reward']
    reward_avg_1000 += logs['episode_reward']
    epochs += 1
    if epochs % 100 == 0:
        metrics.send_metric("reward_last_100", reward_avg_100 / 100)
        reward_avg_100 = 0
    if epochs % 1000 == 0:
        metrics.send_metric("reward_last_1000", reward_avg_1000 / 1000)
        reward_avg_1000 = 0
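
# One way to wire the handler in (a sketch, assuming a keras-rl style agent
# whose fit() reports each finished episode as an "epoch" and passes an
# 'episode_reward' entry in logs; `agent` and `env` are placeholders for
# your own setup):
from keras.callbacks import LambdaCallback

reward_avg_100 = 0
reward_avg_1000 = 0
epochs = 0
agent.fit(env, nb_steps=50000,
          callbacks=[LambdaCallback(on_epoch_end=handleEpochEnd)])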

def train():
    torch.cuda.set_device(hvd.local_rank())
    torch.set_num_threads(1)
    clf.train()
    NUM_EPOCHS = args.epochs
    for epoch in range(start_epoch, NUM_EPOCHS + 1):
        train_sampler.set_epoch(epoch)
        test_sampler.set_epoch(epoch)
        losses = []
        for i, (X_batch, y_cls) in enumerate(train_dataloader):
            optimizer.zero_grad()
            y = y_cls.cuda()
            X_batch = X_batch.cuda()
            y_pred = clf(X_batch)
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()
            train_loss = loss.item()
            if hvd.rank() == 0 and i % 100 == 0:
                print(f'Finished epoch {epoch}/{NUM_EPOCHS}, batch {i}. loss: {train_loss:.3f}.')
                send_metric("train_loss", train_loss)
                # global step matches the every-100-batches logging cadence
                writer.add_scalar("train_loss", train_loss,
                                  (len(train_dataloader) // 100 + 1) * epoch + (i // 100))
            losses.append(train_loss)
        if hvd.rank() == 0:
            print(f'Finished epoch {epoch}. '
                  f'avg loss: {np.mean(losses)}; median loss: {np.median(losses)}')
            test(epoch, NUM_EPOCHS)
            if epoch % 5 == 0:
                torch.save(clf.state_dict(), f"/spell/checkpoints/epoch_{epoch}.pth")
    if hvd.rank() == 0:
        torch.save(clf.state_dict(), f"/spell/checkpoints/epoch_{NUM_EPOCHS}.pth")
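
# The distributed loop above assumes the usual Horovod boilerplate has
# already run. A sketch of that setup; the dataset, batch size, and model
# names are assumptions, not the original values:
import horovod.torch as hvd
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

hvd.init()
train_sampler = DistributedSampler(train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
train_dataloader = DataLoader(train_dataset, batch_size=64, sampler=train_sampler)
optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=clf.named_parameters())
hvd.broadcast_parameters(clf.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)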

def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        # added a log line here
        metrics.send_metric("train_nll_loss", loss.item())
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

def train():
    clf.train()
    NUM_EPOCHS = args.epochs
    wandb.init()
    wandb.config.update(args)
    wandb.watch(clf)
    for epoch in range(start_epoch, NUM_EPOCHS + 1):
        losses = []
        for i, (X_batch, y_cls) in enumerate(train_dataloader):
            optimizer.zero_grad()
            y = y_cls.cuda()
            X_batch = X_batch.cuda()
            y_pred = clf(X_batch)
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()
            train_loss = loss.item()
            if i % 200 == 0:
                print(f'Finished epoch {epoch}/{NUM_EPOCHS}, batch {i}. loss: {train_loss:.3f}.')
                send_metric("train_loss", train_loss)
                wandb.log({"train_loss": train_loss})
            losses.append(train_loss)
        print(f'Finished epoch {epoch}. '
              f'avg loss: {np.mean(losses)}; median loss: {np.median(losses)}')
        test(epoch, NUM_EPOCHS)
        if epoch % 5 == 0:
            torch.save(clf.state_dict(), f"{CWD}/checkpoints/epoch_{epoch}.pth")
    torch.save(clf.state_dict(), f"{CWD}/checkpoints/model_final.pth")

def test(model, device, test_loader):
    model.eval()
    with torch.no_grad():
        for data, target in test_loader:
            vid_len = target.size()[1]
            data = data.unsqueeze(2).type(torch.FloatTensor).to(device)
            target = target.type(torch.LongTensor).to(device)
            output = model(data)
            # get the index of the max log-probability
            pred = output.argmax(dim=1, keepdim=True)
            correct = pred.eq(target.view_as(pred)).sum().item()
            print("Video length:", vid_len, "Correct predictions:", correct,
                  "Percent:", str(float(correct / vid_len)))
            metrics.send_metric("Val_correctpct", float(correct / vid_len))

def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            # sum up batch loss
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            # get the index of the max log-probability
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    test_loss /= len(test_loader.dataset)
    # added a log line here
    metrics.send_metric("test_avg_nll_loss", test_loss)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

import spell.metrics as metrics
import time
import random

if __name__ == "__main__":
    positive = 0
    num = 0
    step = 0
    for i in range(30):
        # take a random +/-1 step; the "pos_walk" counter is clamped at zero
        add = random.choice([-1, 1])
        num += add
        positive += add
        positive = max(0, positive)
        print("Step " + str(step) + ": " + str(add) + " - num: " + str(num) +
              " - pos: " + str(positive))
        metrics.send_metric("pos_walk", positive)
        metrics.send_metric("walk", num)
        metrics.send_metric("text", "Hi! Number is " + str(num))
        step += 1
        time.sleep(1)

def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=14, metavar='N',
                        help='number of epochs to train (default: 14)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.test_batch_size, shuffle=True, **kwargs)

    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(model, device, test_loader)
        # added a log line here; get_last_lr() returns a list, so take the
        # first parameter group's rate
        metrics.send_metric("scheduler_lr", scheduler.get_last_lr()[0])
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
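
# main() assumes Net, train(), and test() are defined alongside it, as in the
# canonical PyTorch MNIST example. A minimal placeholder Net; the layer sizes
# are assumptions, but the log-softmax head is needed to match F.nll_loss:
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28 * 28, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        return F.log_softmax(self.fc2(x), dim=1)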

import spell.metrics as metrics
import time
import argparse

# Runs for --steps seconds and sends --steps spell metrics with the key 'value'
# and a numeric value starting at --start and incrementing by --stepsize
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--start", type=float, help="Value to start at")
    parser.add_argument("--steps", type=int, help="Number of metrics to send")
    parser.add_argument("--stepsize", type=float, help="Size of step to take")
    args = parser.parse_args()

    value = args.start
    for i in range(args.steps):
        print("Sending metric {}".format(value))
        metrics.send_metric("value", value)
        value += args.stepsize
        time.sleep(1)
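
# Example invocation (the script name is hypothetical):
#   python send_values.py --start 0.0 --steps 30 --stepsize 0.5
# sends 30 metrics under the key 'value': 0.0, 0.5, 1.0, ..., 14.5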