def test(model, test_loader, cuda):
    """Evaluate ``model`` over ``test_loader``, save weights, and report metrics.

    Args:
        model: torch module to evaluate (switched to eval mode).
        test_loader: DataLoader yielding ``(data, target)`` batches.
        cuda: if truthy, move each batch to the GPU.

    Side effects: logs the summary line, saves ``model.state_dict()`` to
    ``<outputs>/model.dat``, and calls ``send_metrics``.
    """
    model.eval()
    test_loss = 0.0
    correct = 0
    # torch.no_grad() replaces the removed Variable(..., volatile=True) idiom.
    with torch.no_grad():
        for data, target in test_loader:
            if cuda:
                data, target = data.cuda(), target.cuda()
            output = model(data)
            # reduction='sum' replaces the deprecated size_average=False;
            # .item() replaces .data[0], which fails on 0-dim tensors.
            test_loss += F.nll_loss(
                output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(
                dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()
    test_loss /= len(test_loader.dataset)
    accuracy = correct / len(test_loader.dataset)
    logging.info(
        'Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
            test_loss, correct, len(test_loader.dataset), 100. * accuracy))
    output_path = get_outputs_path()
    model_path = os.path.join(output_path, "model.dat")
    torch.save(model.state_dict(), model_path)
    # test_loss and accuracy are plain floats now, so no .item() is needed
    # (the original called .item() on floats, which would raise AttributeError).
    send_metrics(loss=test_loss, accuracy=accuracy)
def main(argv=None):
    """Run the cartpole client and report its score as a metric.

    Args:
        argv: command-line arguments; defaults to a fresh copy of
            ``sys.argv[1:]`` when ``None``.
    """
    # BUG FIX: the original default was ``argv=sys.argv[1:]``, evaluated once
    # at definition time and then mutated by extend() — repeated calls kept
    # appending '-f' pairs to the same shared list. Copy to avoid mutating
    # the caller's list as well.
    argv = list(sys.argv[1:] if argv is None else argv)
    argv.extend(['-f', get_outputs_path()])
    cartpole_client.main(argv)
    send_metrics(score=cartpole_client.RESULTS[0]['score'])
def report_progress(epoch, best, losses, scores):
    """Print one tab-separated row of textcat progress and forward the metrics.

    Args:
        epoch: current epoch number.
        best: best accuracy seen so far.
        losses: dict holding the 'textcat' loss.
        scores: dict holding 'textcat_p', 'textcat_r', 'textcat_f'.
    """
    loss = losses["textcat"]
    precision = scores["textcat_p"]
    recall = scores["textcat_r"]
    f_score = scores["textcat_f"]
    # print a simple table
    print("{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(
        loss, precision, recall, f_score))
    send_metrics(
        epoch=epoch,
        best_acc=best,
        loss=loss,
        P=precision,
        R=recall,
        F=f_score,
    )
def distributed_train(model, train_set, epoch, optimizer, rank,
                      num_batches, log_interval):
    """Distributed Synchronous SGD Example.

    Trains ``model`` for one epoch, averaging gradients across ranks via
    ``average_gradients`` before each optimizer step.

    Args:
        model: torch module being trained.
        train_set: iterable of ``(data, target)`` batches (DataLoader-like,
            with a ``.dataset`` attribute used for logging).
        epoch: current epoch number (logging only).
        optimizer: torch optimizer for ``model``.
        rank: this process's rank (logging only).
        num_batches: divisor used for the reported mean epoch loss.
        log_interval: log every ``log_interval`` batches.
    """
    epoch_loss = 0.0
    # Materialize once so len() works for the progress percentage below.
    train_dataset = [d for d in train_set]
    for batch_idx, (data, target) in enumerate(train_dataset):
        # NOTE: Variable() wrapping removed — it is a deprecated no-op on
        # modern torch where tensors carry autograd state directly.
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        # .item() replaces the removed .data[0] accessor.
        epoch_loss += loss.item()
        loss.backward()
        average_gradients(model)  # synchronous all-reduce of gradients
        optimizer.step()
        if batch_idx % log_interval == 0:
            logging.info(
                'Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_set.dataset),
                    100. * batch_idx / len(train_dataset), loss.item()))
    logging.info('Rank {}, epoch: {}, loss: {}'.format(
        rank, epoch, epoch_loss / num_batches))
    send_metrics(loss=epoch_loss / num_batches)
# Main training loop: train, evaluate, report, and checkpoint on improvement.
for epoch in range(start_epoch, args.epochs):
    train_model(model=model,
                optimizer=optimizer,
                train_loader=train_loader,
                train_dataset=train_dataset,
                loss_fn=loss_fn,
                num_epochs=args.epochs,
                epoch=epoch,
                batch_size=args.batch_size,
                notify=args.notify,
                cuda=cuda)
    accuracy = eval_model(model=model, test_loader=test_loader, cuda=cuda)
    accuracy = 100. * accuracy / len(test_loader.dataset)
    logging.info('Test Accuracy: {:.2f}%'.format(accuracy))
    send_metrics(accuracy=accuracy)

    # Save checkpoint logic.
    # BUG FIX: the original assigned best_accuracy = max(accuracy, best_accuracy)
    # *before* testing `accuracy > best_accuracy`, so the test was always False
    # and no checkpoint was ever written. Compare first, then update.
    accuracy = torch.FloatTensor([accuracy])
    if bool(accuracy.numpy() > best_accuracy.numpy()):
        best_accuracy = accuracy
        logging.info('Saving new state for epoch {}'.format(epoch))
        state = {
            'epoch': epoch + 1,
            'state': model.state_dict(),
            'accuracy': best_accuracy
        }
        torch.save(state, get_weight_filename())
    else:
        logging.info('State did not change for epoch {}'.format(epoch))
# ResNet20 | 3 (2)| 92.16 | 91.25 | ----- | ----- | 35 (---) # ResNet32 | 5(NA)| 92.46 | 92.49 | NA | NA | 50 ( NA) # ResNet44 | 7(NA)| 92.50 | 92.83 | NA | NA | 70 ( NA) # ResNet56 | 9 (6)| 92.71 | 93.03 | 93.01 | NA | 90 (100) # ResNet110 |18(12)| 92.65 | 93.39+-.16| 93.15 | 93.63 | 165(180) # ResNet164 |27(18)| ----- | 94.07 | ----- | 94.54 | ---(---) # ResNet1001| (111)| ----- | 92.39 | ----- | 95.08+-.14| ---(---) # --------------------------------------------------------------------------- # Model name, depth and version depth = get_depth(version=args.version, model_depth_param=args.model_depth_param) model_type = 'ResNet%dv%d' % (depth, args.version) # Subtracting pixel mean improves accuracy subtract_pixel_mean = True # Data train_data, test_data, input_shape = get_data(subtract_pixel_mean, args.num_classes) # Score trained model. model = get_model(version=args.version, input_shape=input_shape, depth=depth) train(model, model_type, train_data['x'], train_data['y'], test_data['x'], test_data['y'], args.data_augmentation, args.batch_size, args.epochs) scores = model.evaluate(test_data['x'], test_data['y'], verbose=1) print('Test loss:', scores[0]) print('Test accuracy:', scores[1]) send_metrics(loss=scores[0], accuracy=scores[1])