def main():
    input_args = train_input()
    print_model(input_args)

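    # Run on the GPU only when it was requested and is actually available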
    device = torch.device("cuda:0" if torch.cuda.is_available()
                          and input_args.gpu else "cpu")

    model = create_model(input_args.arch, input_args.hidden_units)

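    # NLLLoss expects log-probabilities; only the classifier parameters are
    # passed to the optimiser, so the pretrained feature layers are not updated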
    criterion = nn.NLLLoss()
    optimizer = optim.Adam(model.classifier.parameters(),
                           lr=input_args.learning_rate)

    exp_lr_scheduler = lr_scheduler.StepLR(
        optimizer, step_size=5, gamma=0.1)

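    # Build the training/validation datasets and loaders from the data directory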
    image_datasets, dataloaders = create_dataloaders(
        input_args.data_dir)

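    # Run the training loop for the requested number of epochs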
    train(model, dataloaders, image_datasets, criterion, optimizer,
          exp_lr_scheduler, device, input_args.epochs)

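    # Move the model to the CPU before saving so the checkpoint loads anywhere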
    if input_args.save_dir:
        model.cpu()
        save_checkpoint({
            'epoch': input_args.epochs,
            'arch': input_args.arch,
            'classifier': model.classifier,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'mapping': image_datasets['train'].class_to_idx
        }, input_args.save_dir)
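
# A minimal reload sketch for the checkpoint saved above, assuming 'arch'
# names a torchvision constructor (e.g. "vgg16"); load_checkpoint is a
# hypothetical helper:
def load_checkpoint(path):
    from torchvision import models
    checkpoint = torch.load(path, map_location="cpu")
    model = getattr(models, checkpoint['arch'])(pretrained=True)
    model.classifier = checkpoint['classifier']       # reuse the saved head
    model.load_state_dict(checkpoint['state_dict'])   # restore trained weights
    model.class_to_idx = checkpoint['mapping']        # class-to-index mapping
    return model
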
Example #2
def main(args, config, writer):
    best_loss = math.inf
    best_model, best_epoch = None, None
    cuda = cnn_utils.check_cuda(config)

    # Attempt to optimise cuDNN kernel selection - see
    # https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do
    torch.backends.cudnn.benchmark = True

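    # Build the data loaders, model, loss, optimiser and LR scheduler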
    data_loaders = create_dataloaders(args, config)

    model, criterion, optimizer, lr_scheduler = setup_model(args)
    if cuda:  # GPU support
        model = model.cuda()
        # The line below is only needed if the loss function has parameters
        # criterion = criterion.cuda()

    if args.checkpoint:  # Resume from a checkpoint
        best_loss = cnn_utils.load_from_checkpoint(model, optimizer, args,
                                                   config)

    if args.pretrained:  # Direct copy weights from another model
        cnn_utils.load_weights(model, args, config, frozen=args.frozen)

    # Perform training and testing
    print("Beginning training loop")
    for epoch in range(args.start_epoch, args.start_epoch + args.nEpochs):
        epoch_loss = 0
        epoch_loss_all = train(model=model,
                               dset_loaders=data_loaders,
                               optimizer=optimizer,
                               lr_scheduler=lr_scheduler,
                               criterion=criterion,
                               epoch=epoch,
                               cuda=cuda,
                               clip=args.clip,
                               writer=writer)

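        # Average the per-batch losses returned by train() into one epoch loss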
        for val in epoch_loss_all:
            epoch_loss += val / len(epoch_loss_all)

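        # Keep a copy of the best model (lowest epoch loss) seen so far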
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            best_epoch = epoch
            best_model = copy.deepcopy(model)

        # Warm restart: once the cosine annealing cycle completes, reset the
        # learning rate and start a new cycle with twice the period
        if lr_scheduler.last_epoch == lr_scheduler.T_max:
            for group in optimizer.param_groups:
                group['lr'] = args.lr
            lr_scheduler = CosineAnnealingLR(optimizer,
                                             T_max=lr_scheduler.T_max * 2)

        # cnn_utils.log_all_layer_weights(model, writer, epoch)

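        # Save a checkpoint every epoch after the first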
        if epoch % 1 == 0 and epoch != 0:
            cnn_utils.save_checkpoint(model, epoch, optimizer, best_loss,
                                      config['PATH']['checkpoint_dir'],
                                      args.tag + "{}.pth".format(epoch))

        if args.prompt:
            if not helpers.prompt_user(CONTINUE_MESSAGE):
                print("Ending training")
                break

    print("Best loss was {:.5f} at epoch {}".format(best_loss, best_epoch))

    save = True
    if args.prompt:
        if not helpers.prompt_user(SAVE_MESSAGE):
            print("Not saving the model")
            save = False

    # Save the best model
    if save:
        cnn_utils.save_checkpoint(
            best_model, best_epoch, optimizer, best_loss,
            config['PATH']['model_dir'],
            args.tag + "_best_at{}.pth".format(best_epoch))

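    # JSON scalar export below is commented out; only the writer is closed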
    parent_dir = os.path.abspath(os.pardir)
    scalar_dir = os.path.join(parent_dir, "logs", args.tag)
    # if not os.path.isdir(scalar_dir):
    #     pathlib.Path(scalar_dir).mkdir(parents=True, exist_ok=True)
    # writer.export_scalars_to_json(
    #     os.path.join(scalar_dir, "all_scalars.json"))
    writer.close()