Example #1
def val(epoch):
    since = time.time()
    ### Test ###
    val_loss, val_err = train_utils.test(model, val_loader, criterion, epoch)
    print('Val - Loss: {:.4f} | Acc: {:.4f}'.format(val_loss, 1 - val_err))
    time_elapsed = time.time() - since
    print('Total Time {:.0f}m {:.0f}s\n'.format(time_elapsed // 60,
                                                time_elapsed % 60))

    global best_loss
    if val_loss < best_loss:
        ### Checkpoint ###
        train_utils.save_weights(model, epoch, val_loss, val_err)
        best_loss = val_loss

    ### Adjust Lr ###
    train_utils.adjust_learning_rate(LR, LR_DECAY, optimizer, epoch,
                                     DECAY_EVERY_N_EPOCHS)

    return val_loss, 1 - val_err
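train_utils.adjust_learning_rate is not shown on this page; here is a minimal sketch, assuming a plain step decay that scales the base LR by LR_DECAY once every DECAY_EVERY_N_EPOCHS epochs (the decay rule is an assumption inferred from the argument names):

def adjust_learning_rate(lr, decay, optimizer, cur_epoch, every_n_epochs):
    # Step decay: lr * decay^(number of completed decay periods).
    new_lr = lr * (decay ** (cur_epoch // every_n_epochs))
    for param_group in optimizer.param_groups:
        param_group['lr'] = new_lr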
Example #2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--nClasses', type=int, default=10)  #CIFAR
    parser.add_argument('--reduction', type=float, default=1.0)  #no reduction
    parser.add_argument('--bottleneck', action='store_true')  # type=bool parses any non-empty string as True
    parser.add_argument('--growthRate', type=int, default=12)
    parser.add_argument('--modelDepth', type=int, default=40)
    parser.add_argument('--batchSize', type=int, default=64)
    parser.add_argument('--nEpochs', type=int, default=2)
    parser.add_argument('--no-cuda', action='store_true')
    parser.add_argument('--save', type=str, default=RESULTS_PATH)
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--existingWeights', type=str, default=None)
    parser.add_argument('--sessionName',
                        type=str,
                        default=train_utils.get_rand_str(5))
    parser.add_argument('--opt',
                        type=str,
                        default='sgd',
                        choices=('sgd', 'adam', 'rmsprop'))

    args = parser.parse_args()

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    setproctitle.setproctitle(args.save)  #The process name

    torch.manual_seed(args.seed)
    if args.cuda:
        print("Using CUDA")
        torch.cuda.manual_seed(args.seed)


    # if os.path.exists(args.save):
    #     shutil.rmtree(args.save)
    os.makedirs(args.save, exist_ok=True)  # needed before train.csv/test.csv are opened below

    normMean = [0.49139968, 0.48215827, 0.44653124]
    normStd = [0.24703233, 0.24348505, 0.26158768]
    normTransform = transforms.Normalize(normMean, normStd)

    trainTransform = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(), normTransform
    ])
    testTransform = transforms.Compose([transforms.ToTensor(), normTransform])

    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
    print("Kwargs: " + str(kwargs))
    trainLoader = DataLoader(dset.CIFAR10(root=CIFAR10_PATH,
                                          train=True,
                                          download=True,
                                          transform=trainTransform),
                             batch_size=args.batchSize,
                             shuffle=True,
                             **kwargs)
    testLoader = DataLoader(dset.CIFAR10(root=CIFAR10_PATH,
                                         train=False,
                                         download=True,
                                         transform=testTransform),
                            batch_size=args.batchSize,
                            shuffle=False,
                            **kwargs)

    net = DenseNet(growthRate=args.growthRate,
                   depth=args.modelDepth,
                   reduction=args.reduction,
                   bottleneck=args.bottleneck,
                   nClasses=args.nClasses)

    if args.existingWeights:
        print("Loading existing weights: %s" % args.existingWeights)
        startEpoch = train_utils.load_weights(net, args.existingWeights)
        endEpoch = startEpoch + args.nEpochs
        print('Resume training at epoch: {}'.format(startEpoch))
        if os.path.exists(os.path.join(args.save, 'train.csv')):  #assume test.csv exists
            print("Found existing train.csv")
            append_write = 'a'  # append if already exists
        else:
            print("Creating new train.csv")
            append_write = 'w'  # make a new file if not
        trainF = open(os.path.join(args.save, 'train.csv'), append_write)
        testF = open(os.path.join(args.save, 'test.csv'), append_write)
    else:
        print("Training new model from scratch")
        startEpoch = 1
        endEpoch = args.nEpochs
        trainF = open(os.path.join(args.save, 'train.csv'), 'w')
        testF = open(os.path.join(args.save, 'test.csv'), 'w')

    print('  + Number of params: {}'.format(
        sum([p.data.nelement() for p in net.parameters()])))
    if args.cuda:
        net = net.cuda()

    if args.opt == 'sgd':
        optimizer = optim.SGD(net.parameters(),
                              lr=1e-1,
                              momentum=0.9,
                              weight_decay=1e-4)
    elif args.opt == 'adam':
        optimizer = optim.Adam(net.parameters(), weight_decay=1e-4)
    elif args.opt == 'rmsprop':
        optimizer = optim.RMSprop(net.parameters(), weight_decay=1e-4)

    print("Training....")
    for epoch in range(startEpoch, endEpoch + 1):
        since = time.time()
        train_utils.adjust_opt(args.opt, optimizer, epoch)
        train_utils.train(epoch,
                          net,
                          trainLoader,
                          optimizer,
                          trainF,
                          sessionName=args.sessionName)
        train_utils.test(epoch, net, testLoader, optimizer, testF)
        time_elapsed = time.time() - since
        print('Time {:.0f}m {:.0f}s\n'.format(time_elapsed // 60,
                                              time_elapsed % 60))
        if epoch != 1:
            os.system('./plot.py {} &'.format(args.save))

    trainF.close()
    testF.close()
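train_utils.adjust_opt is referenced above but not shown; here is a sketch for the 'sgd' branch, assuming the step schedule commonly used with this DenseNet setup (the epoch boundaries 150/225 and the rates are assumptions):

def adjust_opt(opt_alg, optimizer, epoch):
    # Sketch: step schedule for SGD only; adam/rmsprop manage their own rates.
    if opt_alg == 'sgd':
        if epoch < 150:
            lr = 1e-1
        elif epoch < 225:
            lr = 1e-2
        else:
            lr = 1e-3
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr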
Example #3
        alphas = checkpoint['alphas'].clone()
        betas = checkpoint['betas'].clone()
        biOptimizer.tau = args.tau_min
    del checkpoint

    if args.test:
        if dbsn:
            print(
                torch.cat(
                    [F.softmax(alphas, 1), betas.exp()], 1).data.cpu().numpy())
        test_loss, test_err, test_iou = train_utils.test(
            model,
            loaders['test'],
            criterion,
            alphas,
            betas,
            biOptimizer,
            ngpus,
            dbsn,
            ntimes=100,
            mcdropout=args.mcdropout,
            dir=args.dir)
        logging.info('Test - Loss: {:.4f} | Acc: {:.4f} | IOU: {:.4f}'.format(
            test_loss, 1 - test_err, test_iou))
        exit(0)

for epoch in range(start_epoch, args.epochs + 1):
    if epoch % 50 == 1 and dbsn:
        print(
            torch.cat([F.softmax(alphas, 1), betas.exp()],
                      1).data.cpu().numpy())
    writer.add_scalar("params/cov00", cov[0, 0], epoch)
    writer.add_scalar("params/cov11", cov[1, 1], epoch)
    writer.add_scalar("params/cov22", cov[2, 2], epoch)

    print(
        "Epoch {:d}\nTrain - Loss: {:.4f}, Acc: {:.4f}".format(
            epoch, trn_loss, 1 - trn_err
        )
    )
    time_elapsed = time.time() - since
    print("Train Time {:.0f}m {:.0f}s".format(time_elapsed // 60, time_elapsed % 60))

    if epoch % args.eval_freq == 0:
        ### Test ###
        val_loss, val_err, val_iou = train_utils.test(model, loaders["val"], criterion, epoch=epoch, writer=writer)
        print(
            "Val - Loss: {:.4f} | Acc: {:.4f} | IOU: {:.4f}".format(
                val_loss, 1 - val_err, val_iou
            )
        )
        writer.add_scalar("val/loss", val_loss, epoch)
        writer.add_scalar("val/error", val_err, epoch)

    time_elapsed = time.time() - since
    print("Total Time {:.0f}m {:.0f}s\n".format(time_elapsed // 60, time_elapsed % 60))

    ### Checkpoint ###
    if epoch % args.save_freq == 0:
        print("Saving model at Epoch: ", epoch)
        save_checkpoint(
            dir=args.dir,
            epoch=epoch,
            state_dict=model.state_dict(),
            optimizer=optimizer.state_dict(),
        )
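save_checkpoint itself is not reproduced here; a minimal sketch consistent with the keyword-argument call above (the file name is an assumption):

import os
import torch

def save_checkpoint(dir, epoch, **kwargs):
    # Sketch: bundle whatever was passed in (state_dict, optimizer, ...)
    # together with the epoch and write it as a single file.
    state = dict(epoch=epoch, **kwargs)
    torch.save(state, os.path.join(dir, 'checkpoint-{}.pt'.format(epoch)))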
Example #5
    ### Train ###
    if epoch == args.ft_start:
        print('Now replacing data loader with fine-tuned data loader.')
        train_loader = loaders['fine_tune']

    trn_loss, trn_err = train_utils.train(model, train_loader, optimizer,
                                          criterion)
    print('Epoch {:d}\nTrain - Loss: {:.4f}, Acc: {:.4f}'.format(
        epoch, trn_loss, 1 - trn_err))
    time_elapsed = time.time() - since
    print('Train Time {:.0f}m {:.0f}s'.format(time_elapsed // 60,
                                              time_elapsed % 60))

    if epoch % args.eval_freq == 0:
        ### Test ###
        val_loss, val_err, val_iou = train_utils.test(model, loaders['val'],
                                                      criterion)
        print('Val - Loss: {:.4f} | Acc: {:.4f} | IOU: {:.4f}'.format(
            val_loss, 1 - val_err, val_iou))

    time_elapsed = time.time() - since
    print('Total Time {:.0f}m {:.0f}s\n'.format(time_elapsed // 60,
                                                time_elapsed % 60))

    if args.swa and (epoch + 1) > args.swa_start and (
            epoch + 1 - args.swa_start) % args.swa_c_epochs == 0:
        print('Saving SWA model at epoch: ', epoch)
        swag_model.collect_model(model)

        if epoch % args.eval_freq == 0:
            swag_model.sample(0.0)
            bn_update(train_loader, swag_model)
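bn_update recomputes the BatchNorm running statistics for the freshly sampled SWA weights; a simplified sketch of the idea (current PyTorch ships the equivalent torch.optim.swa_utils.update_bn):

import torch

def bn_update(loader, model):
    # Sketch: reset BN running stats, then make one pass over the data in
    # train mode so the statistics are re-estimated for the averaged weights.
    # (Real implementations also adjust BN momentum for a true running mean;
    # move inputs to the model's device if training on GPU.)
    for module in model.modules():
        if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
            module.reset_running_stats()
    model.train()
    with torch.no_grad():
        for input, _ in loader:
            model(input)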
Example #7
    model = model_cfg.base(num_classes=num_classes,
                           use_aleatoric=args.loss == "aleatoric").cuda()
    checkpoint = torch.load(args.resume)
    start_epoch = checkpoint["epoch"]
    print(start_epoch)
    model.load_state_dict(checkpoint["state_dict"])

print(len(loaders["test"]))
if args.use_test:
    print("Using test dataset")
    test_loader = "test"
else:
    test_loader = "val"
loss, err, mIOU, model_output_targets = test(
    model,
    loaders[test_loader],
    criterion,
    return_outputs=True,
    return_scale=args.loss == "aleatoric",
)
print(loss, 1 - err, mIOU)

outputs = np.concatenate(model_output_targets["outputs"])
targets = np.concatenate(model_output_targets["targets"])

if args.loss == "aleatoric":
    scales = np.concatenate(model_output_targets["scales"])
else:
    scales = None
np.savez(args.output, preds=outputs, targets=targets, scales=scales)

    time_elapsed = time.time() - since
    print("Total Time {:.0f}m {:.0f}s\n".format(time_elapsed // 60,
                                                time_elapsed % 60))

    ### Checkpoint ###
    if epoch % args.save_freq == 0:
        print("Saving model at Epoch: ", epoch)
        save_checkpoint(
            dir=args.dir,
            epoch=epoch,
            state_dict=model.state_dict(),
            optimizer=optimizer.state_dict(),
        )

    lr = schedule(epoch, args.lr_init, args.epochs)
    adjust_learning_rate(optimizer, lr)
    writer.add_scalar("hypers/lr", lr, epoch)

### Test set ###

test_loss, test_err, test_iou = train_utils.test(model, loaders["test"],
                                                 criterion)
print("SGD Test - Loss: {:.4f} | Acc: {:.4f} | IOU: {:.4f}".format(
    test_loss, 1 - test_err, test_iou))
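The schedule and adjust_learning_rate helpers used in the loop above are not shown; here is a sketch assuming the SWA-style schedule from the SWA literature, where the rate is held, then decayed linearly, then held at a floor (the 0.5/0.9 breakpoints and the 0.01 floor are assumptions):

def schedule(epoch, lr_init, epochs):
    # Hold lr_init for the first half, decay linearly until 90% of
    # training, then hold at 1% of the initial rate.
    t = epoch / epochs
    lr_ratio = 0.01
    if t <= 0.5:
        factor = 1.0
    elif t <= 0.9:
        factor = 1.0 - (1.0 - lr_ratio) * (t - 0.5) / 0.4
    else:
        factor = lr_ratio
    return lr_init * factor

def adjust_learning_rate(optimizer, lr):
    # Apply the scheduled rate to every parameter group.
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr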
Example #9
criterion = nn.NLLLoss(weight=camvid.class_weight.cuda()).cuda()  # NLLLoss2d is deprecated; NLLLoss handles 2D targets

for epoch in range(1, N_EPOCHS + 1):
    since = time.time()

    ### Train ###
    trn_loss, trn_err = train_utils.train(model, train_loader, optimizer,
                                          criterion, epoch)
    print('Epoch {:d}\nTrain - Loss: {:.4f}, Acc: {:.4f}'.format(
        epoch, trn_loss, 1 - trn_err))
    time_elapsed = time.time() - since
    print('Train Time {:.0f}m {:.0f}s'.format(time_elapsed // 60,
                                              time_elapsed % 60))

    ### Test ###
    val_loss, val_err = train_utils.test(model, val_loader, criterion, epoch)
    print('Val - Loss: {:.4f} | Acc: {:.4f}'.format(val_loss, 1 - val_err))
    time_elapsed = time.time() - since
    print('Total Time {:.0f}m {:.0f}s\n'.format(time_elapsed // 60,
                                                time_elapsed % 60))

    ### Checkpoint ###
    train_utils.save_weights(model, epoch, val_loss, val_err)

    ### Adjust Lr ###
    train_utils.adjust_learning_rate(LR, LR_DECAY, optimizer, epoch,
                                     DECAY_EVERY_N_EPOCHS)
### Test ###
train_utils.test(model, test_loader, criterion, epoch=1)
#train_utils.view_sample_predictions(model, test_loader, n=1)
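train_utils.save_weights (used in the checkpoint step above) is not shown; a minimal sketch, assuming a flat weights directory and a metrics-based file name (both are assumptions):

import torch

WEIGHTS_PATH = 'weights/'  # hypothetical location

def save_weights(model, epoch, loss, err):
    # Sketch: store the epoch and validation metrics alongside the weights
    # so training can be resumed and checkpoints compared later.
    fname = 'weights-{}-{:.3f}-{:.3f}.pth'.format(epoch, loss, err)
    torch.save({
        'startEpoch': epoch,
        'loss': loss,
        'error': err,
        'state_dict': model.state_dict(),
    }, WEIGHTS_PATH + fname)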
Example #10
def main():
    parser = argparse.ArgumentParser(description='Shallow-CNN Training')
    parser.add_argument('--batch-size',
                        type=int,
                        default=64,
                        metavar='N',
                        help='input batch size for training (default:64)')
    parser.add_argument('--test-batch-size',
                        type=int,
                        default=1000,
                        metavar='N',
                        help='input batch size for testing (default:1000)')
    parser.add_argument('--epochs',
                        type=int,
                        default=100,
                        metavar='N',
                        help='number of epochs to train for (default:100)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.001,
                        metavar='lr',
                        help='learning rate for optimizer (default:0.001)')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--early-stopping',
                        type=int,
                        default=10,
                        metavar='N',
                        help='Patience for early stopping (default:10)')
    parser.add_argument(
        '--data-dir',
        type=str,
        default='../data',
        metavar='path/to/dir',
        help='path to directory containing data (default:../data)')
    parser.add_argument(
        '--train-size',
        type=float,
        default=0.85,
        metavar='pct',
        help='fraction of dataset to use for training (default:0.85)')
    parser.add_argument(
        '--test-size',
        type=float,
        default=0.15,
        metavar='pct',
        help='fraction of dataset to use for testing (default:0.15)')
    parser.add_argument(
        '--dropout-rate',
        type=float,
        default=0.5,
        metavar='pct',
        help='dropout rate after convolution layers (default:0.5)')
    parser.add_argument('--conv1-width',
                        type=int,
                        default=10,
                        metavar='w',
                        help='Width of 1st convolution kernel (default:10)')
    parser.add_argument(
        '--n_channels',
        type=int,
        default=30,
        metavar='N',
        help='Number of channels output by convolution layers (default:30)')
    parser.add_argument(
        '--max-pool-kernel-size',
        type=int,
        default=25,
        metavar='w',
        help='Width of max-pool kernel after convolution (default:25)')
    parser.add_argument('--max-pool-stride',
                        type=int,
                        default=5,
                        metavar='N',
                        help='stride along 2nd axis for max-pool (default:5)')
    parser.add_argument(
        '--log-interval',
        type=int,
        default=10,
        metavar='N',
        help='how many batches to wait before logging training status')
    parser.add_argument(
        '--checkpoint',
        type=str,
        default='checkpoint.pt',
        metavar='path/to/file',
        help='file to save checkpoints (default:checkpoint.pt)')

    #TODO add arg to save everything to specific folder

    # Time id used for saving files
    time_id = int(time())

    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
    torch.manual_seed(SEED)

    # Load the datasets
    print('loading datasets')
    train_set = RobotNavDataset(args.data_dir)
    submission_set = SubmissionDataset(args.data_dir)
    train_size = floor(args.train_size * len(train_set))
    test_size = len(train_set) - train_size  # ensure the split sums to the full dataset
    train_subset, test_subset = data.random_split(train_set,
                                                  (train_size, test_size))
    train_loader = torch.utils.data.DataLoader(train_subset,
                                               batch_size=args.batch_size,
                                               shuffle=True)
    # Don't think we actually need shuffle here...
    test_loader = torch.utils.data.DataLoader(test_subset,
                                              batch_size=args.test_batch_size)

    # Initialize objects
    print('creating model')
    model = ShallowCNN(n_channels=args.n_channels,
                       conv1_width=args.conv1_width,
                       max_pool_kernel_size=args.max_pool_kernel_size,
                       max_pool_stride=args.max_pool_stride,
                       dropout_rate=args.dropout_rate)
    model.double()  # TODO: look into if this is actually needed...
    early_stopper = EarlyStopping(patience=args.early_stopping,
                                  check_file=args.checkpoint)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    logfile = '{}.log'.format(time_id)
    logger = setup_logger(logfile=logfile, console_out=True)
    loss_func = F.nll_loss

    # Train the model
    print('training model')
    for epoch in range(1, args.epochs + 1):
        train(model,
              train_loader,
              optimizer,
              loss_func,
              epoch,
              log_interval=args.log_interval,
              log_func=logger.info)
        test_loss = test(model, test_loader, loss_func, log_func=logger.info)
        # Early stopper will handle saving the checkpoints
        if early_stopper(test_loss, model):
            break

    print('creating submission')
    make_submission(model, submission_set.data,
                    'submission-{}.csv'.format(time_id))
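The EarlyStopping helper is not defined on this page; here is a sketch inferred from its call sites above (the constructor takes patience and check_file, the instance is called with (loss, model), saves checkpoints on improvement, and returns True once training should stop):

import torch

class EarlyStopping:
    def __init__(self, patience=10, check_file='checkpoint.pt'):
        self.patience = patience
        self.check_file = check_file
        self.best_loss = float('inf')
        self.counter = 0

    def __call__(self, loss, model):
        if loss < self.best_loss:
            # Improvement: checkpoint the model and reset the patience counter.
            self.best_loss = loss
            self.counter = 0
            torch.save(model.state_dict(), self.check_file)
        else:
            self.counter += 1
        return self.counter >= self.patience  # True -> stop training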
Example #11
    'epoch_data': []
}

for epoch in range(1, N_EPOCHS + 1):

    ### Train ###
    trn_loss, reg_loss, trn_err = train_utils.train(
        model, train_loader, optimizer, criterion, epoch, gpu_id, True)
    print('Epoch {:d}\nTrain - Loss: {:.4f}, Reg - Loss: {:.4f}, IOU: {:.4f}'.format(
        epoch, trn_loss, reg_loss, trn_err))
    time_elapsed = time.time() - since
    print('Train Time {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    ### Test ###
    val_loss, val_err = train_utils.test(model, val_loader, criterion, gpu_id, 1,
            epoch=epoch)
    print('Val - Loss: {:.4f} | IOU: {:.4f}'.format(val_loss, val_err))
    time_elapsed = time.time() - since
    print('Total Time {:.0f}m {:.0f}s\n'.format(
        time_elapsed // 60, time_elapsed % 60))

    ### Adjust Lr ###
    train_utils.adjust_learning_rate(LR, LR_DECAY, optimizer,
                                     epoch, DECAY_EVERY_N_EPOCHS)

    training_data['epoch_data'].append(
            (trn_err, val_err, time_elapsed, collect_dropout_probabilities(model)))

with open('training_data/training_data_{}.pk'.format(model_id), 'wb') as dump_file:
    pickle.dump(training_data, dump_file)
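collect_dropout_probabilities is not shown; a sketch inferred from its use above, assuming it walks the module tree and records each dropout layer's current probability:

import torch.nn as nn

def collect_dropout_probabilities(model):
    # Sketch: one entry per dropout module, in traversal order, e.g. to
    # track learned or scheduled dropout rates across epochs.
    return [m.p for m in model.modules()
            if isinstance(m, (nn.Dropout, nn.Dropout2d))]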
Example #12
def test():
    train_utils.test(model, test_loader, criterion, epoch=1)
    train_utils.view_sample_predictions(model, test_loader, n=1)
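view_sample_predictions is not reproduced either; a sketch inferred from the call, assuming it runs one batch and returns the first n inputs with their predicted labels (the real helper may plot them instead):

import torch

def view_sample_predictions(model, loader, n=1):
    model.eval()
    inputs, targets = next(iter(loader))
    with torch.no_grad():
        preds = model(inputs).argmax(dim=1)
    # Return the first n (input, target, prediction) triples for inspection.
    return inputs[:n], targets[:n], preds[:n]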