Example #1
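The excerpt below omits its module-level setup; a minimal import block, inferred from the calls it makes, might look like this (helpers such as load_model, init_train_state, load_checkpoint, train, eval, get_summary_ims, and the module-level args, logger, util, custom_dataset, and Logger objects are assumed to be defined elsewhere in the same file):

import os
import time
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
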
def main():
    np.random.seed(0)
    torch.manual_seed(0)

    logger.info('Loading data...')
    train_loader, val_loader, classes = custom_dataset.load_data(args)

    # override autodetect if n_classes is given
    if args.n_classes > 0:
        classes = np.arange(args.n_classes)

    model = load_model(classes)

    logger.info('Loaded model; params={}'.format(util.count_parameters(model)))
    if not args.cpu:
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    else:
        device = "cpu"

    model.to(device)
    cudnn.benchmark = True
    logger.info('Running on ' + str(device))

    summary_writer = Logger(args.logdir)
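    # NOTE (assumption): Logger here is a TensorBoard summary-writer wrapper used by
    # util.logEpoch below, distinct from the plain-text `logger` used for console output.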

    # Loss and Optimizer
    n_epochs = args.epochs
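    # NOTE (assumption): with label smoothing enabled the loss is BCE-with-logits,
    # which suggests the training step builds smoothed one-/multi-hot targets for it
    # rather than class indices (that code is not shown in this excerpt).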
    if args.label_smoothing > 0:
        criterion = nn.BCEWithLogitsLoss()
    else:
        criterion = nn.CrossEntropyLoss()

    train_state = init_train_state()
    # freeze layers
    for l in args.freeze_layers:
        for p in getattr(model, l).parameters():
            p.requires_grad = False
    if args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=train_state['lr'],
                                     weight_decay=args.weight_decay)
    elif args.optimizer == 'nesterov':
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=train_state['lr'],
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay,
                                    nesterov=True)
    # this is used to warm-start
    if args.warm_start_from:
        logger.info('Warm-starting from {}'.format(args.warm_start_from))
        assert os.path.isfile(args.warm_start_from)
        train_state = load_checkpoint(args.warm_start_from, model, optimizer)
        logger.info('Params loaded.')
        # re-initialize train_state so warm-starting reuses only the loaded weights,
        # not the checkpoint's training progress
        train_state = init_train_state()

    ckptfile = str(Path(args.logdir) / args.latest_fname)
    if os.path.isfile(ckptfile):
        logger.info('Loading checkpoint: {}'.format(ckptfile))
        train_state = load_checkpoint(ckptfile, model, optimizer)
        logger.info('Params loaded.')
    else:
        logger.info('Checkpoint {} not found; ignoring.'.format(ckptfile))

    # Training / Eval loop
    epoch_time = []                 # store time per epoch
    # checkpoints store epoch + 1; when only evaluating, step back one epoch to re-run the last trained one
    if args.skip_train:
        train_state['start_epoch'] -= 1
    for epoch in range(train_state['start_epoch'], n_epochs):
        logger.info('Epoch: [%d/%d]' % (epoch + 1, n_epochs))
        start = time.time()

        if not args.skip_train:
            model.train()
            train(train_loader, device, model, criterion, optimizer, summary_writer, train_state,
                  n_classes=len(classes))
            logger.info('Time taken: %.2f sec...' % (time.time() - start))
            if epoch == 0:
                train_state['steps_epoch'] = train_state['step']
        # always eval on last epoch
        if not args.skip_eval or epoch == n_epochs - 1:
            logger.info('\n Starting evaluation...')
            model.eval()
            eval_shrec = (epoch == n_epochs - 1) and bool(args.retrieval_dir)
            metrics, inputs = eval(
                val_loader, device, model, criterion, eval_shrec)

            logger.info('\tcombined: %.2f, Acc: %.2f, mAP: %.2f, Loss: %.4f' %
                        (metrics['combined'],
                         metrics['acc_inst'],
                         metrics.get('mAP_inst', 0.),
                         metrics['loss']))

            # Log epoch to tensorboard
            # See log using: tensorboard --logdir='logs' --port=6006
            ims = get_summary_ims(inputs)
            if not args.nolog:
                util.logEpoch(summary_writer, model, epoch + 1, metrics, ims)
        else:
            metrics = None

        # Decaying Learning Rate
        if args.lr_decay_mode == 'step':
            if (epoch + 1) % args.lr_decay_freq == 0:
                train_state['lr'] *= args.lr_decay
                for param_group in optimizer.param_groups:
                    param_group['lr'] = train_state['lr']

        # Save model
        if not args.skip_train:
            logger.info('\tSaving latest model')
            util.save_checkpoint({
                'epoch': epoch + 1,
                'step': train_state['step'],
                'steps_epoch': train_state['steps_epoch'],
                'state_dict': model.state_dict(),
                'metrics': metrics,
                'optimizer': optimizer.state_dict(),
                'lr': train_state['lr'],
            },
                str(Path(args.logdir) / args.latest_fname))

        total_epoch_time = time.time() - start
        epoch_time.append(total_epoch_time)
        logger.info('Total time for this epoch: {} s'.format(total_epoch_time))

        # if last epoch, show eval results
        if epoch == n_epochs - 1:
            logger.info(
                '|model|combined|acc inst|acc cls|mAP inst|mAP cls|loss|')
            logger.info('|{}|{:.2f}|{:.2f}|{:.2f}|{:.2f}|{:.2f}|{:.4f}|'
                        .format(os.path.basename(args.logdir),
                                metrics['combined'],
                                metrics['acc_inst'],
                                metrics['acc_cls'],
                                metrics.get('mAP_inst', 0.),
                                metrics.get('mAP_cls', 0.),
                                metrics['loss']))

        if args.skip_train:
            # if evaluating, run it once
            break

        # NOTE: start_time is assumed to be recorded once at program start-up (not shown in this excerpt)
        if time.perf_counter() + np.max(epoch_time) > start_time + args.exit_after:
            logger.info('Next epoch will likely exceed the allotted time; exiting...')
            break
    # print('\nEvaluation:')
    # print('\tTrain Acc: %.2f - Loss: %.4f' % (avg_train_acc.item(), avg_loss_train.item()))

    avg_val_acc, avg_loss_val = eval(eval_val_loader)

    print('\nEvaluation:')
    print('\tVal Acc: %.2f - Loss: %.4f' % (avg_val_acc, avg_loss_val))
    # print('\tVal Acc: %.2f - Loss: %.4f' % (avg_val_acc.item(), avg_loss_val.item()))

    print('\tCurrent best val acc: %.2f' % best_acc)

    # Log epoch to tensorboard
    # See log using: tensorboard --logdir='logs' --port=6006
    #util.logEpoch(logger, model, epoch + 1, avg_loss_val, avg_val_acc,avg_loss_train,avg_train_acc)
    util.logEpoch(logger, model, epoch + 1, avg_loss_val, avg_val_acc)

    # Save model
    if avg_val_acc > best_acc:
        print('\tSaving checkpoint - Acc: %.2f' % avg_val_acc)
        best_acc = avg_val_acc
        best_loss = avg_loss_val
        util.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'acc': avg_val_acc,
                'best_acc': best_acc,
                'optimizer': optimizer.state_dict(),
            }, args.model, START, str(args.depth))
    resnet.train()
    train()
    print('Time taken: %.2f sec.' % (time.time() - start))

    resnet.eval()
    avg_test_acc, avg_loss = eval(val_loader)

    print('\nEvaluation:')
    print('\tVal Acc: %.2f - Loss: %.4f' %
          (avg_test_acc.item(), avg_loss.item()))
    print('\tCurrent best val acc: %.2f' % best_acc)

    # Log epoch to tensorboard
    # See log using: tensorboard --logdir='logs' --port=6006
    util.logEpoch(logger, resnet, epoch + 1, avg_loss, avg_test_acc)

    # Save model
    if avg_test_acc > best_acc:
        print('\tSaving checkpoint - Acc: %.2f' % avg_test_acc)
        best_acc = avg_test_acc
        best_loss = avg_loss
        util.save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': resnet.state_dict(),
            'acc': avg_test_acc,
            'best_acc': best_acc,
            'optimizer': optimizer.state_dict(),
        })

    # Decaying Learning Rate
Example #4
    model.train()
    train()
    print('Time taken: %.2f sec.' % (time.time() - start))

    model.eval()
    avg_test_acc, avg_loss = eval(val_loader)

    print('\nEvaluation:')
    print('\tVal Acc: %.2f - Loss: %.4f' %
          (avg_test_acc.item(), avg_loss.item()))
    print('\tCurrent best val acc: %.2f' % best_acc)

    # Log epoch to tensorboard
    # See log using: tensorboard --logdir='logs' --port=6006
    util.logEpoch(logger, model, epoch + 1, avg_loss, avg_test_acc)

    # Save model
    if avg_test_acc > best_acc:
        print('\tSaving checkpoint - Acc: %.2f' % avg_test_acc)
        best_acc = avg_test_acc
        best_loss = avg_loss
        util.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'acc': avg_test_acc,
                'best_acc': best_acc,
                'optimizer': optimizer.state_dict(),
            }, args.model, args.depth)
def main():
    train_chairs = [
        'chair_0001', 'chair_0005', 'chair_0101', 'chair_0084', 'chair_0497',
        'chair_0724', 'chair_0878'
    ]
    test_chairs = ['chair_0957']
    features = []
    np.random.seed(0)
    torch.manual_seed(0)

    logger.info('Loading data...')
    train_loader, val_loader, classes = custom_dataset.load_data(args)

    # override autodetect if n_classes is given
    if args.n_classes > 0:
        classes = np.arange(args.n_classes)

    model = load_model(classes)

    logger.info('Loaded model; params={}'.format(util.count_parameters(model)))
    if not args.cpu:
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    else:
        device = "cpu"

    model.to(device)
    cudnn.benchmark = True
    logger.info('Running on ' + str(device))

    summary_writer = Logger(args.logdir)

    # Loss and Optimizer
    n_epochs = args.epochs
    if args.label_smoothing > 0:
        criterion = nn.BCEWithLogitsLoss()
    else:
        criterion = nn.CrossEntropyLoss()

    train_state = init_train_state()
    # freeze layers
    for l in args.freeze_layers:
        for p in getattr(model, l).parameters():
            p.requires_grad = False
    if args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=train_state['lr'],
                                     weight_decay=args.weight_decay)
    elif args.optimizer == 'nesterov':
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=train_state['lr'],
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay,
                                    nesterov=True)
    # this is used to warm-start
    if args.warm_start_from:
        logger.info('Warm-starting from {}'.format(args.warm_start_from))
        assert os.path.isfile(args.warm_start_from)
        train_state = load_checkpoint(args.warm_start_from, model, optimizer)
        logger.info('Params loaded.')
        # re-initialize train_state so warm-starting reuses only the loaded weights,
        # not the checkpoint's training progress
        train_state = init_train_state()

    ckptfile = str(Path(args.logdir) / args.latest_fname)
    if os.path.isfile(ckptfile):
        logger.info('Loading checkpoint: {}'.format(ckptfile))
        train_state = load_checkpoint(ckptfile, model, optimizer)
        logger.info('Params loaded.')
    else:
        logger.info('Checkpoint {} not found; ignoring.'.format(ckptfile))

    # Training / Eval loop
    epoch_time = []  # store time per epoch
    # checkpoints store epoch + 1; when only evaluating, step back one epoch to re-run the last trained one

    if args.skip_train:
        train_state['start_epoch'] -= 1
    for epoch in range(0, n_epochs):

        logger.info('Epoch: [%d/%d]' % (epoch + 1, n_epochs))
        start = time.time()

        if not args.skip_train:
            model.train()

            if epoch == n_epochs - 1:
                features = train(train_loader,
                                 device,
                                 model,
                                 criterion,
                                 optimizer,
                                 summary_writer,
                                 train_state,
                                 1,
                                 train_chairs,
                                 n_classes=len(classes))

                PIK = "descriptors.dat"
                with open(PIK, "wb") as f:
                    # dump the descriptors returned from the final training epoch
                    pickle.dump(features, f)

            else:

                train(train_loader,
                      device,
                      model,
                      criterion,
                      optimizer,
                      summary_writer,
                      train_state,
                      0,
                      train_chairs,
                      n_classes=len(classes))

            logger.info('Time taken: %.2f sec...' % (time.time() - start))
            if epoch == 0:
                train_state['steps_epoch'] = train_state['step']
        # always eval on last epoch
        if not args.skip_eval or epoch == n_epochs - 1:
            #print("-------------SAVING MODEL----------------");
            #torch.save(model,"saved.pth")
            logger.info('\n Starting evaluation...')
            model.eval()
            eval_shrec = (epoch == n_epochs - 1) and bool(args.retrieval_dir)
            metrics, inputs = eval(val_loader, device, model, criterion,
                                   eval_shrec, 0, test_chairs, features)

            logger.info('\tcombined: %.2f, Acc: %.2f, mAP: %.2f, Loss: %.4f' %
                        (metrics['combined'], metrics['acc_inst'],
                         metrics.get('mAP_inst', 0.), metrics['loss']))

            # Log epoch to tensorboard
            # See log using: tensorboard --logdir='logs' --port=6006
            ims = get_summary_ims(inputs)
            if not args.nolog:
                util.logEpoch(summary_writer, model, epoch + 1, metrics, ims)
        else:
            metrics = None

        # Decaying Learning Rate
        if args.lr_decay_mode == 'step':
            if (epoch + 1) % args.lr_decay_freq == 0:
                train_state['lr'] *= args.lr_decay
                for param_group in optimizer.param_groups:
                    param_group['lr'] = train_state['lr']

        # Save model
        if not args.skip_train:
            logger.info('\tSaving latest model')
            util.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'step': train_state['step'],
                    'steps_epoch': train_state['steps_epoch'],
                    'state_dict': model.state_dict(),
                    'metrics': metrics,
                    'optimizer': optimizer.state_dict(),
                    'lr': train_state['lr'],
                }, str(Path(args.logdir) / args.latest_fname))

        total_epoch_time = time.time() - start
        epoch_time.append(total_epoch_time)
        logger.info('Total time for this epoch: {} s'.format(total_epoch_time))

        if args.skip_train:
            # if evaluating, run it once
            break

        if time.perf_counter() + np.max(
                epoch_time) > start_time + args.exit_after:
            logger.info(
                'Next epoch will likely exceed the allotted time; exiting...')
            break

    print("Encoder training done")
    print("Now training the Decoder")

    ############################### Decoder ###############################

    decoder = models.Decoder()
    print(decoder)
    decoder.to(device)

    train_state = init_train_state()

    crit = nn.MSELoss()

    optim = torch.optim.SGD(decoder.parameters(),
                            lr=train_state['lr'],
                            momentum=args.momentum,
                            weight_decay=args.weight_decay,
                            nesterov=True)

    path = "/home/smjadhav/Research/emvn/decoder_model/latest.pth.tar"
    if os.path.isfile(path):
        logger.info('Loading decoder checkpoint: {}'.format(path))
        # restore the decoder together with its own optimizer (optim), not the encoder's
        train_state = load_checkpoint(path, decoder, optim)
        logger.info('Params loaded.')
    else:
        print("Decoder model not found")

    train_size = len(train_loader)
    metrics = {}

    for epoch in range(0, 50):
        print("Epoch ", epoch + 1)
        decoder.train()

        PIK = "D1.dat"

        with open(PIK, "rb") as f:
            try:
                i = 0

                while True:

                    data = pickle.load(f)

                    inputs = torch.from_numpy(data[1]).to(device)
                    target_img = torch.from_numpy(data[0]).to(device)
                    outputs = decoder(inputs)

                    optim.zero_grad()
                    loss = crit(outputs, target_img)
                    loss.backward()
                    optim.step()

                    if args.lr_decay_mode == 'cos':
                        # estimate steps_epoch from first epoch (we may have dropped entries)
                        steps_epoch = (train_state['steps_epoch']
                                       if train_state['steps_epoch'] > 0 else
                                       len(train_loader))
                        # TODO: there will be a jump here if many entries are dropped
                        #       and we only figure out # of steps after first epoch

                        if train_state['step'] < steps_epoch:
                            train_state['lr'] = args.lr * train_state[
                                'step'] / steps_epoch
                        else:
                            nsteps = steps_epoch * args.epochs
                            train_state['lr'] = (0.5 * args.lr * (1 + np.cos(
                                train_state['step'] * np.pi / nsteps)))
                        for param_group in optim.param_groups:
                            param_group['lr'] = train_state['lr']

                    if (i + 1) % args.print_freq == 0:
                        print("\tIter [%d/%d] Loss: %.4f" %
                              (i + 1, train_size, loss.item()))

                    if args.max_steps > 0 and i > args.max_steps:
                        break
                    i += 1

            except EOFError:
                # reached the end of the pickled stream; stop reading
                pass

        if (epoch + 1) % 5 == 0:
            print("Saving Decoder model")
            util.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'step': train_state['step'],
                    'steps_epoch': train_state['steps_epoch'],
                    'state_dict': decoder.state_dict(),
                    'metrics': metrics,
                    # save the decoder's own optimizer state, not the encoder's
                    'optimizer': optim.state_dict(),
                    'lr': train_state['lr'],
                }, path)
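            # also dump the most recent decoder outputs (the last batch processed
            # above) so they can be inspected offline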
            PIK = "images.dat"
            with open(PIK, "wb") as f:
                pickle.dump(outputs, f)