Example #1

logging.info("Training starts now!")
optimizer, scheduler = get_optim(conf.lr * conf.num_gpus * conf.batch_size)
for epoch in range(start_epoch + 1, start_epoch + 1 + conf.num_epochs):
    rez = train_epoch(epoch)
    #print("overall{:2d}: ({:.3f})\n{}".format(epoch, rez.mean(1)['total'], rez.mean(1)), flush=True)
    logging.info("overall{:2d}: ({:.3f})\n{}".format(epoch,
                                                     rez.mean(1)['total'],
                                                     rez.mean(1)))

    if conf.save_dir is not None:
        torch.save(
            {
                'epoch': epoch,
                'state_dict': detector.state_dict(),
                # {k:v for k,v in detector.state_dict().items() if not k.startswith('detector.')},
                # 'optimizer': optimizer.state_dict(),
            },
            os.path.join(conf.save_dir, '{}-{}.tar'.format('vgrel', epoch)))

    mAp = val_epoch()
    scheduler.step(mAp)
    if any(pg['lr'] <= (conf.lr * conf.num_gpus * conf.batch_size) / 99.0
           for pg in optimizer.param_groups):
        #print("exiting training early", flush=True)
        logging.info("exiting training early")
        break
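
get_optim is defined elsewhere in this snippet's source file. A minimal
sketch consistent with how it is used here, assuming plain SGD plus
PyTorch's ReduceLROnPlateau (the real helper may differ; also note this
variant closes over a module-level detector, whereas Example #2 passes
the model in explicitly):

import torch

def get_optim(detector, lr):
    # Only optimize parameters that were not frozen.
    params = [p for p in detector.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=lr, momentum=0.9,
                                weight_decay=1e-4)
    # mode='max' because scheduler.step() is fed a validation mAp,
    # where higher is better.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='max', factor=0.1, patience=3)
    return optimizer, scheduler

With factor=0.1, two plateau-triggered reductions push the learning rate
below base_lr / 99, which is exactly the condition the early-exit test at
the bottom of the training loop checks for.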
Example #2
def main():
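    # Two flat CSV logs: per-iteration training losses/recalls and
    # per-epoch validation recalls. R@20/50/100 are presumably the
    # standard scene-graph recall metrics, with the *_con columns their
    # graph-constrained counterparts.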
    fname = os.path.join(conf.save_dir, 'train_losses.csv')
    train_f = open(fname, 'w')
    train_f.write(
        'iter,class_loss,rel_loss,total,recall20,recall50,recall100,recall20_con,recall50_con,recall100_con\n'
    )
    train_f.flush()

    fname = os.path.join(conf.save_dir, 'val_losses.csv')
    val_f = open(fname, 'w')
    val_f.write(
        'recall20,recall50,recall100,recall20_con,recall50_con,recall100_con\n'
    )
    val_f.flush()

    train, val, _ = VG.splits(num_val_im=conf.val_size,
                              filter_duplicate_rels=True,
                              use_proposals=conf.use_proposals,
                              filter_non_overlap=conf.mode == 'sgdet')
    train_loader, val_loader = VGDataLoader.splits(
        train,
        val,
        mode='rel',
        batch_size=conf.batch_size,
        num_workers=conf.num_workers,
        num_gpus=conf.num_gpus)
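    # VG / VGDataLoader are Visual Genome wrappers from this codebase.
    # Note that filter_non_overlap is only switched on in 'sgdet' mode,
    # where boxes are detected rather than given.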

    detector = RelModel(
        classes=train.ind_to_classes,
        rel_classes=train.ind_to_predicates,
        num_gpus=conf.num_gpus,
        mode=conf.mode,
        require_overlap_det=True,
        use_resnet=conf.use_resnet,
        order=conf.order,
        nl_edge=conf.nl_edge,
        nl_obj=conf.nl_obj,
        hidden_dim=conf.hidden_dim,
        use_proposals=conf.use_proposals,
        pass_in_obj_feats_to_decoder=conf.pass_in_obj_feats_to_decoder,
        pass_in_obj_feats_to_edge=conf.pass_in_obj_feats_to_edge,
        pooling_dim=conf.pooling_dim,
        rec_dropout=conf.rec_dropout,
        use_bias=conf.use_bias,
        use_tanh=conf.use_tanh,
        limit_vision=conf.limit_vision,
        lml_topk=conf.lml_topk,
        lml_softmax=conf.lml_softmax,
        entr_topk=conf.entr_topk,
        ml_loss=conf.ml_loss)

    # Freeze the detector
    for n, param in detector.detector.named_parameters():
        param.requires_grad = False
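    # get_optim presumably filters on requires_grad, so the frozen
    # object detector never reaches the optimizer; only the relation
    # head above it is trained.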

    print(print_para(detector), flush=True)

    ckpt = torch.load(conf.ckpt)
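    # The test below asks whether the checkpoint file is named
    # 'vgrel-<epoch>.tar', i.e. a full relation-model checkpoint to be
    # resumed wholesale, as opposed to a detector-only checkpoint.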
    if conf.ckpt.split('-')[-2].split('/')[-1] == 'vgrel':
        print("Loading EVERYTHING")
        start_epoch = ckpt['epoch']

        if not optimistic_restore(detector, ckpt['state_dict']):
            start_epoch = -1
            # optimistic_restore(
            #     detector.detector,
            #     torch.load('checkpoints/vgdet/vg-28.tar')['state_dict']
            # )
    else:
        start_epoch = -1
        optimistic_restore(detector.detector, ckpt['state_dict'])

        detector.roi_fmap[1][0].weight.data.copy_(
            ckpt['state_dict']['roi_fmap.0.weight'])
        detector.roi_fmap[1][3].weight.data.copy_(
            ckpt['state_dict']['roi_fmap.3.weight'])
        detector.roi_fmap[1][0].bias.data.copy_(
            ckpt['state_dict']['roi_fmap.0.bias'])
        detector.roi_fmap[1][3].bias.data.copy_(
            ckpt['state_dict']['roi_fmap.3.bias'])

        detector.roi_fmap_obj[0].weight.data.copy_(
            ckpt['state_dict']['roi_fmap.0.weight'])
        detector.roi_fmap_obj[3].weight.data.copy_(
            ckpt['state_dict']['roi_fmap.3.weight'])
        detector.roi_fmap_obj[0].bias.data.copy_(
            ckpt['state_dict']['roi_fmap.0.bias'])
        detector.roi_fmap_obj[3].bias.data.copy_(
            ckpt['state_dict']['roi_fmap.3.bias'])
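
        # The detector checkpoint carries a single set of fully-connected
        # ROI weights (roi_fmap.{0,3}); they warm-start both of the
        # relation model's ROI heads, roi_fmap (presumably the union-box
        # head) and roi_fmap_obj (the per-object head).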

    detector.cuda()

    print("Training starts now!")
    optimizer, scheduler = get_optim(detector,
                                     conf.lr * conf.num_gpus * conf.batch_size)
    best_eval = None
    for epoch in range(start_epoch + 1, start_epoch + 1 + conf.num_epochs):
        rez = train_epoch(epoch, detector, train, train_loader, optimizer,
                          conf, train_f)
        print("overall{:2d}: ({:.3f})\n{}".format(epoch,
                                                  rez.mean(1)['total'],
                                                  rez.mean(1)),
              flush=True)

        mAp = val_epoch(detector, val, val_loader, val_f)
        scheduler.step(mAp)

        if conf.save_dir is not None:
            if best_eval is None or mAp > best_eval:
                torch.save(
                    {
                        'epoch': epoch,
                        'state_dict': detector.state_dict(),
                        # 'optimizer': optimizer.state_dict(),
                    },
                    os.path.join(conf.save_dir, 'best-val.tar'))
                best_eval = mAp

        # ... (from a validation helper; the beginning of this function
        # was dropped by the page extraction) ...
        pred_entry = {
            'pred_rel_inds': rels_i,
            'obj_scores': obj_scores_i,
            'rel_scores': pred_scores_i,  # hack for now.
        }

        evaluator[conf.mode].evaluate_scene_graph_entry(
            gt_entry,
            pred_entry,
        )
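
optimistic_restore is another helper from this codebase. Judging from the
call sites above (it takes a module plus a checkpoint state dict, tolerates
detector-only checkpoints, and returns a truthy value only when the restore
was clean), a plausible sketch is:

import torch

def optimistic_restore(network, state_dict):
    # Sketch only: copy every parameter whose name and shape match,
    # skip the rest, and report whether everything matched. The real
    # helper may differ (e.g. it might log the mismatching keys).
    own_state = network.state_dict()
    clean = True
    for name, param in state_dict.items():
        if name not in own_state or own_state[name].shape != param.shape:
            clean = False
            continue
        own_state[name].copy_(param)
    return clean

This matches how main() uses it: a full vgrel checkpoint restores cleanly
and keeps its epoch counter, while anything else resets start_epoch to -1.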


print("Training starts now!")
optimizer, scheduler = get_optim(conf.lr * conf.num_gpus * conf.batch_size)
for epoch in range(start_epoch + 1, start_epoch + 1 + conf.num_epochs):
    rez = train_epoch(epoch)
    print("overall{:2d}: ({:.3f})\n{}".format(
        epoch, rez.mean(1)['total'], rez.mean(1)), flush=True)
    if conf.save_dir is not None:
        torch.save({
            'epoch': epoch,
            # {k:v for k,v in detector.state_dict().items() if not k.startswith('detector.')},
            'state_dict': detector.state_dict(),
            # 'optimizer': optimizer.state_dict(),
        }, os.path.join(conf.save_dir, '{}-{}.tar'.format('vgrel', epoch)))

    mAp = val_epoch()
    scheduler.step(mAp)
    if any(pg['lr'] <= (conf.lr * conf.num_gpus * conf.batch_size) / 99.0
           for pg in optimizer.param_groups):
        print("exiting training early", flush=True)
        break
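
The early-exit test in each variant reads the decayed learning rate
directly off optimizer.param_groups. A tiny self-contained demonstration
of the mechanics (the Linear stand-in model, base_lr value, and patience=0
are illustrative only, not from the snippets):

import torch

base_lr = 1e-3
model = torch.nn.Linear(4, 2)  # stand-in model
optimizer = torch.optim.SGD(model.parameters(), lr=base_lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=0.1, patience=0)

for epoch in range(5):
    scheduler.step(0.0)  # pretend the validation mAp never improves
    lrs = [pg['lr'] for pg in optimizer.param_groups]
    print(epoch, lrs)
    if any(lr <= base_lr / 99.0 for lr in lrs):
        print("exiting training early")  # fires after the second reduction
        break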