Example #1
def main():
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path',
                        default='/media/shenkev/data/Ubuntu/vsepp/data/data',
                        help='path to datasets')
    parser.add_argument('--data_name',
                        default='coco_precomp',
                        help='{coco,f8k,f30k,10crop}_precomp|coco|f8k|f30k')
    parser.add_argument('--vocab_path',
                        default='./vocab/',
                        help='Path to saved vocabulary pickle files.')
    parser.add_argument('--margin',
                        default=0.2,
                        type=float,
                        help='Rank loss margin.')
    parser.add_argument('--num_epochs',
                        default=30,
                        type=int,
                        help='Number of training epochs.')
    parser.add_argument('--batch_size',
                        default=128,
                        type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--word_dim',
                        default=300,
                        type=int,
                        help='Dimensionality of the word embedding.')
    parser.add_argument('--embed_size',
                        default=1024,
                        type=int,
                        help='Dimensionality of the joint embedding.')
    parser.add_argument('--grad_clip',
                        default=2.,
                        type=float,
                        help='Gradient clipping threshold.')
    parser.add_argument('--crop_size',
                        default=224,
                        type=int,
                        help='Size of an image crop as the CNN input.')
    parser.add_argument('--num_layers',
                        default=1,
                        type=int,
                        help='Number of GRU layers.')
    parser.add_argument('--learning_rate',
                        default=.0002,
                        type=float,
                        help='Initial learning rate.')
    parser.add_argument('--lr_update',
                        default=15,
                        type=int,
                        help='Number of epochs to update the learning rate.')
    parser.add_argument('--workers',
                        default=10,
                        type=int,
                        help='Number of data loader workers.')
    parser.add_argument('--log_step',
                        default=10,
                        type=int,
                        help='Number of steps to print and record the log.')
    parser.add_argument('--val_step',
                        default=500,
                        type=int,
                        help='Number of steps to run validation.')
    parser.add_argument('--logger_name',
                        default='runs/coco_vse',
                        help='Path to save the model and Tensorboard log.')
    parser.add_argument('--resume',
                        default='',
                        type=str,
                        metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--max_violation',
                        default=True,
                        action='store_true',
                        help='Use max instead of sum in the rank loss.')
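    # NOTE: combining default=True with action='store_true' makes this flag
    # effectively always on; pass default=False if it should be switchable.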
    parser.add_argument('--img_dim',
                        default=4096,
                        type=int,
                        help='Dimensionality of the image embedding.')
    parser.add_argument('--finetune',
                        action='store_true',
                        help='Fine-tune the image encoder.')
    parser.add_argument('--cnn_type',
                        default='vgg19',
                        help="""The CNN used for image encoder
                        (e.g. vgg19, resnet152)""")
    parser.add_argument('--use_restval',
                        action='store_true',
                        help='Use the restval data for training on MSCOCO.')
    parser.add_argument('--measure',
                        default='cosine',
                        help='Similarity measure used (cosine|order)')
    parser.add_argument('--use_abs',
                        action='store_true',
                        help='Take the absolute value of embedding vectors.')
    parser.add_argument('--no_imgnorm',
                        action='store_true',
                        help='Do not normalize the image embeddings.')
    opt = parser.parse_args()
    print(opt)

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    tb_logger.configure(opt.logger_name, flush_secs=5)

    # Load Vocabulary Wrapper
    vocab = pickle.load(
        open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name),
             'rb'))
    opt.vocab_size = len(vocab)

    # Load data loaders
    train_loader, val_loader = data.get_loaders(opt.data_name, vocab,
                                                opt.crop_size, opt.batch_size,
                                                opt.workers, opt)

    # Construct the model
    model = VSE(opt)

    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another
            # training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
                opt.resume, start_epoch, best_rsum))
            validate(opt, val_loader, model)
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))

    # Train the Model
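    # NOTE: this reset discards any best_rsum restored from the checkpoint
    # above, so a resumed run tracks its best score from zero.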
    best_rsum = 0
    for epoch in range(opt.num_epochs):
        adjust_learning_rate(opt, model.optimizer, epoch)

        # train for one epoch
        train(opt, train_loader, model, epoch, val_loader)

        # evaluate on validation set
        rsum = validate(opt, val_loader, model)

        # remember best R@ sum and save checkpoint
        is_best = rsum > best_rsum
        best_rsum = max(rsum, best_rsum)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': model.state_dict(),
                'best_rsum': best_rsum,
                'opt': opt,
                'Eiters': model.Eiters,
            },
            is_best,
            prefix=opt.logger_name + '/')
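A note on the shared pattern: every example calls adjust_learning_rate(opt, model.optimizer, epoch) together with an --lr_update interval. A minimal sketch of the schedule this implies, assuming the VSE++ convention of decaying the initial rate by 10x every lr_update epochs:

def adjust_learning_rate(opt, optimizer, epoch):
    # decay the initial learning rate by a factor of 10
    # every opt.lr_update epochs (a sketch of the usual VSE++ schedule)
    lr = opt.learning_rate * (0.1 ** (epoch // opt.lr_update))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr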
Example #2
def main():
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path',
                        default='/A/VSE/data/',
                        help='path to datasets')
    parser.add_argument(
        '--data_name',
        default='resnet152_precomp',
        help='{coco,f8k,f30k,10crop,irv2,resnet152}_precomp|coco|f8k|f30k')
    parser.add_argument('--vocab_path',
                        default='./vocab/',
                        help='Path to saved vocabulary pickle files.')
    parser.add_argument('--margin',
                        default=0.05,
                        type=float,
                        help='Rank loss margin.')
    parser.add_argument('--num_epochs',
                        default=30,
                        type=int,
                        help='Number of training epochs.')
    parser.add_argument('--batch_size',
                        default=128,
                        type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--word_dim',
                        default=300,
                        type=int,
                        help='Dimensionality of the word embedding.')
    parser.add_argument(
        '--embed_size',
        default=1024,
        type=int,
        help=
        'Dimensionality of the joint embedding. [NOTE: this is used only if <embed_size> differs from <gru_units>]'
    )
    parser.add_argument('--gru_units',
                        default=1024,
                        type=int,
                        help='Number of GRU neurons.')
    parser.add_argument('--grad_clip',
                        default=1.,
                        type=float,
                        help='Gradient clipping threshold.')
    parser.add_argument('--crop_size',
                        default=224,
                        type=int,
                        help='Size of an image crop as the CNN input.')
    parser.add_argument('--num_layers',
                        default=1,
                        type=int,
                        help='Number of GRU layers.')
    parser.add_argument('--learning_rate',
                        default=.001,
                        type=float,
                        help='Initial learning rate.')
    parser.add_argument('--lr_update',
                        default=15,
                        type=int,
                        help='Number of epochs to update the learning rate.')
    parser.add_argument('--workers',
                        default=10,
                        type=int,
                        help='Number of data loader workers.')
    parser.add_argument('--log_step',
                        default=10,
                        type=int,
                        help='Number of steps to print and record the log.')
    parser.add_argument('--val_step',
                        default=500,
                        type=int,
                        help='Number of steps to run validation.')
    parser.add_argument('--logger_name',
                        default='runs/runX',
                        help='Path to save the model and Tensorboard log.')
    parser.add_argument('--resume',
                        default='',
                        type=str,
                        metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--max_violation',
                        action='store_true',
                        help='Use max instead of sum in the rank loss.')
    parser.add_argument('--img_dim',
                        default=2048,
                        type=int,
                        help='Dimensionality of the image embedding.')
    parser.add_argument('--finetune',
                        action='store_true',
                        help='Fine-tune the image encoder.')
    parser.add_argument('--cnn_type',
                        default='vgg19',
                        help="""The CNN used for image encoder
                        (e.g. vgg19, resnet152)""")
    parser.add_argument('--use_restval',
                        action='store_true',
                        help='Use the restval data for training on MSCOCO.')
    parser.add_argument('--measure',
                        default='cosine',
                        help='Similarity measure used (cosine|order)')
    parser.add_argument(
        '--test_measure',
        default=None,
        help=
        'Similarity used for retrieval (None<same used for training>|cosine|order)'
    )
    parser.add_argument('--use_abs',
                        action='store_true',
                        help='Take the absolute value of embedding vectors.')
    parser.add_argument('--no_imgnorm',
                        action='store_true',
                        help='Do not normalize the image embeddings.')
    parser.add_argument('--text_encoder',
                        default='seam-e',
                        choices=text_encoders.text_encoders_alias.keys())
    parser.add_argument(
        '--att_units',
        default=300,
        type=int,
        help=
        'Number of tanh neurons. When using --att_dim=None we apply a tanh directly to the att input. '
    )
    parser.add_argument('--att_hops',
                        default=30,
                        type=int,
                        help='Number of attention hops (viewpoints).')
    parser.add_argument(
        '--att_coef',
        default=0.,
        type=float,
        help='Influence of Frobenius divergence in the loss function.')

    opt = parser.parse_args()

    if opt.test_measure is None:
        opt.test_measure = opt.measure

    print(opt)

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    tb_logger.configure(opt.logger_name, flush_secs=5)

    tokenizer, vocab_size = data.get_tokenizer(opt.vocab_path, opt.data_name)
    opt.vocab_size = vocab_size

    collate_fn = 'collate_fn'
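    # the collate function is passed by name here; presumably it is resolved
    # to the actual callable inside data.get_loaders (see the call below)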

    # Load data loaders
    train_loader, val_loader = data.get_loaders(opt.data_name, tokenizer,
                                                opt.crop_size, opt.batch_size,
                                                opt.workers, opt, collate_fn)

    # Construct the model
    model = VSE(opt)
    print(model.txt_enc)

    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another
            # training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
                opt.resume, start_epoch, best_rsum))
            validate(opt, val_loader, model)
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))

    # Train the Model
    best_rsum = 0
    for epoch in range(opt.num_epochs):
        adjust_learning_rate(opt, model.optimizer, epoch)

        # train for one epoch
        train(opt, train_loader, model, epoch, val_loader)

        # evaluate on validation set
        rsum = validate(opt, val_loader, model)

        # remember best R@ sum and save checkpoint
        is_best = rsum > best_rsum
        best_rsum = max(rsum, best_rsum)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': model.state_dict(),
                'best_rsum': best_rsum,
                'opt': opt,
                'Eiters': model.Eiters,
            },
            is_best,
            prefix=opt.logger_name + '/')
Example #3
def main():
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default='data',
                        help='path to datasets')
    parser.add_argument('--data_name', default='f30k',
                        help='{coco,f8k,f30k,10crop}_precomp|coco|f8k|f30k')
    parser.add_argument('--vocab_path', default='vocab',
                        help='Path to saved vocabulary pickle files.')
    parser.add_argument('--margin', default=0.2, type=float,
                        help='Rank loss margin.')
    parser.add_argument('--num_epochs', default=30, type=int,
                        help='Number of training epochs.')
    parser.add_argument('--batch_size', default=128, type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--word_dim', default=300, type=int,
                        help='Dimensionality of the word embedding.')
    parser.add_argument('--embed_size', default=1024, type=int,
                        help='Dimensionality of the joint embedding.')
    parser.add_argument('--grad_clip', default=2., type=float,
                        help='Gradient clipping threshold.')
    parser.add_argument('--crop_size', default=224, type=int,
                        help='Size of an image crop as the CNN input.')
    parser.add_argument('--num_layers', default=1, type=int,
                        help='Number of GRU layers.')
    parser.add_argument('--learning_rate', default=2e-4, type=float,
                        help='Initial learning rate.')
    parser.add_argument('--lr_update', default=15, type=int,
                        help='Number of epochs to update the learning rate.')
    parser.add_argument('--workers', default=10, type=int,
                        help='Number of data loader workers.')
    parser.add_argument('--log_step', default=100, type=int,
                        help='Number of steps to print and record the log.')
    parser.add_argument('--val_step', default=500, type=int,
                        help='Number of steps to run validation.')
    parser.add_argument('--logger_name', default='runs/test',
                        help='Path to save the model and Tensorboard log.')
    parser.add_argument('--resume', default='', type=str, metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--img_dim', default=2048, type=int,
                        help='Dimensionality of the image embedding.')
    parser.add_argument('--finetune', action='store_true',
                        help='Fine-tune the image encoder.')
    parser.add_argument('--use_restval', action='store_true',
                        help='Use the restval data for training on MSCOCO.')
    parser.add_argument('--reset_train', action='store_true',
                        help='Ensure the training is always done in '
                        'train mode (Not recommended).')
    parser.add_argument('--K', default=2, type=int, help='num of JSR.')
    parser.add_argument('--feature_path', default='data/joint-pretrain/flickr30k/region_feat_gvd_wo_bgd/trainval/',
                        type=str, help='path to the pre-computed image features')
    parser.add_argument('--region_bbox_file',
                        default='data/joint-pretrain/flickr30k/region_feat_gvd_wo_bgd/flickr30k_detection_vg_thresh0.2_feat_gvd_checkpoint_trainvaltest.h5',
                        type=str, help='path to the region_bbox_file(.h5)')
    opt = parser.parse_args()
    print(opt)

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    tb_logger.configure(opt.logger_name, flush_secs=5)

    # Load Vocabulary Wrapper
    vocab = pickle.load(open(os.path.join(
        opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb'))
    opt.vocab_size = len(vocab)

    # Load data loaders
    train_loader, val_loader = data.get_loaders(
        opt.data_name, vocab, opt.crop_size, opt.batch_size, opt.workers, opt)

    # Construct the model
    model = VSE(opt)
    best_rsum = 0
    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another
            # training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})"
                  .format(opt.resume, start_epoch, best_rsum))
            validate(opt, val_loader, model)
            del checkpoint  # free checkpoint memory before training
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))
    # Train the Model

    for epoch in range(opt.num_epochs):
        adjust_learning_rate(opt, model.optimizer, epoch)

        # train for one epoch
        train(opt, train_loader, model, epoch, val_loader)

        # evaluate on validation set
        rsum = validate(opt, val_loader, model)

        # remember best R@ sum and save checkpoint
        is_best = rsum > best_rsum
        best_rsum = max(rsum, best_rsum)
        save_checkpoint({
            'epoch': epoch + 1,
            'model': model.state_dict(),
            'best_rsum': best_rsum,
            'opt': opt,
            'Eiters': model.Eiters,
        }, is_best, epoch, prefix=opt.logger_name + '/')
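Example #3 passes an extra epoch argument to save_checkpoint, while the other examples omit it. A minimal sketch of a compatible helper, assuming VSE++-style file naming (a rolling checkpoint.pth.tar plus a model_best.pth.tar copy); the per-epoch copy is an assumption based on the extra argument:

import shutil
import torch

def save_checkpoint(state, is_best, epoch=None, prefix=''):
    # always write the latest state; optionally keep a per-epoch copy,
    # and mirror the best model seen so far
    filename = prefix + 'checkpoint.pth.tar'
    torch.save(state, filename)
    if epoch is not None:
        shutil.copyfile(filename,
                        prefix + 'checkpoint_{}.pth.tar'.format(epoch))
    if is_best:
        shutil.copyfile(filename, prefix + 'model_best.pth.tar')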
Example #4
def main():
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path',
                        default='/home/dcsaero01/data/datasets/vsepp/',
                        help='path to datasets')
    parser.add_argument('--data_name',
                        default='minicsdv2_precomp',
                        help='{coco,f8k,f30k,10crop}_precomp|coco|f8k|f30k')
    parser.add_argument('--vocab_path',
                        default='./vocab/',
                        help='Path to saved vocabulary pickle files.')
    parser.add_argument('--margin',
                        default=0.2,
                        type=float,
                        help='Rank loss margin.')
    parser.add_argument('--num_epochs',
                        default=300,
                        type=int,
                        help='Number of training epochs.')
    parser.add_argument('--batch_size',
                        default=128,
                        type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--word_dim',
                        default=300,
                        type=int,
                        help='Dimensionality of the word embedding.')
    parser.add_argument('--embed_size',
                        default=1024,
                        type=int,
                        help='Dimensionality of the joint embedding.')
    parser.add_argument('--grad_clip',
                        default=2.,
                        type=float,
                        help='Gradient clipping threshold.')
    parser.add_argument('--crop_size',
                        default=224,
                        type=int,
                        help='Size of an image crop as the CNN input.')
    parser.add_argument('--num_layers',
                        default=1,
                        type=int,
                        help='Number of GRU layers.')
    parser.add_argument('--learning_rate',
                        default=.0002,
                        type=float,
                        help='Initial learning rate.')
    parser.add_argument('--lr_update',
                        default=15,
                        type=int,
                        help='Number of epochs to update the learning rate.')
    parser.add_argument(
        '--dropout_value',
        default=0,
        type=float,
        help='Probability value for dropout after linear layer')
    parser.add_argument('--workers',
                        default=10,
                        type=int,
                        help='Number of data loader workers.')
    parser.add_argument('--log_step',
                        default=10,
                        type=int,
                        help='Number of steps to print and record the log.')
    parser.add_argument('--val_step',
                        default=500,
                        type=int,
                        help='Number of steps to run validation.')
    parser.add_argument('--logger_name',
                        default='runs/runX',
                        help='Path to save the model and Tensorboard log.')
    parser.add_argument(
        '--resume',
        default=
        '/home/dcsaero01/data/projects/vsepp/runs/minicsdv2/checkpoint.pth.tar',
        type=str,
        metavar='PATH',
        help='path to latest checkpoint (default: none)')
    parser.add_argument('--max_violation',
                        action='store_true',
                        help='Use max instead of sum in the rank loss.')
    parser.add_argument('--img_dim',
                        default=4096,
                        type=int,
                        help='Dimensionality of the image embedding.')
    parser.add_argument('--text_dim',
                        default=6000,
                        type=int,
                        help='Dimensionality of the text embedding.')
    parser.add_argument('--finetune',
                        action='store_true',
                        help='Fine-tune the image encoder.')
    parser.add_argument('--cnn_type',
                        default='vgg19',
                        help="""The CNN used for image encoder
                        (e.g. vgg19, resnet152)""")
    parser.add_argument('--use_restval',
                        action='store_true',
                        help='Use the restval data for training on MSCOCO.')
    parser.add_argument('--measure',
                        default='cosine',
                        help='Similarity measure used (cosine|order)')
    parser.add_argument('--use_abs',
                        action='store_true',
                        help='Take the absolute value of embedding vectors.')
    parser.add_argument('--no_imgnorm',
                        action='store_true',
                        help='Do not normalize the image embeddings.')
    parser.add_argument('--reset_train',
                        action='store_true',
                        help='Ensure the training is always done in '
                        'train mode (Not recommended).')

    opt = parser.parse_args()
    print(opt)

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    tb_logger.configure(opt.logger_name, flush_secs=5)

    # Load Vocabulary Wrapper
    if opt.data_name == 'coco_st_precomp' or opt.data_name == 'coco_st_ner_precomp':
        vocab = None
        opt.vocab_size = 0
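        # these variants presumably ship pre-encoded captions (see
        # --text_dim), so no vocabulary wrapper is needed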
    else:
        vocab = pickle.load(
            open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name),
                 'rb'))
        opt.vocab_size = len(vocab)

    # Load data loaders
    train_loader, val_loader = data.get_loaders(opt.data_name, vocab,
                                                opt.crop_size, opt.batch_size,
                                                opt.workers, opt)

    # Construct the model
    model = VSE(opt)

    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another
            # training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
                opt.resume, start_epoch, best_rsum))
            validate(opt, val_loader, model)
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))
    """
Example #5
def main():
  # Hyper Parameters ('opt' is presumably parsed at module level in this variant)

  torch.cuda.set_device(opt.gpu_id)

  tb_logger.configure(opt.logger_name, flush_secs=5)
  logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO, filename=opt.logger_name+'/log.log')
  console = logging.StreamHandler()
  console.setLevel(logging.INFO)
  formatter = logging.Formatter('%(asctime)s %(message)s')
  console.setFormatter(formatter)
  logging.getLogger('').addHandler(console)

  logging.info(opt)

  # Load Vocabulary Wrapper
  vocab_path = os.path.join(opt.vocab_path, '%s_vocab_total.pkl' % opt.data_name)
  print(vocab_path)
  vocab = pickle.load(open(vocab_path, 'rb'))
  opt.vocab_size = len(vocab)

  # Load data loaders
  train_loader, val_loader = data.get_loaders(
    opt.data_name, vocab, opt.batch_size, opt.workers, opt)

  # Construct the model
  model = VSE(opt)

  print('Print out models:')
  print(model.clip_enc)
  print(model.txt_enc)
  print(model.vid_seq_enc)
  print(model.txt_seq_enc)

  start_epoch = 0
  # optionally resume from a checkpoint
  if opt.resume:
    if os.path.isfile(opt.resume):
      print("=> loading checkpoint '{}'".format(opt.resume))
      checkpoint = torch.load(opt.resume)
      start_epoch = checkpoint['epoch']
      best_rsum = checkpoint['best_rsum']
      model.load_state_dict(checkpoint['model'], opt)
      # Eiters is used to show logs as the continuation of another
      # training
      model.Eiters = checkpoint['Eiters']
      print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})"
          .format(opt.resume, start_epoch, best_rsum))
      validate(opt, val_loader, model)
      if opt.eval_only:
        return
    else:
      print("=> no checkpoint found at '{}'".format(opt.resume))

  # Train the Model
  best_rsum = 0
  for epoch in range(start_epoch, opt.num_epochs):
    adjust_learning_rate(opt, model.optimizer, epoch)

    # train for one epoch
    train(opt, train_loader, model, epoch, val_loader)

    # evaluate on validation set
    rsum = validate(opt, val_loader, model)

    # remember best R@ sum and save checkpoint
    is_best = rsum > best_rsum
    best_rsum = max(rsum, best_rsum)
    save_checkpoint({
      'epoch': epoch + 1,
      'model': model.state_dict(opt),
      'best_rsum': best_rsum,
      'opt': opt,
      'Eiters': model.Eiters,
    }, is_best, prefix=opt.logger_name + '/', epoch=epoch)
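Each example selects its best model by the rsum that validate returns. In the VSE++ codebase this is the sum of Recall@{1,5,10} over both retrieval directions; a sketch assuming the usual encode_data, i2t and t2i helpers from the VSE++ evaluation module:

def validate(opt, val_loader, model):
    # encode the whole validation set, then score retrieval in both
    # directions; rsum is the sum of the six recall values
    img_embs, cap_embs = encode_data(model, val_loader)
    r1, r5, r10, medr, meanr = i2t(img_embs, cap_embs, measure=opt.measure)
    r1i, r5i, r10i, medri, meanri = t2i(img_embs, cap_embs,
                                        measure=opt.measure)
    return r1 + r5 + r10 + r1i + r5i + r10i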
Example #6
def main():
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path',
                        default='/data/stud/jorgebjorn/data/',
                        help='path to datasets')
    parser.add_argument('--data_name',
                        default='f8k_precomp',
                        help='{coco,f8k,f30k,10crop}_precomp|coco|f8k|f30k')
    parser.add_argument('--vocab_path',
                        default='/data/stud/jorgebjorn/data/vocab/',
                        help='Path to saved vocabulary pickle files.')
    parser.add_argument('--margin',
                        default=0.2,
                        type=float,
                        help='Rank loss margin.')
    parser.add_argument('--num_epochs',
                        default=30,
                        type=int,
                        help='Number of training epochs.')
    parser.add_argument('--batch_size',
                        default=128,
                        type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--word_dim',
                        default=300,
                        type=int,
                        help='Dimensionality of the word embedding.')
    parser.add_argument('--embed_size',
                        default=1024,
                        type=int,
                        help='Dimensionality of the joint embedding.')
    parser.add_argument('--grad_clip',
                        default=2.,
                        type=float,
                        help='Gradient clipping threshold.')
    parser.add_argument('--crop_size',
                        default=224,
                        type=int,
                        help='Size of an image crop as the CNN input.')
    parser.add_argument('--num_layers',
                        default=1,
                        type=int,
                        help='Number of GRU layers.')
    parser.add_argument('--learning_rate',
                        default=.0002,
                        type=float,
                        help='Initial learning rate.')
    parser.add_argument('--lr_update',
                        default=15,
                        type=int,
                        help='Number of epochs to update the learning rate.')
    parser.add_argument('--workers',
                        default=10,
                        type=int,
                        help='Number of data loader workers.')
    parser.add_argument('--log_step',
                        default=10,
                        type=int,
                        help='Number of steps to print and record the log.')
    parser.add_argument('--val_step',
                        default=500,
                        type=int,
                        help='Number of steps to run validation.')
    parser.add_argument(
        '--logger_name',
        default='/data/stud/jorgebjorn/runs/{}/{}'.format(
            getpass.getuser(),
            datetime.datetime.now().strftime("%d-%m-%y_%H:%M")),
        help='Path to save the model and Tensorboard log.')
    parser.add_argument('--selection',
                        default='uncertainty',
                        help='Active learning selection algorithm')
    parser.add_argument('--primary',
                        default='images',
                        help='Image- or caption-centric active learning')
    parser.add_argument('--resume',
                        default='',
                        type=str,
                        metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--max_violation',
                        action='store_true',
                        help='Use max instead of sum in the rank loss.')
    parser.add_argument('--img_dim',
                        default=4096,
                        type=int,
                        help='Dimensionality of the image embedding.')
    parser.add_argument('--finetune',
                        action='store_true',
                        help='Fine-tune the image encoder.')
    parser.add_argument('--device',
                        default=0,
                        type=int,
                        help='which gpu to use')
    parser.add_argument('--cnn_type',
                        default='vgg19',
                        help="""The CNN used for image encoder
                        (e.g. vgg19, resnet152)""")
    parser.add_argument('--use_restval',
                        action='store_true',
                        help='Use the restval data for training on MSCOCO.')
    parser.add_argument('--measure',
                        default='cosine',
                        help='Similarity measure used (cosine|order)')
    parser.add_argument('--use_abs',
                        action='store_true',
                        help='Take the absolute value of embedding vectors.')
    parser.add_argument('--no_imgnorm',
                        action='store_true',
                        help='Do not normalize the image embeddings.')
    parser.add_argument('--reset_train',
                        action='store_true',
                        help='Ensure the training is always done in '
                        'train mode (Not recommended).')
    parser.add_argument('--no_log',
                        action='store_true',
                        default=False,
                        help='Disable logging')
    opt = parser.parse_args()
    opt.logger_name += "_" + opt.selection + "_" + opt.primary
    print(opt)
    if torch.cuda.is_available():
        torch.cuda.set_device(opt.device)

    # Setup tensorboard logger
    if not opt.no_log:
        logging.basicConfig(format='%(asctime)s %(message)s',
                            level=logging.INFO)
        tb_logger.configure(opt.logger_name, flush_secs=5)

    # Load Vocabulary Wrapper
    vocab = pickle.load(
        open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name),
             'rb'))
    opt.vocab_size = len(vocab)

    # Load data loaders
    active_loader, train_loader, val_loader = data.get_loaders(
        opt.data_name, vocab, opt.crop_size, opt.batch_size, opt.workers, opt)

    # Construct the model
    model = VSE(opt)
    if torch.cuda.is_available():
        model.cuda()

    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another
            # training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
                opt.resume, start_epoch, best_rsum))
            validate(opt, val_loader, model)
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))

    n_rounds = 234

    if opt.selection == "uncertainty":
        selection = select_uncertainty
    elif opt.selection == "margin":
        selection = select_margin
    elif opt.selection == "random":
        selection = select_random
    elif opt.selection == "hybrid":
        selection = select_hybrid
    elif opt.selection == "all":
        selection = select_all
    elif opt.selection == "capsim":
        selection = select_captionSimilarity
    else:
        selection = select_uncertainty

    for r in range(n_rounds):
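        # each round moves the selected items from the unlabeled pool
        # (train_loader) into the labeled pool (active_loader), then
        # retrains a fresh model on the labeled pool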
        best_indices = selection(r, model, train_loader)

        for index in best_indices:
            active_loader.dataset.add_single(train_loader.dataset[index][0],
                                             train_loader.dataset[index][1])

        train_loader.dataset.delete_indices(best_indices)

        # Train the Model
        print("Training on {} items ".format(len(active_loader)))

        # Reset the model
        model = VSE(opt)
        if torch.cuda.is_available():
            model.cuda()

        best_rsum = 0
        for epoch in range(opt.num_epochs):
            adjust_learning_rate(opt, model.optimizer, epoch)

            # train for one epoch
            train(opt, active_loader, model, epoch, val_loader)

        # evaluate on validation set after training this round
        rsum = validate(opt, val_loader, model, not opt.no_log, r)
Example #7
def main():
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default=".", help='path to datasets')
    parser.add_argument(
        '--data_name',
        default='m30k',
        help='{coco,f8k,f30k,10crop}_precomp|coco|f8k|f30k|m30k')
    parser.add_argument(
        '--lang',
        default='en',
        help='Which language(s) to use from m30k, en-de, trains on en+de.')
    parser.add_argument('--sentencepair',
                        action='store_true',
                        help='Train caption-caption ranking as well.')
    parser.add_argument(
        '--sentencepair_p',
        default=0.5,
        type=float,
        help='Probability of training on caption-caption and not image-caption.'
    )
    parser.add_argument(
        '--primary',
        default=None,
        help=
        'Which language to monitor for early stopping. Multiple with l1-l2-l3')
    parser.add_argument(
        '--undersample',
        action='store_true',
        help='Pick only one of the 5 possible captions for m30k task 2.')
    parser.add_argument('--half',
                        action='store_true',
                        help='Use only half of the M30K from task 2.')
    parser.add_argument('--disaligned',
                        action='store_true',
                        help='Use disaligned halves of the M30K from task 2.')
    parser.add_argument('--vocab_path',
                        default='.',
                        help='Path to saved vocabulary pickle files.')
    parser.add_argument('--margin',
                        default=0.2,
                        type=float,
                        help='Rank loss margin.')
    parser.add_argument('--num_epochs',
                        default=30,
                        type=int,
                        help='Number of training epochs.')
    parser.add_argument(
        '--patience',
        default=10,
        type=int,
        help='Number of validation steps to tolerate without improvement.')
    parser.add_argument('--batch_size',
                        default=128,
                        type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--word_dim',
                        default=300,
                        type=int,
                        help='Dimensionality of the word embedding.')
    parser.add_argument('--embed_size',
                        default=1024,
                        type=int,
                        help='Dimensionality of the joint embedding.')
    parser.add_argument('--grad_clip',
                        default=2.,
                        type=float,
                        help='Gradient clipping threshold.')
    parser.add_argument('--crop_size',
                        default=224,
                        type=int,
                        help='Size of an image crop as the CNN input.')
    parser.add_argument('--num_layers',
                        default=1,
                        type=int,
                        help='Number of GRU layers.')
    parser.add_argument('--learning_rate',
                        default=.0002,
                        type=float,
                        help='Initial learning rate.')
    parser.add_argument('--lr_update',
                        default=15,
                        type=int,
                        help='Number of epochs to update the learning rate.')
    parser.add_argument('--workers',
                        default=10,
                        type=int,
                        help='Number of data loader workers.')
    parser.add_argument('--log_step',
                        default=10,
                        type=int,
                        help='Number of steps to print and record the log.')
    parser.add_argument('--val_step',
                        default=500,
                        type=int,
                        help='Number of steps to run validation.')
    parser.add_argument(
        '--logger_path',
        default='.',
        help='Path where to save the model and Tensorboard log.')
    parser.add_argument(
        '--logger_name',
        help='Name of the folder where to save the model and Tensorboard log.')
    parser.add_argument('--resume',
                        default='',
                        type=str,
                        metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--sum_violation',
                        dest="max_violation",
                        action='store_false',
                        help='Use sum instead of max in the rank loss.')
    parser.add_argument('--img_dim',
                        default=2048,
                        type=int,
                        help='Dimensionality of the image embedding.')
    parser.add_argument('--finetune',
                        action='store_true',
                        help='Fine-tune the image encoder.')
    parser.add_argument('--cnn_type',
                        default='vgg19',
                        help="""The CNN used for image encoder
                        (e.g. vgg19, resnet152)""")
    parser.add_argument('--use_restval',
                        action='store_true',
                        help='Use the restval data for training on MSCOCO.')
    parser.add_argument('--measure',
                        default='cosine',
                        help='Similarity measure used (cosine|order)')
    parser.add_argument('--use_abs',
                        action='store_true',
                        help='Take the absolute value of embedding vectors.')
    parser.add_argument('--no_imgnorm',
                        action='store_true',
                        help='Do not normalize the image embeddings.')
    parser.add_argument('--reset_train',
                        action='store_true',
                        help='Ensure the training is always done in '
                        'train mode (Not recommended).')
    parser.add_argument('--seed', default=42, type=int, help='Random seed.')
    opt = parser.parse_args()

    if torch.__version__ >= "0.3":
        opt.reset_train = True
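        # presumably a workaround for changed train/eval-mode semantics in
        # PyTorch 0.3+ (note: plain string comparison on the version number)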

    opt.vocab_path = os.path.join(opt.vocab_path, "vocab")
    if opt.logger_name is None:
        name = "lang{}_half-{}_undersample-{}_disaligned-{}_sentencepair-{}_primary-{}_epochs-{}"
        name = name.format(opt.lang, opt.half, opt.undersample, opt.disaligned,
                           opt.sentencepair, opt.primary, opt.num_epochs)
        opt.logger_name = os.path.join(opt.data_name, name)

    opt.logger_name = os.path.join(opt.logger_path, opt.logger_name,
                                   str(opt.seed))
    print(opt)
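    # rseed is presumably a module-level constant in the source file;
    # only opt.seed comes from the argument parser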
    random.seed(rseed + opt.seed)
    np.random.seed(rseed + opt.seed)
    torch.cuda.manual_seed(rseed + opt.seed)
    torch.cuda.manual_seed_all(rseed + opt.seed)

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    tb_logger.configure(opt.logger_name, flush_secs=5)

    # For multi30k compute vocabulary mappings on the fly.
    if opt.data_name == "m30k":
        vocab = None
        langs = opt.lang.split("-")
    # Load Vocabulary Wrapper for COCO or F30K
    else:
        vocab = pickle.load(
            open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name),
                 'rb'))
        opt.vocab_size = len(vocab)
        langs = [opt.data_name]
    # Load data loaders
    train_loader, val_loader = data.get_loaders(opt.data_name, vocab,
                                                opt.crop_size, opt.batch_size,
                                                opt.workers, opt)
    # Construct the model
    model = VSE(opt)
    print(model.txt_enc)
    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another
            # training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
                opt.resume, start_epoch, best_rsum))
            validate(opt, val_loader, model, "")
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))

    if len(langs) == 1 or opt.data_name != 'm30k':

        # Train the Model on a single data set
        best_rsum = 0
        patience_count = 0
        model.train_start()
        for epoch in range(opt.num_epochs):
            if opt.reset_train:
                # Always reset to train mode, this is not the default behavior
                model.train_start()
            adjust_learning_rate(opt, model.optimizer, epoch)

            # train for one epoch
            train(opt, train_loader, model, epoch, val_loader)

            # evaluate on validation set
            rsum = validate(opt, val_loader, model, langs[0])

            # remember best R@ sum and save checkpoint
            is_best = rsum > best_rsum
            best_rsum = max(rsum, best_rsum)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'model': model.state_dict(),
                    'best_rsum': best_rsum,
                    'opt': opt,
                    'Eiters': model.Eiters,
                },
                is_best,
                prefix=opt.logger_name + '/')
            if is_best:
                patience_count = 0
                print("New best: {}".format(best_rsum))
            else:
                patience_count += 1
                print("No improvement in {}".format(patience_count))
                if patience_count == opt.patience:
                    print("No improvement in {} epochs, stoppin".format(
                        patience_count))
                    break

    else:
        joint_train(opt, train_loader, model, val_loader)
Example #8
def main():
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default='/w/31/faghri/vsepp_data/',
                        help='path to datasets')
    parser.add_argument('--data_name', default='precomp',
                        help='{coco,f8k,f30k,10crop}_precomp|coco|f8k|f30k')
    parser.add_argument('--vocab_path', default='./vocab/',
                        help='Path to saved vocabulary pickle files.')
    parser.add_argument('--margin', default=0.2, type=float,
                        help='Rank loss margin.')
    parser.add_argument('--num_epochs', default=15, type=int,
                        help='Number of training epochs.')
    parser.add_argument('--batch_size', default=128, type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--word_dim', default=300, type=int,
                        help='Dimensionality of the word embedding.')
    parser.add_argument('--embed_size', default=1024, type=int,
                        help='Dimensionality of the joint embedding.')
    parser.add_argument('--grad_clip', default=2., type=float,
                        help='Gradient clipping threshold.')
    parser.add_argument('--crop_size', default=224, type=int,
                        help='Size of an image crop as the CNN input.')
    parser.add_argument('--num_layers', default=1, type=int,
                        help='Number of GRU layers.')
    parser.add_argument('--learning_rate', default=.0002, type=float,
                        help='Initial learning rate.')
    parser.add_argument('--lr_update', default=8, type=int,
                        help='Number of epochs to update the learning rate.')
    parser.add_argument('--workers', default=10, type=int,
                        help='Number of data loader workers.')
    parser.add_argument('--log_step', default=10, type=int,
                        help='Number of steps to print and record the log.')
    parser.add_argument('--val_step', default=500, type=int,
                        help='Number of steps to run validation.')
    parser.add_argument('--logger_name', default='runs/runX',
                        help='Path to save the model and Tensorboard log.')
    parser.add_argument('--resume', default='', type=str, metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--max_violation', action='store_true',
                        help='Use max instead of sum in the rank loss.')
    parser.add_argument('--sum_violation', action='store_true',
                        help='Use sum instead of max in the rank loss.')
    parser.add_argument('--img_dim', default=4096, type=int,
                        help='Dimensionality of the image embedding.')
    parser.add_argument('--finetune', action='store_true',
                        help='Fine-tune the image encoder.')
    parser.add_argument('--cnn_type', default='vgg19',
                        help="""The CNN used for image encoder
                        (e.g. vgg19, resnet152)""")
    parser.add_argument('--use_restval', action='store_true',
                        help='Use the restval data for training on MSCOCO.')
    parser.add_argument('--measure', default='cosine',
                        help='Similarity measure used (cosine|order)')
    parser.add_argument('--use_abs', action='store_true',
                        help='Take the absolute value of embedding vectors.')
    parser.add_argument('--no_imgnorm', action='store_true',
                        help='Do not normalize the image embeddings.')
    parser.add_argument('--reset_train', action='store_true',
                        help='Ensure the training is always done in '
                        'train mode (Not recommended).')
    parser.add_argument('--save_all', action='store_true',
                        help="Save model after the training of each epoch")
    parser.add_argument('--memory_bank', action='store_true',
                        help="Train model with memory bank")
    parser.add_argument('--record_val', action='store_true',
                        help="Record the rsum values on validation set in file during training")
    parser.add_argument('--local_alpha', default=30.0, type=float)
    parser.add_argument('--local_ep', default=0.3, type=float)
    parser.add_argument('--global_alpha', default=40.0, type=float)
    parser.add_argument('--global_beta', default=40.0, type=float)
    parser.add_argument('--global_ep_posi', default=0.2, type=float,
                        help="Global epsilon for positive pairs")
    parser.add_argument('--global_ep_nega', default=0.1, type=float,
                        help="Global epsilon for negative pairs")
    parser.add_argument('--mb_k', default=250, type=int,
                        help="Use top K items in memory bank")
    parser.add_argument('--mb_rate', default=0.05, type=float,
                        help="-")

    opt = parser.parse_args()
    print(opt)

    logging.basicConfig(format='%(message)s', level=logging.INFO)
    tb_logger.configure(opt.logger_name, flush_secs=5)

    # Load Vocabulary Wrapper
    vocab = pickle.load(open(os.path.join(
        opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb'))
    opt.vocab_size = len(vocab)
    print("Vocab Size: %d" % opt.vocab_size)

    # Load data loaders
    train_loader, val_loader = data.get_loaders(
        opt.data_name, vocab, opt.crop_size, opt.batch_size, opt.workers, opt)

    # Construct the model
    model = VSE(opt)

    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another
            # training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})"
                  .format(opt.resume, start_epoch, best_rsum))
            validate(opt, val_loader, model)
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))

    # Train the Model
    best_rsum = 0
    for epoch in range(opt.num_epochs):
        adjust_learning_rate(opt, model.optimizer, epoch)

        memory_bank = opt.memory_bank
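        # presumably repopulates model.mb_img / model.mb_cap from the current
        # model; both are cleared again at the end of the epoch below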
        if memory_bank and epoch > 0:
            load_memory_bank(opt, train_loader, model)
        # train for one epoch
        train(opt, train_loader, model, epoch, val_loader)

        # evaluate on validation set
        rsum = validate(opt, val_loader, model)
        print ("rsum: %.1f" % rsum)
        if opt.record_val:
            with open("rst_val_" + opt.logger_name[5:], "a") as f:
                f.write("Epoch: %d ; rsum: %.1f\n" %(epoch, rsum))

        # remember best R@ sum and save checkpoint
        is_best = rsum > best_rsum
        best_rsum = max(rsum, best_rsum)
        save_checkpoint({
            'epoch': epoch + 1,
            'model': model.state_dict(),
            'best_rsum': best_rsum,
            'opt': opt,
            'Eiters': model.Eiters,
        }, is_best, prefix=opt.logger_name + '/', save_all=opt.save_all)

        # reset memory bank
        model.mb_img = None
        model.mb_cap = None
Example #9
def main():
    # Hyperparameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_type', default='rgb',
                        help='type of input data (e.g. rgb)')
    parser.add_argument('--feature_path', default='',
                        help='path to pre-computed features')
    parser.add_argument('--anno_path', default='', help='path to annotations')
    parser.add_argument('--feature_prefix',
                        default='',
                        help='prefix of the feature files')
    parser.add_argument('--dropout',
                        default=0.5,
                        type=float,
                        help='Dropout probability.')
    parser.add_argument('--split_video_file',
                        default='',
                        help='path to the video split file')
    parser.add_argument('--num_pos_sample',
                        default=10,
                        type=int,
                        help='Number of positive samples.')
    parser.add_argument('--num_neg_sample',
                        default=10,
                        type=int,
                        help='Number of negative samples.')
    parser.add_argument('--num_epochs',
                        default=30,
                        type=int,
                        help='Number of training epochs')
    parser.add_argument('--batch_size',
                        default=128,
                        type=int,
                        help='Size of a training mini-batch')
    parser.add_argument('--embed_size',
                        default=10240,
                        type=int,
                        help='Dimensionality of the joint embedding.')
    parser.add_argument('--grad_clip',
                        default=2,
                        type=float,
                        help='Gradient clipping threshold')
    parser.add_argument('--learning_rate',
                        default=.001,
                        type=float,
                        help='Initial learning rate')
    parser.add_argument('--lr_update',
                        default=15,
                        type=int,
                        help='Number of epochs to update the learning rate')
    parser.add_argument('--workers',
                        default=10,
                        type=int,
                        help='Number of data loader workers')
    parser.add_argument('--log_step',
                        default=10,
                        type=int,
                        help='Number of steps to print and record the log')
    parser.add_argument('--val_step',
                        default=500,
                        type=int,
                        help='Number of steps to run validation')
    parser.add_argument('--logger_name',
                        default='runs/runX',
                        help='Path to save the model and Tensorboard log')
    parser.add_argument('--resume',
                        default='',
                        type=str,
                        metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--storage_place',
                        default='',
                        type=str,
                        metavar='PATH',
                        help='path to store the extracted evaluation features')
    parser.add_argument('--instance_data_path',
                        default='',
                        type=str,
                        metavar='PATH',
                        help='path to the instance-level data')

    opt = parser.parse_args()
    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    tb_logger.configure(opt.logger_name, flush_secs=5)

    # Load data loaders
    # 1
    train_loader, val_loader = data.get_loaders(opt)

    # Construct the model
    # 2
    model = VSE(opt)

    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another
            # training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
                opt.resume, start_epoch, best_rsum))
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))

    # Train the Model
    best_rsum = 0

    for epoch in range(opt.num_epochs):
        adjust_learning_rate(opt, model.optimizer, epoch)

        # train for one epoch
        # 3
        train(opt, train_loader, model, epoch)
        # validation is skipped here; rsum stays 0 and features are
        # extracted with eval_feat after training instead
        rsum = 0

        # remember best R@ sum and save checkpoint
        is_best = rsum > best_rsum
        best_rsum = max(rsum, best_rsum)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': model.state_dict(),
                'best_rsum': best_rsum,
                'opt': opt,
                'Eiters': model.Eiters,
            },
            is_best,
            prefix=opt.logger_name + '/')
    # 4. extract (feat, id, labels) from val_loader and store the features
    eval_feat(val_loader, model, opt.storage_place)
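
The examples persist training state through a save_checkpoint helper that is not shown. A minimal sketch, assuming the common pattern of always writing the latest state and copying the file when it is the best so far; the file names and the save_all behavior are assumptions:

import shutil

import torch


def save_checkpoint(state, is_best, filename='checkpoint.pth.tar', prefix='',
                    save_all=False):
    torch.save(state, prefix + filename)  # always keep the latest state
    if save_all:
        # optionally keep a per-epoch snapshot as well
        torch.save(state, prefix + 'checkpoint_%d.pth.tar' % state['epoch'])
    if is_best:
        # copy rather than re-serialize the best model
        shutil.copyfile(prefix + filename, prefix + 'model_best.pth.tar')
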
def main():
    parser = argparse.ArgumentParser()
    # Directories.
    parser.add_argument('--data_path',
                        default='/DATA/cvpr19',
                        help='path to datasets')
    parser.add_argument('--vocab_path',
                        default='../vocab/',
                        help='Path to saved vocabulary pickle files.')
    parser.add_argument('--resume',
                        default='',
                        type=str,
                        metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--model',
                        type=str,
                        default='beenburger',
                        help='model_name')
    parser.add_argument(
        '--save_dir',
        type=str,
        default='coco2',
        help='save checkpoint and results in DATA_PATH/MODEL_NAME/SAVE_DIR')
    # Dataset.
    parser.add_argument('--data_name', default='coco', help='{coco|ours}')
    parser.add_argument('--use_restval',
                        default='True',
                        type=str2bool,
                        help='Use the restval data for training on MSCOCO.')
    # Model configurations.
    parser.add_argument('--margin',
                        default=0.2,
                        type=float,
                        help='Rank loss margin.')
    parser.add_argument('--K',
                        default=620,
                        type=int,
                        help='Dimensionality of the word embedding.')
    parser.add_argument('--num_layers',
                        default=4,
                        type=int,
                        help='Number of SRU layers.')
    parser.add_argument('--D',
                        type=int,
                        default=2048,
                        help='dimension of image feature from ResNet')
    parser.add_argument('--D_prime',
                        type=int,
                        default=2400,
                        help='dimension of adaptation + pooling')
    parser.add_argument('--d',
                        type=int,
                        default=2400,
                        help='Dimensionality of the joint embedding')
    # Training configurations.
    parser.add_argument('--num_epochs',
                        default=30,
                        type=int,
                        help='Number of training epochs.')
    parser.add_argument('--batch_size',
                        default=160,
                        type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--img_size', default=256, type=int, help='image_size')
    parser.add_argument('--crop_size',
                        default=224,
                        type=int,
                        help='Size of an image crop as the CNN input.')
    parser.add_argument('--learning_rate',
                        default=.001,
                        type=float,
                        help='Initial learning rate.')
    parser.add_argument('--lr_decay',
                        type=float,
                        default=0.5,
                        help='learning rate decay')
    parser.add_argument('--workers',
                        default=4,
                        type=int,
                        help='Number of data loader workers.')
    # Miscellaneous.
    parser.add_argument('--log_step',
                        default=10,
                        type=int,
                        help='Number of steps to print and record the log.')
    parser.add_argument('--val_step',
                        default=500,
                        type=int,
                        help='Number of steps to run validation.')
    parser.add_argument('--logger_name',
                        default='../runs/runX',
                        help='Path to save the model and Tensorboard log.')

    opt = parser.parse_args()
    print(opt)

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    tb_logger.configure(opt.logger_name, flush_secs=5)

    # Load Vocabulary Wrapper
    vocab = pickle.load(
        open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name),
             'rb'))
    opt.vocab_size = len(vocab)
    opt.vocab = vocab

    # Create directories
    create_directory(opt.data_path, opt.model, opt.save_dir)

    # Load data loaders
    train_loader, val_loader = data.get_loaders(opt.data_name, vocab,
                                                opt.crop_size, opt.batch_size,
                                                opt.workers, opt)

    # Construct the model
    model = VSE(opt)

    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
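            # re-apply the staged training schedule up to the resumed epoch
            # (later stages presumably unfreeze more of the model)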
            model.change_training_state(0)
            if start_epoch > 2:
                model.change_training_state(2)
            if start_epoch > 8:
                model.change_training_state(8)
            best_rsum = checkpoint['best_rsum']
            model.optimizer = checkpoint['optim']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another
            # training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
                opt.resume, start_epoch, best_rsum))
            validate(opt, val_loader, model)
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))

    # Train the Model
    best_rsum = 0
    start_epoch = 0
    for epoch in range(start_epoch, opt.num_epochs):
        adjust_learning_rate(opt, model.optimizer, epoch)

        model.change_training_state(epoch)

        # train for one epoch
        train(opt, train_loader, model, epoch, val_loader)

        # evaluate on validation set
        rsum = validate(opt, val_loader, model)

        # remember best R@ sum and save checkpoint
        is_best = rsum > best_rsum
        best_rsum = max(rsum, best_rsum)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': model.state_dict(),
                'best_rsum': best_rsum,
                'opt': opt,
                'Eiters': model.Eiters,
                'optim': model.optimizer,
            },
            is_best,
            prefix=os.path.join(opt.data_path, opt.model, opt.save_dir))
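
The example above parses --use_restval with a str2bool converter instead of bool, since bool('False') would be truthy for any non-empty string. A minimal sketch of such a helper (an assumption, not code from this repository):

import argparse


def str2bool(v):
    # map common string spellings onto booleans for argparse
    if v.lower() in ('yes', 'true', 't', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')
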
Example #11
0
def main():
  # Hyper Parameters
  parser = argparse.ArgumentParser()
  parser.add_argument('--data_path', default='/data1/hexianghu/activitynet/captions/',
            help='path to datasets')
  parser.add_argument('--data_name', default='anet_precomp',
            help='anet_precomp')
  parser.add_argument('--vocab_path', default='./vocab/',
            help='Path to saved vocabulary pickle files.')
  parser.add_argument('--margin', default=0.2, type=float,
            help='Rank loss margin.')
  parser.add_argument('--num_epochs', default=50, type=int,
            help='Number of training epochs.')
  parser.add_argument('--batch_size', default=64, type=int,
            help='Size of a training mini-batch.')
  parser.add_argument('--word_dim', default=300, type=int,
            help='Dimensionality of the word embedding.')
  parser.add_argument('--embed_size', default=1024, type=int,
            help='Dimensionality of the joint embedding.')
  parser.add_argument('--grad_clip', default=0., type=float,
            help='Gradient clipping threshold.')
  parser.add_argument('--num_layers', default=1, type=int,
            help='Number of GRU layers.')
  parser.add_argument('--learning_rate', default=.001, type=float,
            help='Initial learning rate.')
  parser.add_argument('--lr_update', default=10, type=int,
            help='Number of epochs to update the learning rate.')
  parser.add_argument('--workers', default=10, type=int,
            help='Number of data loader workers.')
  parser.add_argument('--log_step', default=10, type=int,
            help='Number of steps to print and record the log.')
  parser.add_argument('--val_step', default=500, type=int,
            help='Number of steps to run validation.')
  parser.add_argument('--logger_name', default='runs/runX',
            help='Path to save the model and Tensorboard log.')
  parser.add_argument('--resume', default='', type=str, metavar='PATH', required=True,
            help='path to the checkpoint to load (required)')
  parser.add_argument('--max_violation', action='store_true',
            help='Use max instead of sum in the rank loss.')
  parser.add_argument('--img_dim', default=500, type=int,
            help='Dimensionality of the image embedding.')
  parser.add_argument('--measure', default='cosine',
            help='Similarity measure used (cosine|order)')
  parser.add_argument('--use_abs', action='store_true',
            help='Take the absolute value of embedding vectors.')
  parser.add_argument('--no_imgnorm', action='store_true',
            help='Do not normalize the image embeddings.')
  parser.add_argument('--gpu_id', default=0, type=int,
            help='GPU to use.')
  parser.add_argument('--rnn_type', default='maxout', choices=['maxout', 'seq2seq', 'attention'],
            help='Type of recurrent model.')
  parser.add_argument('--img_first_size', default=1024, type=int,
            help='first img layer emb size')
  parser.add_argument('--cap_first_size', default=1024, type=int,
            help='first cap layer emb size')
  parser.add_argument('--img_first_dropout', default=0, type=float,
            help='first img layer dropout')
  parser.add_argument('--cap_first_dropout', default=0, type=float,
            help='first cap layer dropout')
 
  opt = parser.parse_args()
  print(opt)

  torch.cuda.set_device(opt.gpu_id)

  logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
  tb_logger.configure(opt.logger_name, flush_secs=5)

  # Load Vocabulary Wrapper
  vocab = pickle.load(open(os.path.join(
    opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb'))
  opt.vocab_size = len(vocab)

  # Load data loaders
  train_loader, val_loader = data.get_loaders(
    opt.data_name, vocab, opt.batch_size, opt.workers, opt)

  # Construct the model
  model = VSE(opt)

  print('Print out models:')
  print(model.img_enc)
  print(model.txt_enc)
  print(model.img_seq_enc)
  print(model.txt_seq_enc)

  # optionally resume from a checkpoint
  if os.path.isfile(opt.resume):
    print("=> loading checkpoint '{}'".format(opt.resume))
    checkpoint = torch.load(opt.resume)
    start_epoch = checkpoint['epoch']
    best_rsum = checkpoint['best_rsum']
    model.load_state_dict(checkpoint['model'])
    # Eiters is used to show logs as the continuation of another
    # training
    model.Eiters = checkpoint['Eiters']
    print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})"
        .format(opt.resume, start_epoch, best_rsum))
    validate(opt, val_loader, model)
  else:
    print("=> no checkpoint found at '{}'".format(opt.resume))