Example #1
def generate(**kwargs):

    device = t.device('cuda') if t.cuda.is_available() else t.device('cpu')
    for k, v in kwargs.items():
        setattr(opt, k, v)

    data = t.load(opt.caption_path, map_location=lambda s, l: s)
    word2ix, ix2word = data['word2ix'], data['ix2word']

    transforms = tv.transforms.Compose([
        tv.transforms.Resize(224),
        tv.transforms.CenterCrop(224),
        tv.transforms.ToTensor(),
        tv.transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    img = Image.open(opt.test_img)
    img = transforms(img).unsqueeze(0)

    resnet50 = tv.models.resnet50(True).eval()
    del resnet50.fc
    resnet50.fc = lambda x: x
    # resnet50 = resnet50.to(device)
    # img = img.to(device)
    img_feats = resnet50(img).detach()

    # Caption Model
    model = CaptionModel(opt, word2ix, ix2word)
    model.load_state_dict(t.load(opt.model_path, map_location='cpu'))
    # model.to(device)

    results = model.generate(img_feats.data[0])
    print('\r\n'.join(results))
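A note on the trick used above and repeated in several later examples: deleting resnet50.fc and rebinding it to an identity function makes the pretrained ResNet-50 return its 2048-dimensional pooled features instead of 1000-way class logits. Below is a minimal, self-contained sketch of the same idea using torch.nn.Identity, the more idiomatic replacement in recent PyTorch; it is an editorial illustration, not part of the original example.

import torch
import torchvision as tv

resnet50 = tv.models.resnet50(pretrained=True).eval()
resnet50.fc = torch.nn.Identity()        # drop the 1000-way classifier head

with torch.no_grad():
    dummy = torch.randn(1, 3, 224, 224)  # one normalized 224x224 RGB image
    feats = resnet50(dummy)
print(feats.shape)                       # torch.Size([1, 2048])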
Example #2
def generate(**kwargs):
    opt = Config()
    for k, v in kwargs.items():
        setattr(opt, k, v)
    device = t.device('cuda') if opt.use_gpu else t.device('cpu')

    # Data preprocessing
    data = t.load(opt.caption_data_path, map_location=lambda s, l: s)
    word2ix, ix2word = data['word2ix'], data['ix2word']

    normalize = tv.transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
    transforms = tv.transforms.Compose([
        tv.transforms.Resize(opt.scale_size),
        tv.transforms.CenterCrop(opt.img_size),
        tv.transforms.ToTensor(),
        normalize
    ])
    img = Image.open(opt.test_img)
    img = transforms(img).unsqueeze(0)

    # Extract image features with resnet50
    resnet50 = tv.models.resnet50(True).eval()
    del resnet50.fc
    resnet50.fc = lambda x: x
    resnet50.to(device)
    img = img.to(device)
    img_feats = resnet50(img).detach()

    # Caption model
    model = CaptionModel(opt, word2ix, ix2word)
    model = model.load(opt.model_ckpt).eval()
    model.to(device)

    results = model.generate(img_feats.data[0])
    print('\r\n'.join(results))
Example #3
def train(**kwargs):

    device = t.device('cuda') if t.cuda.is_available() else t.device('cpu')
    for k, v in kwargs.items():
        setattr(opt, k, v)

    dataloader = get_dataloader(opt)
    model = CaptionModel(opt, dataloader.dataset.word2ix,
                         dataloader.dataset.id2ix)
    if opt.model_path:
        model.load_state_dict(t.load(opt.model_path, map_location='cpu'))
    t.backends.cudnn.enabled = False
    model = model.to(device)

    optimizer = Adam(model.parameters(), opt.lr)
    criterion = t.nn.CrossEntropyLoss()
    for epoch in range(opt.max_epoch):
        for ii, (imgs, (captions, lengths),
                 indexes) in tqdm.tqdm(enumerate(dataloader)):

            imgs = Variable(imgs).to(device)
            captions = Variable(captions).to(device)
            pred, _ = model(imgs, captions, lengths)
            target_captions = pack_padded_sequence(captions, lengths)[0]

            loss = criterion(pred, target_captions)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print("Current Loss: ", loss.item())
        if (epoch + 1) % opt.save_model == 0:
            t.save(model.state_dict(), "checkpoints/{}.pth".format(epoch))
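The targets in this loop come from pack_padded_sequence(captions, lengths)[0]: the .data field of the packed sequence drops the padding and flattens the batch in time-major order, so it lines up with the packed scores the model returns. A small illustration with made-up values (not the project's data), assuming time-major padded captions:

import torch
from torch.nn.utils.rnn import pack_padded_sequence

captions = torch.tensor([[1, 4],
                         [2, 5],
                         [3, 0]])  # (seq_len, batch); 0 is padding
lengths = [3, 2]                   # lengths sorted in decreasing order

packed = pack_padded_sequence(captions, lengths)
print(packed.data)                 # tensor([1, 4, 2, 5, 3])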
Example #4
def generate(**kwargs):
    opt = Config()
    for k, v in kwargs.items():
        setattr(opt, k, v)
    device = t.device('cuda') if opt.use_gpu else t.device('cpu')

    # Data preprocessing
    data = t.load(opt.caption_data_path, map_location=lambda s, l: s)
    word2ix, ix2word = data['word2ix'], data['ix2word']

    normalize = tv.transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
    transforms = tv.transforms.Compose([
        tv.transforms.Resize(opt.scale_size),
        tv.transforms.CenterCrop(opt.img_size),
        tv.transforms.ToTensor(), normalize
    ])
    img = Image.open(opt.test_img)
    img = transforms(img).unsqueeze(0)

    # Extract image features with resnet50
    resnet50 = tv.models.resnet50(True).eval()
    del resnet50.fc
    resnet50.fc = lambda x: x
    resnet50.to(device)
    img = img.to(device)
    img_feats = resnet50(img).detach()

    # Caption model
    model = CaptionModel(opt, word2ix, ix2word)
    model = model.load(opt.model_ckpt).eval()
    model.to(device)

    results = model.generate(img_feats.data[0])
    print('\r\n'.join(results))
Example #5
def generate(**kwargs):
    opt = Config()
    for k, v in kwargs.items():
        setattr(opt, k, v)

    # Data preprocessing
    data = t.load(opt.caption_data_path)
    word2ix, ix2word = data['word2ix'], data['ix2word']

    test_datas = t.load('test_results2.pth')
    imgs = t.load('test_imgs.pth')

    # Caption model
    model = CaptionModel(opt, None, word2ix, ix2word)
    model = model.load(opt.model_ckpt).eval()
    model.cuda()

    results = []
    for ii, (img_feat, img_id) in tqdm.tqdm(enumerate(zip(test_datas, imgs))):
        sentences = model.generate(img_feat)
        item = {
            'image_id': img_id.replace('.jpg', ''),
            'caption': sentences[0].replace('</EOS>', '')
        }
        results.append(item)
        if ii % 1000 == 0: print(sentences[0])
    import json
    with open('submit.json', 'w') as f:
        json.dump(results, f)
Example #6
def generate(**kwargs):
    opt = Config()
    for k,v in kwargs.items():
        setattr(opt,k,v)
    
    # Data preprocessing
    data = t.load(opt.caption_data_path,map_location=lambda s,l:s)
    word2ix,ix2word = data['word2ix'],data['ix2word']

    IMAGENET_MEAN =  [0.485, 0.456, 0.406]
    IMAGENET_STD =  [0.229, 0.224, 0.225]
    normalize =  tv.transforms.Normalize(mean=IMAGENET_MEAN,std=IMAGENET_STD)
    transforms = tv.transforms.Compose([
                tv.transforms.Scale(opt.scale_size),
                tv.transforms.CenterCrop(opt.img_size),
                tv.transforms.ToTensor(),
                normalize
        ])
    img = Image.open(opt.test_img)
    img = transforms(img).unsqueeze(0)

    # Extract image features with resnet50
    resnet50 = tv.models.resnet50(True).eval()
    del resnet50.fc
    resnet50.fc = lambda x:x
    if opt.use_gpu:
        resnet50.cuda() 
        img = img.cuda()
    img_feats = resnet50(Variable(img,volatile=True))

    # Caption model
    model = CaptionModel(opt,word2ix,ix2word)
    model = model.load(opt.model_ckpt).eval()
    if opt.use_gpu:
         model.cuda()

    results = model.generate(img_feats.data[0])
    print('\r\n'.join(results))
Example #7
def generate(**kwargs):
    opt = Config()
    for k, v in kwargs.items():
        setattr(opt, k, v)

    # Data preprocessing
    data = t.load(opt.caption_data_path, map_location=lambda s, l: s)
    word2ix, ix2word = data['word2ix'], data['ix2word']

    IMAGENET_MEAN = [0.485, 0.456, 0.406]
    IMAGENET_STD = [0.229, 0.224, 0.225]
    normalize = tv.transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
    transforms = tv.transforms.Compose([
        tv.transforms.Resize(opt.scale_size),
        tv.transforms.CenterCrop(opt.img_size),
        tv.transforms.ToTensor(),
        normalize,
    ])

    img = Image.open(opt.test_img)
    img = transforms(img).unsqueeze(0)

    # Extract image features with resnet50
    resnet50 = tv.models.resnet50(True).eval()
    del resnet50.fc
    resnet50.fc = lambda x: x
    if opt.use_gpu:
        resnet50.cuda()
        img = img.cuda()
    with t.no_grad():
        img_feats = resnet50(Variable(img))

    # Caption model
    model = CaptionModel(opt, word2ix, ix2word)
    model = model.load(opt.model_ckpt).eval()
    if opt.use_gpu:
        model.cuda()

    results = model.generate(img_feats.data[0])
    print('\r\n'.join(results))
Example #8
def run_inference(config, curr_ckpt_path):
    """
    Main inference function. Builds and executes the model.
    """

    ckpt_dir, ckpt_file = os.path.split(curr_ckpt_path)
    ckpt_num = P_CKPT.findall(ckpt_file)[0]  # Checkpoint number

    # Setup input pipeline & Build model
    print('TensorFlow version: r{}'.format(tf.__version__))
    g = tf.Graph()
    with g.as_default():
        tf.set_random_seed(config.rand_seed)
        inputs_man = inputs.InputManager(config, is_inference=True)
        c = inputs_man.config
        batch_size = c.batch_size_infer

        with tf.name_scope('infer'):
            m_infer = CaptionModel(c,
                                   mode='infer',
                                   batch_ops=inputs_man.batch_infer,
                                   reuse=False,
                                   name='inference')
        init_fn = tf.local_variables_initializer()
        saver = tf.train.Saver()

    filenames = inputs_man.filenames_infer
    r = config.per_process_gpu_memory_fraction
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=r)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options), graph=g)
    num_batches = int(c.split_sizes['infer'] / batch_size)

    raw_outputs = dict(captions={},
                       attention={},
                       image_ids={},
                       beam_size=c.infer_beam_size,
                       max_caption_length=c.infer_max_length,
                       checkpoint_path=curr_ckpt_path,
                       checkpoint_number=ckpt_num)
    coco_json = []
    with sess:
        sess.run(init_fn)
        # Restore model from checkpoint
        saver.restore(sess, curr_ckpt_path)
        g.finalize()

        print("INFO: Graph constructed. Starting inference.")
        start_time = time.time()

        desc = 'Inference: checkpoint {}'.format(ckpt_num)
        for step in tqdm(range(num_batches), desc=desc, ncols=100):
            word_ids, attn_maps = sess.run(m_infer.infer_output)
            captions = id_to_caption(word_ids, c)
            #attn_maps = np.split(attn_maps, batch_size)

            # Get image ids, compile results
            batch_start = step * batch_size
            batch_end = (step + 1) * batch_size
            batch_filenames = filenames[batch_start:batch_end]

            for i, f in enumerate(batch_filenames):
                # Use the numeric COCO id when the filename matches P_COCO;
                # otherwise fall back to the filename stem (assumed numeric).
                image_stem = f.replace('.jpg', '')
                matches = P_COCO.findall(image_stem)
                if len(matches) > 0:
                    image_id = int(matches[0])
                else:
                    image_id = int(image_stem)
                raw_outputs['captions'][f] = captions[i]
                #if c.infer_beam_size == 1:
                raw_outputs['attention'][f] = attn_maps[i]
                raw_outputs['image_ids'][f] = image_id
                coco_json.append(
                    dict(image_id=image_id, caption=unicode(captions[i])))

        print("\nExample captions:\n{}\n".format("\n".join(captions[:3])))
        t = time.time() - start_time
        sess.close()

    # Ensure correctness
    assert len(filenames) == len(list(set(filenames)))
    assert len(filenames) == len(coco_json)
    assert len(filenames) == len(raw_outputs['image_ids'].keys())

    # Dump output files
    raw_output_fname = 'outputs___{}.pkl'.format(ckpt_num)
    coco_json_fname = 'captions___{}.json'.format(ckpt_num)

    # Captions with attention maps
    if c.save_attention_maps:
        with open(pjoin(c.infer_save_path, raw_output_fname), 'wb') as f:
            pickle.dump(raw_outputs, f, pickle.HIGHEST_PROTOCOL)
    # Captions with image ids
    with open(pjoin(c.infer_save_path, coco_json_fname), 'w') as f:
        json.dump(coco_json, f)
    if not os.path.isfile(pjoin(c.infer_save_path, 'infer_speed.txt')):
        out = [
            'Using GPU #: {}'.format(c.gpu),
            'Inference batch size: {}'.format(c.batch_size_infer),
            'Inference beam size: {}'.format(c.infer_beam_size), ''
        ]
        with open(pjoin(c.infer_save_path, 'infer_speed.txt'), 'a') as f:
            f.write('\r\n'.join(out))
    with open(pjoin(c.infer_save_path, 'infer_speed.txt'), 'a') as f:
        f.write('\r\n{}'.format(len(filenames) / t))
    print("\nINFO: Inference completed. Time taken: {:4.2f} mins\n".format(t /
                                                                           60))
def main(hf,
         f_type,
         capl=16,
         d_w2v=512,
         output_dim=512,
         feature_shape=None,
         lr=0.01,
         batch_size=64,
         total_epoch=100,
         file=None,
         pretrained_model=None):
    '''
		capl: the length of caption
	'''

    # Create vocabulary
    v2i, train_data, val_data, test_data = MsrDataUtil.create_vocabulary_word2vec(
        file, capl=capl, v2i={
            '': 0,
            'UNK': 1,
            'BOS': 2,
            'EOS': 3
        })

    i2v = {i: v for v, i in v2i.items()}

    print('building model ...')
    voc_size = len(v2i)

    input_video = tf.placeholder(tf.float32,
                                 shape=(None, ) + feature_shape,
                                 name='input_video')
    input_captions = tf.placeholder(tf.int32,
                                    shape=(None, capl),
                                    name='input_captions')
    y = tf.placeholder(tf.int32, shape=(None, capl, len(v2i)))

    captionModel = CaptionModel.CaptionModel(input_video, input_captions,
                                             voc_size, d_w2v, output_dim)
    predict_score, predict_words = captionModel.build_model()
    loss = tf.nn.softmax_cross_entropy_with_logits(labels=y,
                                                   logits=predict_score)
    loss = tf.reduce_mean(loss) + sum(
        tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
    optimizer = tf.train.RMSPropOptimizer(lr,
                                          decay=0.9,
                                          momentum=0.0,
                                          epsilon=1e-8)
    train = optimizer.minimize(loss)
    '''
		configure && runtime environment
	'''
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.3
    # sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
    config.log_device_placement = False

    sess = tf.Session(config=config)

    init = tf.global_variables_initializer()
    sess.run(init)

    with sess.as_default():
        saver = tf.train.Saver(sharded=True, max_to_keep=total_epoch)
        if pretrained_model is not None:
            saver.restore(sess, pretrained_model)
            print('restore pre trained file:' + pretrained_model)

        for epoch in xrange(total_epoch):
            # # shuffle
            print('Epoch: %d/%d, Batch_size: %d' %
                  (epoch + 1, total_epoch, batch_size))
            # # train phase
            tic = time.time()
            total_loss = exe_train(sess,
                                   train_data,
                                   batch_size,
                                   v2i,
                                   hf,
                                   feature_shape,
                                   train,
                                   loss,
                                   input_video,
                                   input_captions,
                                   y,
                                   capl=capl)

            print('    --Train--, Loss: %.5f, .......Time:%.3f' %
                  (total_loss, time.time() - tic))

            tic = time.time()
            js = exe_test(sess,
                          test_data,
                          batch_size,
                          v2i,
                          i2v,
                          hf,
                          feature_shape,
                          predict_words,
                          input_video,
                          input_captions,
                          y,
                          capl=capl)
            print('    --Val--, .......Time:%.3f' % (time.time() - tic))

            #save model
            export_path = '/home/xyj/usr/local/saved_model/msrvtt2017/s2s' + '_' + f_type + '/' + 'lr' + str(
                lr) + '_f' + str(feature_shape[0])
            if not os.path.exists(export_path + '/model'):
                os.makedirs(export_path + '/model')
                print('mkdir %s' % export_path + '/model')
            if not os.path.exists(export_path + '/res'):
                os.makedirs(export_path + '/res')
                print('mkdir %s' % export_path + '/res')

            # eval
            res_path = export_path + '/res/' + f_type + '_E' + str(epoch +
                                                                   1) + '.json'
            evaluate_mode_by_shell(res_path, js)

            save_path = saver.save(
                sess, export_path + '/model/' + 'E' + str(epoch + 1) + '_L' +
                str(total_loss) + '.ckpt')
            print("Model saved in file: %s" % save_path)
Example #10
        'num_chunks': opt.num_chunks,
        'mode': 'test'
    }

    train_loader = DataLoader(train_opt)
    val_loader = DataLoader(val_opt)
    test_loader = DataLoader(test_opt)

    opt.vocab = train_loader.get_vocab()
    opt.vocab_size = train_loader.get_vocab_size()
    opt.seq_length = train_loader.get_seq_length()
    opt.feat_dims = train_loader.get_feat_dims()
    opt.history_file = opt.model_file.replace('.pth', '_history.json', 1)

    logger.info('Building model...')
    model = CaptionModel(opt)

    xe_criterion = CrossEntropyCriterion()
    rl_criterion = RewardCriterion()

    if torch.cuda.is_available():
        model.cuda()
        xe_criterion.cuda()
        rl_criterion.cuda()

    logger.info('Start training...')
    start = datetime.now()

    optimizer = optim.Adam(model.parameters(), lr=opt.learning_rate)
    infos = train(model,
                  xe_criterion,
Example #11
    test_loader = DataLoader(test_opt)

    logger.info('Loading model: %s', opt.model_file)
    checkpoint = torch.load(opt.model_file)
    checkpoint_opt = checkpoint['opt']

    opt.model_type = checkpoint_opt.model_type
    opt.vocab = checkpoint_opt.vocab
    opt.vocab_size = checkpoint_opt.vocab_size
    opt.seq_length = checkpoint_opt.seq_length
    opt.feat_dims = checkpoint_opt.feat_dims

    assert opt.vocab_size == test_loader.get_vocab_size()
    assert opt.seq_length == test_loader.get_seq_length()
    assert opt.feat_dims == test_loader.get_feat_dims()

    logger.info('Building model...')
    model = CaptionModel(opt)
    logger.info('Loading state from the checkpoint...')
    model.load_state_dict(checkpoint['model'])

    xe_criterion = CrossEntropyCriterion()

    model.cuda()
    xe_criterion.cuda()

    logger.info('Start testing...')
    test(model, xe_criterion, test_loader, opt)
    logger.info('Time: %s', datetime.now() - start)
    test_loader.close()
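The loading code above assumes a checkpoint saved as a dict with 'opt' and 'model' keys. A hypothetical sketch of producing a checkpoint with that layout, using stand-in objects (a plain dict in place of the real options object, a tiny Linear layer in place of CaptionModel):

import torch
import torch.nn as nn

model = nn.Linear(4, 2)                           # stand-in for CaptionModel
opt = {'model_type': 'demo', 'vocab_size': 10}    # the real code stores an options object here

torch.save({'opt': opt, 'model': model.state_dict()}, 'demo_checkpoint.pth')

checkpoint = torch.load('demo_checkpoint.pth')
model.load_state_dict(checkpoint['model'])        # same pattern as the loading code above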
Example #12
def train(**kwargs):
    opt = Config()
    for k, v in kwargs.items():
        setattr(opt, k, v)
    device = t.device('cuda') if opt.use_gpu else t.device('cpu')

    opt.caption_data_path = 'caption.pth'  # raw data
    opt.test_img = ''  # input image
    # opt.model_ckpt='caption_0914_1947' # pretrained model

    # Data
    vis = Visualizer(env=opt.env)
    dataloader = get_dataloader(opt)
    _data = dataloader.dataset._data
    word2ix, ix2word = _data['word2ix'], _data['ix2word']

    # Model
    model = CaptionModel(opt, word2ix, ix2word)
    if opt.model_ckpt:
        model.load(opt.model_ckpt)
    optimizer = model.get_optimizer(opt.lr)
    criterion = t.nn.CrossEntropyLoss()
   
    model.to(device)

    # Statistics
    loss_meter = meter.AverageValueMeter()

    for epoch in range(opt.epoch):
        loss_meter.reset()
        for ii, (imgs, (captions, lengths), indexes) in tqdm.tqdm(enumerate(dataloader)):
            # Training step
            optimizer.zero_grad()
            imgs = imgs.to(device)
            captions = captions.to(device)
            input_captions = captions[:-1]
            target_captions = pack_padded_sequence(captions, lengths)[0]
            score, _ = model(imgs, input_captions, lengths)
            loss = criterion(score, target_captions)
            loss.backward()
            optimizer.step()
            loss_meter.add(loss.item())

            # Visualization
            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                vis.plot('loss', loss_meter.value()[0])

                # Visualize the raw image and the human-written caption
                raw_img = _data['ix2id'][indexes[0]]
                img_path = opt.img_path + raw_img
                raw_img = Image.open(img_path).convert('RGB')
                raw_img = tv.transforms.ToTensor()(raw_img)

                raw_caption = captions.data[:, 0]
                raw_caption = ''.join([_data['ix2word'][ii] for ii in raw_caption])
                vis.text(raw_caption, u'raw_caption')
                vis.img('raw', raw_img, caption=raw_caption)

                # Visualize the caption generated by the network
                results = model.generate(imgs.data[0])
                vis.text('</br>'.join(results), u'caption')
        model.save()
Example #13
            'Process the train data with generate_data.py and extract_features.py',
            'first to get "model_info.json", "train_descriptions.pkl" and "train.pkl".'
        )

    embedding_matrix = get_embeddings(root_path, model_info['vocab_size'],
                                      embedding_dim, model_info['wordtoidx'])

    train_dataset = SampleDataset(train_descriptions, train_img_features,
                                  model_info['wordtoidx'],
                                  model_info['max_length'])

    train_loader = DataLoader(train_dataset, batch_size, collate_fn=my_collate)

    caption_model = CaptionModel(model_info['vocab_size'],
                                 embedding_dim,
                                 hidden_size=hidden_size,
                                 embedding_matrix=embedding_matrix,
                                 embedding_train=True)

    init_weights(caption_model, embedding_pretrained=True)

    caption_model.to(device)

    # we will ignore the pad token in true target set
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    optimizer = torch.optim.Adam(caption_model.parameters(), lr=0.01)

    clip = 1
    start = time()
    print(f'Training...')
                                   batch_size=batch_size,
                                   max_length=max_length,
                                   shuffle=True,
                                   num_workers=workers)
    print("trainloader ok")

    valDataLoader = get_iterator(valDset,
                                 vocab,
                                 batch_size=eval_batch_size,
                                 max_length=max_length,
                                 shuffle=False,
                                 num_workers=workers)

    model = CaptionModel(cnn,
                         vocab,
                         embedding_size=embedding_size,
                         rnn_size=rnn_size,
                         num_layers=num_layers,
                         share_embedding_weights=share_weights)

    if 'cuda' in type:
        cudnn.benchmark = True
        model.cuda()

    optimizer = select_optimizer(optimizer,
                                 params=model.parameters(),
                                 lr=learning_rate)
    regime = lambda e: {
        'lr': learning_rate * (lr_decay**e),
        'momentum': momentum,
        'weight_decay': weight_decay
    }
def main(hf,f_type,capl=16, d_w2v=512, output_dim=512,
		feature_shape=None,unsup_training_feature_shape=None,
		lr=0.01,
		batch_size=64,total_epoch=100,unsup_epoch=None,
		file=None,pretrained_model=None):
	'''
		capl: the length of caption
	'''

	# Create vocabulary
	v2i, train_data, val_data, test_data = MsrDataUtil.create_vocabulary_word2vec(file, capl=capl, v2i={'': 0, 'UNK':1,'BOS':2, 'EOS':3})

	i2v = {i:v for v,i in v2i.items()}

	print('building model ...')
	voc_size = len(v2i)
	input_video = tf.placeholder(tf.float32, shape=(None,)+feature_shape,name='input_video')
	input_captions = tf.placeholder(tf.int32, shape=(None,capl), name='input_captions')
	y = tf.placeholder(tf.int32,shape=(None, capl))

	unsup_input_video = tf.placeholder(tf.float32, shape=(None,)+(40,2048),name='unsup_input_video')
	unsup_decoder_feature = tf.placeholder(tf.float32, shape=(None,)+(40,2048),name='unsup_decoder_feature')
	true_video = tf.placeholder(tf.float32, shape=(None,)+(40,2048),name='true_video')


	#
	#
	attentionCaptionModel = CaptionModel.UnsupTrainingAttentionCaptionModel(input_video, input_captions, unsup_input_video, 
															unsup_decoder_feature, voc_size, d_w2v, output_dim,
															T_k=[1,2,4,8])
	predict_score, predict_words, predict_vector = attentionCaptionModel.build_model()
	
	huber_Loss = Losses.Huber_Loss(predict_vector, true_video)
	unsup_training_loss = huber_Loss.build()
	print('unsup_training_loss.get_shape().as_list()',unsup_training_loss.get_shape().as_list())
	unsup_training_loss = tf.reduce_mean(tf.reduce_sum(unsup_training_loss,axis=[1,2])+sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)))
	optimizer = tf.train.AdamOptimizer(learning_rate=lr,beta1=0.9,beta2=0.999,epsilon=1e-08,use_locking=False,name='Adam')
	gvs = optimizer.compute_gradients(unsup_training_loss)
	capped_gvs = [(tf.clip_by_global_norm([grad], 10)[0][0], var) for grad, var in gvs ]
	unsup_training = optimizer.apply_gradients(capped_gvs)


	caption_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=predict_score)+sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
	caption_loss = tf.reduce_mean(caption_loss)#

	caption_gvs = optimizer.compute_gradients(caption_loss)
	caption_capped_gvs = [(tf.clip_by_global_norm([grad], 10)[0][0], var) for grad, var in caption_gvs ]
	caption_training = optimizer.apply_gradients(caption_capped_gvs)

	# caption_training = optimizer.minimize(caption_loss)
	# 

	'''
		configure && runtime environment
	'''
	config = tf.ConfigProto()
	config.gpu_options.per_process_gpu_memory_fraction = 0.3
	# sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
	config.log_device_placement=False

	sess = tf.Session(config=config)

	init = tf.global_variables_initializer()
	sess.run(init)

	with sess.as_default():
		saver = tf.train.Saver(sharded=True,max_to_keep=total_epoch)
		if pretrained_model is not None:
			saver.restore(sess, pretrained_model)
			print('restore pre trained file:' + pretrained_model)


		export_path = '/home/xyj/usr/local/saved_model/msrvtt2017/'+f_type+'/'+'lr'+str(lr)+'_f'+str(feature_shape[0])+'_B'+str(batch_size)
		
		# #unsupervised training 
		for epoch in xrange(unsup_epoch):
			print('Unsupervised Epoch: %d/%d, Batch_size: %d' %(epoch+1,unsup_epoch,batch_size))
			# # train phase
			tic = time.time()
			total_loss = exe_unsup_train(sess, train_data, batch_size, v2i, hf, unsup_training_feature_shape, unsup_training, unsup_training_loss, unsup_input_video, unsup_decoder_feature, true_video,capl=capl)

			print('    --Unsupervised Training--, Loss: %.5f, .......Time:%.3f' %(total_loss,time.time()-tic))
			tic = time.time()
			total_loss = exe_unsup_test(sess, test_data, batch_size, v2i, hf, unsup_training_feature_shape, unsup_training_loss, unsup_input_video, unsup_decoder_feature, true_video,capl=capl)
			print('    --Unsupervised Testing--, Loss: %.5f, .......Time:%.3f' %(total_loss,time.time()-tic))

			if not os.path.exists(export_path+'/unsupervised'):
				os.makedirs(export_path+'/unsupervised')
				print('mkdir %s' %export_path+'/unsupervised')
			save_path = saver.save(sess, export_path+'/unsupervised/'+'E'+str(epoch+1)+'_L'+str(total_loss)+'.ckpt')

		for epoch in xrange(total_epoch):
			# # shuffle
			

			# if epoch % 5==0:
				
			# train phase
			print('Epoch: %d/%d, Batch_size: %d' %(epoch+1,total_epoch,batch_size))
			tic = time.time()
			total_loss = exe_train(sess, train_data, batch_size, v2i, hf, feature_shape, caption_training, caption_loss, input_video, input_captions, y,capl=capl)

			print('    --Train--, Loss: %.5f, .......Time:%.3f' %(total_loss,time.time()-tic))

			tic = time.time()
			js = exe_test(sess, test_data, batch_size, v2i, i2v, hf, feature_shape, 
										predict_words, input_video, input_captions, y, capl=capl)
			print('    --Val--, .......Time:%.3f' %(time.time()-tic))

			

			#save model
			# export_path = '/home/xyj/usr/local/saved_model/msrvtt2017/s2s'+'_'+f_type+'/'+'lr'+str(lr)+'_f'+str(feature_shape[0])+'_B'+str(batch_size)
			if not os.path.exists(export_path+'/model'):
				os.makedirs(export_path+'/model')
				print('mkdir %s' %export_path+'/model')
			if not os.path.exists(export_path+'/res'):
				os.makedirs(export_path+'/res')
				print('mkdir %s' %export_path+'/res')

			# eval
			res_path = export_path+'/res/'+f_type+'_E'+str(epoch+1)+'.json'
			evaluate_mode_by_shell(res_path,js)


			save_path = saver.save(sess, export_path+'/model/'+'E'+str(epoch+1)+'_L'+str(total_loss)+'.ckpt')
			print("Model saved in file: %s" % save_path)
Example #16
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]
normalize = tv.transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
transforms = tv.transforms.Compose([
            tv.transforms.Scale(opt.scale_size),
            tv.transforms.CenterCrop(opt.img_size),
            tv.transforms.ToTensor(),
            normalize
    ])
img_ = Image.open(opt.test_img)
img = transforms(img_).unsqueeze(0)
# PIL's resize returns a new image rather than resizing in place, so keep the result
img_ = img_.resize((int(img_.width * 256 / img_.height), 256))

# Extract image features with resnet50
# If there is no pretrained model file locally, it will be downloaded automatically
resnet50 = tv.models.resnet50(True).eval()
del resnet50.fc
resnet50.fc = lambda x:x
if opt.use_gpu:
    resnet50.cuda()
    img = img.cuda()
img_feats = resnet50(Variable(img,volatile=True))


# Caption model
model = CaptionModel(opt,word2ix,ix2word)
model = model.load(opt.model_ckpt).eval()
if opt.use_gpu:
     model.cuda()

results = model.generate(img_feats.data[0])
print('\r\n'.join(results))
def main(unused_argv):
    # Check the supplied arguments
    if len(unused_argv) != 1:
        raise Exception("There is a problem with how you entered flags: %s" % unused_argv)
    if not FLAGS.experiment_name:
        raise Exception("You need to specify --experiment_name")
    if not FLAGS.ckpt_load_dir and FLAGS.mode == "eval":
        raise Exception("You need to specify a directory to load the checkpoint for eval")
    if (not FLAGS.data_source) or (FLAGS.data_source != "ssd" and FLAGS.data_source != "ram"):
        raise Exception("You need to specify how to load data. Choose from ram and ssd.")

    FLAGS.MAIN_DIR = os.path.dirname(os.path.abspath(__file__))   # Absolute path of the directory containing main.py
    FLAGS.DATA_DIR = os.path.join(FLAGS.MAIN_DIR, "data")   # Absolute path of the data/ directory
    FLAGS.EXPERIMENTS_DIR = os.path.join(FLAGS.MAIN_DIR, "experiments")   # Absolute path of the experiments/ directory
    FLAGS.train_dir = os.path.join(FLAGS.EXPERIMENTS_DIR, FLAGS.experiment_name)
    FLAGS.bestmodel_dir = os.path.join(FLAGS.train_dir, "best_checkpoint")
    FLAGS.train_res_dir = os.path.join(FLAGS.train_dir, "myCaptions.json")  # Store the prediction results (for evaluation) during training

    FLAGS.glove_path = os.path.join(FLAGS.MAIN_DIR, "glove.6B.300d.trimmed.txt")
    FLAGS.goldAnn_train_dir = os.path.join(FLAGS.MAIN_DIR, "coco/annotations/captions_train2014.json")
    FLAGS.goldAnn_val_dir = os.path.join(FLAGS.MAIN_DIR, "coco/annotations/captions_val2014.json")

    # Load embedding matrix and vocab mappings
    random_init = (FLAGS.special_token == "train")
    emb_matrix, word2id, id2word = get_glove(FLAGS.glove_path, 300, random_init=random_init)

    # Initialize model
    caption_model = CaptionModel(FLAGS, id2word, word2id, emb_matrix)

    # Some GPU settings
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    ####################################################################################
    ####################################################################################

    if FLAGS.mode == "train":
        # Setup train dir and logfile
        if not os.path.exists(FLAGS.train_dir):
            os.makedirs(FLAGS.train_dir)
        file_handler = logging.FileHandler(os.path.join(FLAGS.train_dir, "log.txt"))
        logging.getLogger().addHandler(file_handler)

        # Make bestmodel dir if necessary
        if not os.path.exists(FLAGS.bestmodel_dir):
            os.makedirs(FLAGS.bestmodel_dir)

        with tf.Session(config=config) as sess:
            initialize_model(sess, caption_model, FLAGS.train_dir, expect_exists=False)  # Load most recent model
            caption_model.train(sess)

    ####################################################################################
    ####################################################################################

    # Sample evaluation command: python main.py --mode=eval --experiment_name=baseline --ckpt_load_dir=./experiments/baseline/best_checkpoint
    elif FLAGS.mode == "eval":
        print("Starting official evaluation...")
        with tf.Session(config=config) as sess:
            initialize_model(sess, caption_model, FLAGS.ckpt_load_dir, expect_exists=True)
            scores = caption_model.check_metric(sess, mode='val', num_samples=0)
            # Replace mode with 'test' if want to evaluate on test set
            for metric_name, metric_score in scores.items():
                print("{}: {}".format(metric_name, metric_score))

    else:
        raise Exception("Unexpected value of FLAGS.mode: %s" % FLAGS.mode)
Example #18
def train(**kwargs):
    opt = Config()
    for k, v in kwargs.items():
        setattr(opt, k, v)

    vis = Visualizer(env=opt.env)
    dataloader = get_dataloader(opt)
    _data = dataloader.dataset._data
    word2ix, ix2word = _data['word2ix'], _data['ix2word']

    # cnn = tv.models.resnet50(True)
    model = CaptionModel(opt, None, word2ix, ix2word)
    if opt.model_ckpt:
        model.load(opt.model_ckpt)

    optimizer = model.get_optimizer(opt.lr1)
    criterion = t.nn.CrossEntropyLoss()

    model.cuda()
    criterion.cuda()

    loss_meter = meter.AverageValueMeter()
    perplexity = meter.AverageValueMeter()

    for epoch in range(opt.epoch):

        loss_meter.reset()
        perplexity.reset()
        for ii, (imgs, (captions, lengths),
                 indexes) in tqdm.tqdm(enumerate(dataloader)):
            optimizer.zero_grad()
            input_captions = captions[:-1]
            imgs = imgs.cuda()
            captions = captions.cuda()

            imgs = Variable(imgs)
            captions = Variable(captions)
            input_captions = captions[:-1]
            target_captions = pack_padded_sequence(captions, lengths)[0]

            score, _ = model(imgs, input_captions, lengths)
            loss = criterion(score, target_captions)
            loss.backward()
            # clip_grad_norm(model.rnn.parameters(),opt.grad_clip)
            optimizer.step()
            loss_meter.add(loss.data[0])
            perplexity.add(t.exp(loss.data)[0])

            # Visualization
            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                vis.plot('loss', loss_meter.value()[0])
                vis.plot('perplexity', perplexity.value()[0])

                # Visualize the raw image

                raw_img = _data['train']['ix2id'][indexes[0]]
                img_path = '/data/image/ai_cha/caption/ai_challenger_caption_train_20170902/caption_train_images_20170902/' + raw_img
                raw_img = Image.open(img_path).convert('RGB')
                raw_img = tv.transforms.ToTensor()(raw_img)
                vis.img('raw', raw_img)

                # raw_img = (imgs.data[0]*0.25+0.45).clamp(max=1,min=0)
                # vis.img('raw',raw_img)

                # Visualize the human-written caption
                raw_caption = captions.data[:, 0]
                raw_caption = ''.join(
                    [_data['ix2word'][ii] for ii in raw_caption])
                vis.text(raw_caption, u'raw_caption')

                # Visualize the caption generated by the network
                results = model.generate(imgs.data[0])
                vis.text('</br>'.join(results), u'caption')
        if (epoch + 1) % 100 == 0:
            model.save()
Example #19
def train(**kwargs):
    opt = Config()
    opt.caption_data_path = 'caption.pth'  # raw data
    opt.test_img = ''  # input image
    #opt.model_ckpt='caption_0914_1947' # pretrained model

    # Data
    vis = Visualizer(env=opt.env)
    dataloader = get_dataloader(opt)
    _data = dataloader.dataset._data
    word2ix, ix2word = _data['word2ix'], _data['ix2word']

    # Model
    model = CaptionModel(opt, word2ix, ix2word)
    if opt.model_ckpt:
        model.load(opt.model_ckpt)
    optimizer = model.get_optimizer(opt.lr)
    criterion = t.nn.CrossEntropyLoss()
    if opt.use_gpu:
        model.cuda()
        criterion.cuda()

    # Statistics
    loss_meter = meter.AverageValueMeter()

    for epoch in range(opt.epoch):
        loss_meter.reset()
        for ii, (imgs, (captions, lengths),
                 indexes) in tqdm.tqdm(enumerate(dataloader)):
            # Training step
            optimizer.zero_grad()
            if opt.use_gpu:
                imgs = imgs.cuda()
                captions = captions.cuda()
            imgs = Variable(imgs)
            captions = Variable(captions)
            input_captions = captions[:-1]
            target_captions = pack_padded_sequence(captions, lengths)[0]
            score, _ = model(imgs, input_captions, lengths)
            loss = criterion(score, target_captions)
            loss.backward()
            optimizer.step()
            loss_meter.add(loss.data[0])
            '''
            if (ii+1)%opt.plot_every ==0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                vis.plot('loss',loss_meter.value()[0])

                # Visualize the raw image and the human-written caption
                raw_img = _data['ix2id'][indexes[0]]
                img_path=opt.img_path+raw_img
                raw_img = Image.open(img_path).convert('RGB')
                raw_img = tv.transforms.ToTensor()(raw_img)

                raw_caption = captions.data[:,0]
                raw_caption = ''.join([_data['ix2word'][int(ii)] for ii in raw_caption])
                vis.text(raw_caption,u'raw_caption')
                vis.img('raw',raw_img,caption=raw_caption)

                # Visualize the caption generated by the network
                results = model.generate(imgs.data[0])
                vis.text('</br>'.join(results),u'caption')
                '''
        model.save()
Example #20
                                               transform=mytransform,
                                               train=True)
    flicker8k_val = FlickrDataLoader.Flicker8k(img_dir,
                                               cap_path,
                                               val_txt,
                                               transform=mytransform,
                                               train=True)
    # These are binary files, so open them in 'rb' mode
    with open('feat6k.npy', 'rb') as f:
        feat_tr = np.load(f)

    with open('capt6k.pkl', 'rb') as f:
        caption_trn = pickle.load(f)

    with open('feat.pkl', 'rb') as f:
        feat_val = pickle.load(f)

    with open('capt1k.pkl', 'rb') as f:
        caption_val = pickle.load(f)

    model = CaptionModel(bsz=1,
                         feat_dim=(196, 512),
                         n_voc=5834,
                         n_embed=512,
                         n_hidden=1024).cuda()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    train(epoches=1)
    with open('model_t.pth', 'rb') as f:
        model.load_state_dict(torch.load(f))
def train_fn(config):
    """Main training function. To be called by `try_to_train()`."""

    #print('TensorFlow version: r{}'.format(tf.__version__))
    print('INFO: Logging to `{}`.'.format(config.log_path))

    # Setup input pipeline & Build model
    g = tf.Graph()
    with g.as_default():
        tf.set_random_seed(config.rand_seed)
        if config.token_type == 'radix':
            inputs_man = inputs.InputManager_Radix(config)
        elif config.token_type == 'char':
            inputs_man = inputs.InputManager_Char(config)
        else:
            inputs_man = inputs.InputManager(config)
        c = inputs_man.config

        num_batches = int(c.split_sizes['train'] / c.batch_size_train)
        lr = c.lr_start
        n_steps_log = int(num_batches / c.num_logs_per_epoch)

        with tf.name_scope('train'):
            m_train = CaptionModel(c,
                                   mode='train',
                                   batch_ops=inputs_man.batch_train,
                                   reuse=False,
                                   name='train')
            m_train.dset_size = c.split_sizes['train']

        with tf.name_scope('valid'):
            m_valid = CaptionModel(c,
                                   mode='eval',
                                   batch_ops=inputs_man.batch_eval,
                                   reuse=True,
                                   name='valid')
            m_valid.dset_size = c.split_sizes['valid']

        init_fn = tf.global_variables_initializer()
        model_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'Model')
        model_saver = tf.train.Saver(var_list=model_vars,
                                     max_to_keep=c.max_saves)
        saver = tf.train.Saver(max_to_keep=2)

    r = c.per_process_gpu_memory_fraction
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=r)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options), graph=g)
    summary_writer = tf.summary.FileWriter(c.log_path, g)

    with sess:
        # Restore model from checkpoint if provided
        sess.run(init_fn)
        lr = m_train.restore_model(sess, saver, lr)
        g.finalize()
        #ops.get_model_size(scope_or_list=m_train._get_trainable_vars(),
        ops.get_model_size(scope_or_list='Model/decoder/rnn_decoder',
                           log_path=c.log_path)
        start_step = sess.run(m_train.global_step)
        n_steps_log = int(n_steps_log / 5)

        print('INFO: Graph constructed. Training begins now.')
        start_epoch = time.time()

        for step in xrange(start_step, c.max_step):
            epoch = int(step / num_batches) + 1

            # Write summary to disk once every `n_steps_log` steps
            if (step + 1) % (n_steps_log * 5) == 0:
                ppl, summary, global_step, lr = sess.run([
                    m_train.dec_log_ppl, m_train.summary_op,
                    m_train.global_step, m_train.lr
                ])
                t = time.time() - start_epoch
                speed = (step + 1 - start_step) * c.batch_size_train / t
                print('   Training speed: {:7.2f} examples/sec.'.format(speed))
                summary_writer.add_summary(summary, global_step)
                value_summary({'train/speed': speed}, summary_writer,
                              global_step)
            # Quick logging
            elif (step + 1) % n_steps_log == 0:
                ppl, global_step, lr = sess.run(
                    [m_train.dec_log_ppl, m_train.global_step, m_train.lr])
                ppl = np.exp(ppl)
                logstr = 'Epoch {:2d} ~~ {:6.2f} %  ~  '.format(
                    epoch, ((step % num_batches) + 1) / num_batches * 100)
                logstr += 'Perplexity {:8.4f} ~ LR {:5.3e} ~ '.format(ppl, lr)
                logstr += 'Step {}'.format(global_step)
                print('   ' + logstr)
            else:
                ppl, global_step = sess.run(
                    [m_train.dec_log_ppl, m_train.global_step])

            if num_batches > 5000:
                save = (step + 1) % int(num_batches / 2) == 0
            else:
                save = (step + 1) % num_batches == 0
            save = save and (step + 100) < c.max_step

            # Evaluation and save model
            if save or (step + 1) == c.max_step:
                model_saver.save(sess, c.save_path + '_compact', global_step)
                saver.save(sess, c.save_path, global_step)
                _run_eval_loop(sess, c, m_valid, summary_writer, global_step)

            if (step + 1) % num_batches == 0:
                if c.legacy:
                    lr = _lr_reduce_check(config, epoch, lr)
                    m_train.update_lr(sess, lr)
                    sess.run(m_train.lr)
                t = time.time() - start_epoch
                print('\n\n>>> Epoch {:3d} complete'.format(epoch))
                print('>>> Time taken: {:10.2f} minutes\n\n'.format(t / 60))
                start_epoch = time.time()
                start_step = step + 1

        sess.close()
        print('\n\nINFO: Training completed.')
Example #22
def main():
    global args
    args = parser.parse_args()
    if args.save == '':
        args.save = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    save_path = os.path.join(args.results_dir, args.save)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    setup_logging(os.path.join(save_path, 'log.txt'))
    checkpoint_file = os.path.join(save_path, 'checkpoint_epoch_%s.pth.tar')

    logging.debug("run arguments: %s", args)
    logging.info("using pretrained cnn %s", args.cnn)
    cnn = resnet.__dict__[args.cnn](pretrained=True)

    vocab = build_vocab()
    model = CaptionModel(cnn, vocab,
                         embedding_size=args.embedding_size,
                         rnn_size=args.rnn_size,
                         num_layers=args.num_layers,
                         share_embedding_weights=args.share_weights)

    train_data = get_iterator(get_coco_data(vocab, train=True),
                              batch_size=args.batch_size,
                              max_length=args.max_length,
                              shuffle=True,
                              num_workers=args.workers)
    val_data = get_iterator(get_coco_data(vocab, train=False),
                            batch_size=args.eval_batch_size,
                            max_length=args.max_length,
                            shuffle=False,
                            num_workers=args.workers)

    if 'cuda' in args.type:
        cudnn.benchmark = True
        model.cuda()

    optimizer = select_optimizer(
        args.optimizer, params=model.parameters(), lr=args.lr)
    regime = lambda e: {'lr': args.lr * (args.lr_decay ** e),
                        'momentum': args.momentum,
                        'weight_decay': args.weight_decay}
    model.finetune_cnn(False)

    def forward(model, data, training=True, optimizer=None):
        use_cuda = 'cuda' in args.type
        loss = nn.CrossEntropyLoss()
        perplexity = AverageMeter()
        batch_time = AverageMeter()
        data_time = AverageMeter()

        if training:
            model.train()
        else:
            model.eval()

        end = time.time()
        for i, (imgs, (captions, lengths)) in enumerate(data):
            data_time.update(time.time() - end)
            if use_cuda:
                imgs = imgs.cuda()
                captions = captions.cuda(async=True)
            imgs = Variable(imgs, volatile=not training)
            captions = Variable(captions, volatile=not training)
            input_captions = captions[:-1]
            target_captions = pack_padded_sequence(captions, lengths)[0]

            pred, _ = model(imgs, input_captions, lengths)
            err = loss(pred, target_captions)
            perplexity.update(math.exp(err.data[0]))

            if training:
                optimizer.zero_grad()
                err.backward()
                clip_grad_norm(model.rnn.parameters(), args.grad_clip)
                optimizer.step()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            if i % args.print_freq == 0:
                logging.info('{phase} - Epoch: [{0}][{1}/{2}]\t'
                             'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                             'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                             'Perplexity {perp.val:.4f} ({perp.avg:.4f})'.format(
                                 epoch, i, len(data),
                                 phase='TRAINING' if training else 'EVALUATING',
                                 batch_time=batch_time,
                                 data_time=data_time, perp=perplexity))

        return perplexity.avg

    for epoch in range(args.start_epoch, args.epochs):
        if epoch >= args.finetune_epoch:
            model.finetune_cnn(True)
        optimizer = adjust_optimizer(
            optimizer, epoch, regime)
        # Train
        train_perp = forward(
            model, train_data, training=True, optimizer=optimizer)
        # Evaluate
        val_perp = forward(model, val_data, training=False)

        logging.info('\n Epoch: {0}\t'
                     'Training Perplexity {train_perp:.4f} \t'
                     'Validation Perplexity {val_perp:.4f} \n'
                     .format(epoch + 1, train_perp=train_perp, val_perp=val_perp))
        model.save_checkpoint(checkpoint_file % (epoch + 1))
Example #23
def main(hf,
         f_type,
         final=False,
         capl=16,
         d_w2v=512,
         output_dim=512,
         feature_shape=None,
         lr=0.01,
         batch_size=64,
         total_epoch=100,
         file=None,
         pretrained_model=None):
    '''
		capl: the length of caption
	'''

    # Create vocabulary
    if final:
        v2i, train_data, test_data = MsrFinalDataUtil.create_vocabulary_word2vec(
            file,
            capl=capl,
            word_threshold=1,
            v2i={
                '': 0,
                'UNK': 1,
                'BOS': 2,
                'EOS': 3
            },
            num_training=9000)
    else:
        v2i, train_data, val_data, test_data = MsrDataUtil.create_vocabulary_word2vec(
            file,
            capl=capl,
            word_threshold=1,
            v2i={
                '': 0,
                'UNK': 1,
                'BOS': 2,
                'EOS': 3
            })
    i2v = {i: v for v, i in v2i.items()}

    print('building model ...')
    voc_size = len(v2i)

    input_video = tf.placeholder(tf.float32,
                                 shape=(None, ) + feature_shape,
                                 name='input_video')
    input_captions = tf.placeholder(tf.int32,
                                    shape=(None, capl),
                                    name='input_captions')
    y = tf.placeholder(tf.int32, shape=(None, capl))

    attentionCaptionModel = CaptionModel.GRUAttentionBeamsearchCaptionModel(
        input_video,
        input_captions,
        voc_size,
        d_w2v,
        output_dim,
        max_len=16,
        beamsearch_batchsize=1,
        beam_size=5)

    predict_score, predict_words, loss_mask, finished_beam, logprobs_finished_beams, past_logprobs = attentionCaptionModel.build_model(
    )
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y,
                                                          logits=predict_score)

    loss = tf.reduce_sum(loss, reduction_indices=[-1]) / tf.reduce_sum(
        loss_mask, reduction_indices=[-1])

    loss = tf.reduce_mean(loss) + sum(
        tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))

    optimizer = tf.train.AdamOptimizer(learning_rate=lr,
                                       beta1=0.9,
                                       beta2=0.999,
                                       epsilon=1e-08,
                                       use_locking=False,
                                       name='Adam')

    gvs = optimizer.compute_gradients(loss)
    capped_gvs = [(tf.clip_by_global_norm([grad], 10)[0][0], var)
                  for grad, var in gvs]
    train = optimizer.apply_gradients(capped_gvs)

    # optimizer = tf.train.RMSPropOptimizer(lr,decay=0.9, momentum=0.0, epsilon=1e-8)
    # train = optimizer.minimize(loss)
    '''
		configure && runtime environment
	'''
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.3
    # sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
    config.log_device_placement = False

    sess = tf.Session(config=config)

    init = tf.global_variables_initializer()
    sess.run(init)

    with sess.as_default():
        saver = tf.train.Saver(sharded=True, max_to_keep=total_epoch)
        if pretrained_model is not None:
            saver.restore(sess, pretrained_model)
            print('restore pre trained file:' + pretrained_model)

        for epoch in xrange(total_epoch):
            # # # shuffle
            print('Epoch: %d/%d, Batch_size: %d' %
                  (epoch + 1, total_epoch, batch_size))
            # train phase
            tic = time.time()
            total_loss = exe_train(sess,
                                   train_data,
                                   batch_size,
                                   v2i,
                                   hf,
                                   feature_shape,
                                   train,
                                   loss,
                                   input_video,
                                   input_captions,
                                   y,
                                   capl=capl)

            print('    --Train--, Loss: %.5f, .......Time:%.3f' %
                  (total_loss, time.time() - tic))

            tic = time.time()
            js = exe_test(sess,
                          test_data,
                          1,
                          v2i,
                          i2v,
                          hf,
                          feature_shape,
                          predict_words,
                          input_video,
                          input_captions,
                          y,
                          finished_beam,
                          logprobs_finished_beams,
                          past_logprobs,
                          capl=capl)
            print('    --Val--, ......Time:%.3f' % (time.time() - tic))

            #save model
            export_path = '/home/xyj/usr/local/saved_model/msrvtt2017/s2s' + '_' + f_type + '/' + 'lr' + str(
                lr) + '_f' + str(feature_shape[0]) + '_B' + str(batch_size)
            if not os.path.exists(export_path + '/model'):
                os.makedirs(export_path + '/model')
                print('mkdir %s' % export_path + '/model')
            if not os.path.exists(export_path + '/res'):
                os.makedirs(export_path + '/res')
                print('mkdir %s' % export_path + '/res')

            # eval
            res_path = export_path + '/res/E' + str(epoch + 1) + '.json'
            evaluate_mode_by_shell(res_path, js)