Example #1
import os
import time

import tensorflow as tf

# Project-local helpers are assumed importable from this repository:
# MsrDataUtil, mGRUCaptionModel, exe_train, exe_test, evaluate_mode_by_shell.

def main(hf, f_type, capl=16, d_w2v=512, output_dim=512,
         feature_shape=None, lr=0.01,
         batch_size=64, total_epoch=100,
         file=None, pretrained_model=None):
	'''
		capl: maximum caption length (in tokens)
	'''

	# Create vocabulary
	v2i, train_data, val_data, test_data = MsrDataUtil.create_vocabulary_word2vec(file, capl=capl, word_threshold=1, v2i={'': 0, 'UNK':1,'BOS':2, 'EOS':3})

	i2v = {i:v for v,i in v2i.items()}

	print('building model ...')
	voc_size = len(v2i)

	input_video = tf.placeholder(tf.float32, shape=(None,)+feature_shape,name='input_video')
	input_captions = tf.placeholder(tf.int32, shape=(None,capl), name='input_captions')
	y = tf.placeholder(tf.int32,shape=(None, capl))

	attentionCaptionModel = mGRUCaptionModel.mLSTMAttentionCaptionModel(input_video, input_captions, voc_size, d_w2v, output_dim, T_k=[1,2,4,8])
	predict_score, predict_words, loss_mask = attentionCaptionModel.build_model()
	loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=predict_score)

	# Mask out padded time steps, then average each sentence over its valid tokens.
	loss = tf.reduce_sum(loss * loss_mask, axis=-1) / tf.reduce_sum(loss_mask, axis=-1)

	loss = tf.reduce_mean(loss)+sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))

	optimizer = tf.train.AdamOptimizer(learning_rate=lr, beta1=0.9, beta2=0.999,
	                                   epsilon=1e-08, use_locking=False, name='Adam')

	# Clip each gradient to norm 10; skip variables that received no gradient.
	gvs = optimizer.compute_gradients(loss)
	capped_gvs = [(tf.clip_by_norm(grad, 10), var) for grad, var in gvs if grad is not None]
	train = optimizer.apply_gradients(capped_gvs)

	# optimizer = tf.train.RMSPropOptimizer(lr,decay=0.9, momentum=0.0, epsilon=1e-8)
	# train = optimizer.minimize(loss)

	'''
		configure the runtime environment
	'''
	config = tf.ConfigProto()
	config.gpu_options.per_process_gpu_memory_fraction = 0.3
	# sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
	config.log_device_placement=False

	sess = tf.Session(config=config)

	init = tf.global_variables_initializer()
	sess.run(init)

	with sess.as_default():
		saver = tf.train.Saver(sharded=True,max_to_keep=total_epoch)
		if pretrained_model is not None:
			saver.restore(sess, pretrained_model)
			print('restored pretrained model: ' + pretrained_model)

		for epoch in range(total_epoch):
			# shuffle
			print('Epoch: %d/%d, Batch_size: %d' %(epoch+1,total_epoch,batch_size))
			# # train phase
			tic = time.time()
			total_loss = exe_train(sess, train_data, batch_size, v2i, hf, feature_shape, train, loss, input_video, input_captions, y, capl=capl)

			print('    --Train--, Loss: %.5f, Time: %.3f' % (total_loss, time.time()-tic))

			tic = time.time()
			js = exe_test(sess, test_data, batch_size, v2i, i2v, hf, feature_shape,
										predict_words, input_video, input_captions, y, capl=capl)
			print('    --Val--, Time: %.3f' % (time.time()-tic))

			

			# save model
			export_path = ('/home/xyj/usr/local/saved_model/msrvtt2017/s2s_' + f_type +
			               '/lr' + str(lr) + '_f' + str(feature_shape[0]) + '_B' + str(batch_size))
			for sub_dir in ('model', 'res'):
				sub_path = os.path.join(export_path, sub_dir)
				if not os.path.exists(sub_path):
					os.makedirs(sub_path)
					print('mkdir %s' % sub_path)

			# eval
			res_path = export_path+'/res/'+f_type+'_E'+str(epoch+1)+'.json'
			evaluate_mode_by_shell(res_path,js)


			save_path = saver.save(sess, export_path+'/model/'+'E'+str(epoch+1)+'_L'+str(total_loss)+'.ckpt')
			print("Model saved in file: %s" % save_path)
Example #2
def beam_search_exe_test(sess,
                         data,
                         audio_info,
                         cate_info,
                         batch_size,
                         v2i,
                         i2v,
                         hf,
                         feature_shape,
                         predict_words,
                         input_video,
                         input_captions,
                         input_categories,
                         input_audio,
                         y,
                         finished_beam,
                         logprobs_finished_beams,
                         past_logprobs,
                         capl=16):

    caption_output = []
    total_data = len(data)
    num_batch = (total_data + batch_size - 1) // batch_size  # ceil division keeps the final partial batch

    for batch_idx in range(num_batch):
        batch_caption = data[batch_idx *
                             batch_size:min((batch_idx + 1) *
                                            batch_size, total_data)]

        data_v = MsrDataUtil.getBatchVideoFeature(batch_caption, hf,
                                                  feature_shape)
        data_c, data_y = MsrDataUtil.getBatchTestCaptionWithSparseLabel(
            batch_caption, v2i, capl=capl)
        data_cate = MsrDataUtil.getBatchVideoCategoriesInfo(
            batch_caption, cate_info, feature_shape)
        data_audio = MsrDataUtil.getBatchVideoAudioInfo(
            batch_caption, audio_info, feature_shape)
        [gw, tw, gp, gl] = sess.run(
            [
                finished_beam, predict_words, logprobs_finished_beams,
                past_logprobs
            ],
            feed_dict={
                input_video: data_v,
                input_captions: data_c,
                input_categories: data_cate,
                input_audio: data_audio,
                y: data_y
            })

        generated_captions = MsrDataUtil.convertCaptionI2V(
            batch_caption, gw, i2v)

        for idx, sen in enumerate(generated_captions):
            video_id = next(iter(batch_caption[idx]))  # each batch entry is a dict keyed by video id
            print('%s : %s' % (video_id, sen))

            caption_output.append({'image_id': video_id, 'caption': sen})

    js = {'val_predictions': caption_output}

    return js
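The returned dict has the same layout Example #1 writes to a res/*.json file before shelling out to the evaluator, so persisting it is straightforward. A small sketch; save_predictions and the example path are hypothetical, not part of the original code:

import json

def save_predictions(js, res_path):
    # Write the {'val_predictions': [...]} dict where an external scorer can read it.
    with open(res_path, 'w') as f:
        json.dump(js, f)

# e.g. save_predictions(js, export_path + '/res/' + f_type + '_E1.json')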
Example #3
def exe_test(sess,
             data,
             cate_info,
             batch_size,
             v2i,
             i2v,
             hf1,
             hf2,
             feature_shape1,
             feature_shape2,
             predict_words,
             input_video1,
             input_video2,
             input_captions,
             input_categories,
             y,
             capl=16):

    caption_output = []
    total_data = len(data)
    num_batch = (total_data + batch_size - 1) // batch_size  # ceil division: keeps the last partial batch, never yields an empty one

    for batch_idx in range(num_batch):
        batch_caption = data[batch_idx *
                             batch_size:min((batch_idx + 1) *
                                            batch_size, total_data)]

        data_v1 = MsrDataUtil.getBatchVideoFeature(batch_caption, hf1,
                                                   feature_shape1)
        data_v2 = MsrDataUtil.getBatchC3DVideoFeature(batch_caption, hf2,
                                                      feature_shape2)

        data_c, data_y = MsrDataUtil.getBatchTestCaptionWithSparseLabel(
            batch_caption, v2i, capl=capl)
        data_cate = MsrDataUtil.getBatchVideoCategoriesInfo(
            batch_caption, cate_info, feature_shape1)

        [gw] = sess.run(
            [predict_words],
            feed_dict={
                input_video1: data_v1,
                input_video2: data_v2,
                input_captions: data_c,
                input_categories: data_cate,
                y: data_y
            })

        generated_captions = MsrDataUtil.convertCaptionI2V(
            batch_caption, gw, i2v)

        for idx, sen in enumerate(generated_captions):
            video_id = next(iter(batch_caption[idx]))  # dict keyed by video id
            print('%s : %s' % (video_id, sen))
            caption_output.append({'image_id': video_id, 'caption': sen})

    js = {'val_predictions': caption_output}

    return js
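All of these test loops rely on ceil division for the batch count, so every item is covered and no empty trailing batch is ever produced. A quick standalone check of that arithmetic:

def num_batches(total_data, batch_size):
    # Ceil division without math.ceil: covers all items, no empty tail batch.
    return (total_data + batch_size - 1) // batch_size

assert num_batches(100, 64) == 2   # 64 + 36
assert num_batches(128, 64) == 2   # exact multiple
assert num_batches(1, 64) == 1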
Example #4
def exe_test(sess,
             data,
             batch_size,
             v2i,
             i2v,
             hf,
             feature_shape,
             predict_words,
             input_video,
             input_captions,
             y,
             finished_beam,
             logprobs_finished_beams,
             past_logprobs,
             beam_hidden_state,
             past_symbols_states,
             finished_beams_states,
             capl=16):

    caption_output = []
    total_data = len(data)
    num_batch = (total_data + batch_size - 1) // batch_size  # ceil division keeps the final partial batch

    for batch_idx in range(num_batch):
        batch_caption = data[batch_idx *
                             batch_size:min((batch_idx + 1) *
                                            batch_size, total_data)]

        data_v = MsrDataUtil.getBatchVideoFeature(batch_caption, hf,
                                                  feature_shape)
        data_c, data_y = MsrDataUtil.getBatchTestCaptionWithSparseLabel(
            batch_caption, v2i, capl=capl)
        [gw, tw, gp, gl, pp, pss, fbs] = sess.run(
            [
                finished_beam, predict_words, logprobs_finished_beams,
                past_logprobs, beam_hidden_state, past_symbols_states,
                finished_beams_states
            ],
            feed_dict={
                input_video: data_v,
                input_captions: data_c,
                y: data_y
            })

        generated_captions = MsrDataUtil.convertCaptionI2V(
            batch_caption, gw, i2v)

        for idx, sen in enumerate(generated_captions):
            # print(gw[idx])
            video_id = next(iter(batch_caption[idx]))  # dict keyed by video id
            print('%s : %s' % (video_id, sen))
            # print(pss)
            # print(fbs)
            # print(gp)
            # print(gl)
            # print(pp)
            caption_output.append({'image_id': video_id, 'caption': sen})

    js = {'val_predictions': caption_output}

    return js
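All four examples convert generated word ids back to tokens through an i2v map, which is simply the inverse of the v2i vocabulary built in Example #1. A minimal illustration using only the special-token seed from that example:

# Round-trip between the vocabulary maps used throughout these examples.
v2i = {'': 0, 'UNK': 1, 'BOS': 2, 'EOS': 3}   # special-token seed from Example #1
i2v = {i: v for v, i in v2i.items()}

assert i2v[2] == 'BOS' and v2i['EOS'] == 3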