def exe_test(sess,
             data,
             batch_size,
             v2i,
             i2v,
             hf1,
             hf2,
             hf3,
             feature_shape1,
             feature_shape2,
             predict_words,
             input_video1,
             input_video2,
             input_video3,
             input_captions,
             y,
             capl=16,
             feature_shape3=None):
    """Run caption inference over the whole test set with three feature streams.

    Iterates `data` in batches, feeds the three video-feature streams
    (hf1/hf2/hf3) plus the caption placeholders to `sess`, converts the
    predicted word ids back to sentences via `i2v`, and collects them.

    Returns:
        dict with a single key 'val_predictions' -> list of
        {'image_id': ..., 'caption': ...} entries.
    """
    # BUG FIX: the original body read a `feature_shape3` name that was never
    # defined in this scope (NameError at runtime).  Exposed here as an
    # optional trailing parameter so existing positional callers keep
    # working; defaults to feature_shape2 -- TODO confirm the intended
    # third-stream shape with the call sites.
    if feature_shape3 is None:
        feature_shape3 = feature_shape2

    caption_output = []
    total_data = len(data)
    # round()+1 guarantees the trailing partial batch is covered; the slice
    # below clamps to total_data and empty tail batches are skipped.
    num_batch = int(round(total_data * 1.0 / batch_size)) + 1

    for batch_idx in xrange(num_batch):
        batch_caption = data[batch_idx *
                             batch_size:min((batch_idx + 1) *
                                            batch_size, total_data)]
        if not batch_caption:
            # round()+1 can overshoot and yield an empty slice; feeding an
            # empty batch to the feature getters / sess.run could fail.
            continue

        data_v1 = MsrDataUtil.getBatchVideoFeature(batch_caption, hf1,
                                                   feature_shape1)
        data_v2 = MsrDataUtil.getBatchVideoFeature(batch_caption, hf2,
                                                   feature_shape2)
        data_v3 = MsrDataUtil.getBatchVideoFeature(batch_caption, hf3,
                                                   feature_shape3)

        data_c, data_y = MsrDataUtil.getBatchTestCaptionWithSparseLabel(
            batch_caption, v2i, capl=capl)
        [gw] = sess.run(
            [predict_words],
            feed_dict={
                input_video1: data_v1,
                input_video2: data_v2,
                input_video3: data_v3,
                input_captions: data_c,
                y: data_y
            })

        generated_captions = MsrDataUtil.convertCaptionI2V(
            batch_caption, gw, i2v)

        for idx, sen in enumerate(generated_captions):
            # Each entry of batch_caption appears to be a one-key dict;
            # .keys()[0] (Python 2) extracts the video/image id.
            print('%s : %s' % (batch_caption[idx].keys()[0], sen))
            caption_output.append({
                'image_id': batch_caption[idx].keys()[0],
                'caption': sen
            })

    js = {}
    js['val_predictions'] = caption_output

    return js
def exe_train(sess,
              data,
              batch_size,
              v2i,
              hf1,
              hf2,
              hf3,
              feature_shape1,
              feature_shape2,
              train,
              loss,
              input_video1,
              input_video2,
              input_video3,
              input_captions,
              y,
              capl=16,
              feature_shape3=None):
    """Run one training epoch over three video-feature streams.

    Shuffles `data` in place, feeds each batch's three feature streams plus
    the caption inputs to the `train`/`loss` ops, and returns the mean
    per-batch loss.
    """
    # BUG FIX: the original body read an undefined `feature_shape3` name
    # (NameError at runtime).  It is now an optional trailing parameter so
    # existing positional callers keep working; defaults to feature_shape2
    # -- TODO confirm the intended third-stream shape with the call sites.
    if feature_shape3 is None:
        feature_shape3 = feature_shape2

    np.random.shuffle(data)

    total_data = len(data)
    num_batch = int(round(total_data * 1.0 / batch_size))

    total_loss = 0.0
    for batch_idx in xrange(num_batch):
        batch_caption = data[batch_idx *
                             batch_size:min((batch_idx + 1) *
                                            batch_size, total_data)]

        data_v1 = MsrDataUtil.getBatchVideoFeature(batch_caption, hf1,
                                                   feature_shape1)
        data_v2 = MsrDataUtil.getBatchVideoFeature(batch_caption, hf2,
                                                   feature_shape2)
        data_v3 = MsrDataUtil.getBatchVideoFeature(batch_caption, hf3,
                                                   feature_shape3)

        data_c, data_y = MsrDataUtil.getBatchTrainCaptionWithSparseLabel(
            batch_caption, v2i, capl=capl)

        _, l = sess.run(
            [train, loss],
            feed_dict={
                input_video1: data_v1,
                input_video2: data_v2,
                input_video3: data_v3,
                input_captions: data_c,
                y: data_y
            })
        total_loss += l
        print('    batch_idx:%d/%d, loss:%.5f' % (batch_idx + 1, num_batch, l))
    total_loss = total_loss / num_batch
    return total_loss
# --- Beispiel #3 (0) --- scrape-artifact separator, commented so the file parses
def beam_search_exe_test(sess,
                         data,
                         cate_info,
                         batch_size,
                         v2i,
                         i2v,
                         hf,
                         feature_shape,
                         predict_words,
                         input_video,
                         input_captions,
                         input_categories,
                         y,
                         finished_beam,
                         logprobs_finished_beams,
                         past_logprobs,
                         capl=16):
    """Beam-search decoding over the test set with category side-information.

    Feeds video features, caption placeholders and per-video category info
    to the beam-search graph; only `finished_beam` (gw) is converted to
    sentences.

    Returns:
        dict {'val_predictions': [{'image_id': ..., 'caption': ...}, ...]}.
    """
    caption_output = []
    total_data = len(data)
    # BUG FIX: int(round(...)) silently dropped the trailing partial batch
    # (up to batch_size-1 test items were never decoded).  Use ceiling
    # division instead, matching the math.ceil-based batching used elsewhere
    # in this file.  Assumes batch_size is an int.
    num_batch = (total_data + batch_size - 1) // batch_size

    for batch_idx in xrange(num_batch):
        batch_caption = data[batch_idx *
                             batch_size:min((batch_idx + 1) *
                                            batch_size, total_data)]

        data_v = MsrDataUtil.getBatchVideoFeature(batch_caption, hf,
                                                  feature_shape)
        data_c, data_y = MsrDataUtil.getBatchTestCaptionWithSparseLabel(
            batch_caption, v2i, capl=capl)
        data_cate = MsrDataUtil.getBatchVideoCategoriesInfo(
            batch_caption, cate_info, feature_shape)
        # Fetch all beam-search outputs; only `gw` is decoded below.
        [gw, tw, gp, gl] = sess.run(
            [
                finished_beam, predict_words, logprobs_finished_beams,
                past_logprobs
            ],
            feed_dict={
                input_video: data_v,
                input_captions: data_c,
                input_categories: data_cate,
                y: data_y
            })

        generated_captions = MsrDataUtil.convertCaptionI2V(
            batch_caption, gw, i2v)

        for idx, sen in enumerate(generated_captions):
            # One-key dict per sample; .keys()[0] (Python 2) is the video id.
            print('%s : %s' % (batch_caption[idx].keys()[0], sen))

            caption_output.append({
                'image_id': batch_caption[idx].keys()[0],
                'caption': sen
            })

    js = {}
    js['val_predictions'] = caption_output

    return js
# --- Beispiel #4 (0) --- scrape-artifact separator, commented so the file parses
def exe_test(sess, data, batch_size, v2i, i2v, hf, feature_shape,
             predict_words, input_video, input_captions, y, finished_beam,
             logprobs_finished_beams, past_logprobs, beam_hidden_state,
             past_symbols_states, finished_beams_states, capl=16):
    """Decode test captions with the beam-search graph.

    All beam-related tensors are fetched (keeping the original run
    semantics), but only `finished_beam` (gw) is turned into sentences.
    Returns {'val_predictions': [{'image_id': ..., 'caption': ...}, ...]}.
    """
    caption_output = []
    total_data = len(data)
    num_batch = int(round(total_data * 1.0 / batch_size))

    for step in xrange(num_batch):
        lo = step * batch_size
        batch_caption = data[lo:min(lo + batch_size, total_data)]

        data_v = MsrDataUtil.getBatchVideoFeature(batch_caption, hf,
                                                  feature_shape)
        data_c, data_y = MsrDataUtil.getBatchTestCaption(batch_caption, v2i,
                                                         capl=capl)

        fetches = [finished_beam, predict_words, logprobs_finished_beams,
                   past_logprobs, beam_hidden_state, past_symbols_states,
                   finished_beams_states]
        gw, tw, gp, gl, pp, pss, fbs = sess.run(fetches,
                                                feed_dict={input_video: data_v,
                                                           input_captions: data_c,
                                                           y: data_y})

        sentences = MsrDataUtil.convertCaptionI2V(batch_caption, gw, i2v)
        for idx, sen in enumerate(sentences):
            vid = batch_caption[idx].keys()[0]  # Python-2 one-key dict
            print('%s : %s' % (vid, sen))
            caption_output.append({'image_id': vid, 'caption': sen})

    return {'val_predictions': caption_output}
def exe_train(sess, data, batch_size, v2i, hf1, hf2, feature_shape,
              train, loss, input_video, input_captions, y, capl=16):
    """One training epoch on concatenated 2048-d (hf1) and C3D 4096-d (hf2)
    frame features; returns the mean per-batch loss."""
    np.random.shuffle(data)

    total_data = len(data)
    num_batch = int(round(total_data * 1.0 / batch_size))

    total_loss = 0.0
    for step in xrange(num_batch):
        lo = step * batch_size
        batch_caption = data[lo:min(lo + batch_size, total_data)]

        feat_a = MsrDataUtil.getBatchVideoFeature(
            batch_caption, hf1, (feature_shape[0], 2048))
        feat_b = MsrDataUtil.getBatchC3DVideoFeature(
            batch_caption, hf2, (feature_shape[0], 4096))
        # Fuse the two streams along the feature axis.
        data_v = np.concatenate((feat_a, feat_b), axis=-1)

        data_c, data_y = MsrDataUtil.getBatchTrainCaptionWithSparseLabel(
            batch_caption, v2i, capl=capl)

        _, batch_loss = sess.run([train, loss],
                                 feed_dict={input_video: data_v,
                                            input_captions: data_c,
                                            y: data_y})
        total_loss += batch_loss
        print('    batch_idx:%d/%d, loss:%.5f' % (step + 1, num_batch, batch_loss))
    return total_loss / num_batch
# --- Beispiel #6 (0) --- scrape-artifact separator, commented so the file parses
def exe_train(sess,
              data,
              audio_info,
              cate_info,
              batch_size,
              v2i,
              hf,
              feature_shape,
              train,
              loss,
              input_video,
              input_captions,
              input_categories,
              input_audio,
              y,
              capl=16):
    """One training epoch with video, category and audio inputs.

    Shuffles `data` in place, runs the `train`/`loss` ops per batch and
    returns the mean per-batch loss.
    """
    np.random.shuffle(data)

    total_data = len(data)
    num_batch = int(round(total_data * 1.0 / batch_size))

    total_loss = 0.0
    for step in xrange(num_batch):
        lo = step * batch_size
        batch_caption = data[lo:min(lo + batch_size, total_data)]

        data_v = MsrDataUtil.getBatchVideoFeature(batch_caption, hf,
                                                  feature_shape)
        data_c, data_y = MsrDataUtil.getBatchTrainCaptionWithSparseLabel(
            batch_caption, v2i, capl=capl)
        data_cate = MsrDataUtil.getBatchVideoCategoriesInfo(
            batch_caption, cate_info, feature_shape)
        data_audio = MsrDataUtil.getBatchVideoAudioInfo(
            batch_caption, audio_info, feature_shape)

        feed = {
            input_video: data_v,
            input_captions: data_c,
            input_categories: data_cate,
            input_audio: data_audio,
            y: data_y,
        }
        _, batch_loss = sess.run([train, loss], feed_dict=feed)
        total_loss += batch_loss
        print('    batch_idx:%d/%d, loss:%.5f' % (step + 1, num_batch, batch_loss))
    return total_loss / num_batch
# --- Beispiel #7 (0) --- scrape-artifact separator, commented so the file parses
def exe_test(sess,
             data,
             batch_size,
             v2i,
             i2v,
             hf,
             feature_shape,
             predict_words,
             input_video,
             input_captions,
             y,
             predict_words2,
             capl=16):
    """Decode test captions; both predictor heads are fetched, but only the
    first (`predict_words`) is converted to sentences.

    Returns {'val_predictions': [{'image_id': ..., 'caption': ...}, ...]}.
    """
    caption_output = []
    total_data = len(data)
    num_batch = int(round(total_data * 1.0 / batch_size))

    for step in xrange(num_batch):
        lo = step * batch_size
        batch_caption = data[lo:min(lo + batch_size, total_data)]

        data_v = MsrDataUtil.getBatchVideoFeature(batch_caption, hf,
                                                  feature_shape)
        data_c, data_y = MsrDataUtil.getBatchTestCaption(batch_caption, v2i,
                                                         capl=capl)

        gw, tw = sess.run([predict_words, predict_words2],
                          feed_dict={input_video: data_v,
                                     input_captions: data_c,
                                     y: data_y})

        sentences = MsrDataUtil.convertCaptionI2V(batch_caption, gw, i2v)
        for idx, sen in enumerate(sentences):
            vid = batch_caption[idx].keys()[0]  # Python-2 one-key dict
            print('%s : %s' % (vid, sen))
            caption_output.append({'image_id': vid, 'caption': sen})

    return {'val_predictions': caption_output}
def exe_train(sess,
              data,
              batch_size,
              v2i,
              hf,
              feature_shape,
              train,
              loss,
              input_video,
              input_captions,
              y,
              capl=16):
    """One training epoch; each batch's features are reversed along the
    time axis with probability 0.5 (data augmentation).

    Returns the mean per-batch loss.
    """
    np.random.shuffle(data)

    total_data = len(data)
    num_batch = int(round(total_data * 1.0 / batch_size))

    total_loss = 0.0
    for step in xrange(num_batch):
        lo = step * batch_size
        batch_caption = data[lo:min(lo + batch_size, total_data)]

        data_v = MsrDataUtil.getBatchVideoFeature(batch_caption, hf,
                                                  feature_shape)
        # Coin flip: reverse the temporal order of the whole batch.
        if np.random.randint(0, 2) == 1:
            data_v = data_v[:, ::-1, :]
        data_c, data_y = MsrDataUtil.getBatchTrainCaptionWithSparseLabel(
            batch_caption, v2i, capl=capl)

        _, batch_loss = sess.run([train, loss],
                                 feed_dict={input_video: data_v,
                                            input_captions: data_c,
                                            y: data_y})
        total_loss += batch_loss
        print('    batch_idx:%d/%d, loss:%.5f' % (step + 1, num_batch, batch_loss))
    return total_loss / num_batch
def exe_unsup_test(sess, data, batch_size, v2i, hf, unsup_training_feature_shape,
                   loss, unsup_input_feature, unsup_decoder_feature, true_video,
                   flip=True, capl=16):
    """Evaluate the unsupervised future-prediction loss over `data`.

    Each sample provides 80 frames of 2048-d features: frames 0..39 are the
    encoder input, the decoder is fed frames 40..78 shifted right by one
    (first step left as zeros), and frames 40..79 are the target.
    Returns the mean per-batch loss.  `flip` and `capl` are currently unused.
    """
    total_data = len(data)
    num_batch = int(round(total_data * 1.0 / batch_size))

    total_loss = 0.0
    for step in xrange(num_batch):
        lo = step * batch_size
        batch_caption = data[lo:min(lo + batch_size, total_data)]
        n_samples = len(batch_caption)

        data_v = MsrDataUtil.getBatchVideoFeature(batch_caption, hf, (80, 2048))

        enc_in = data_v[:, :40, :]                      # frames 0..39
        dec_in = np.zeros((n_samples, 40, 2048), dtype=np.float32)
        dec_in[:, 1:] = data_v[:, 40:79]                # shifted; step 0 stays zero
        target = data_v[:, 40:]                         # frames 40..79

        batch_loss = sess.run(loss,
                              feed_dict={unsup_input_feature: enc_in,
                                         unsup_decoder_feature: dec_in,
                                         true_video: target})
        total_loss += batch_loss
        print('    batch_idx:%d/%d, loss:%.5f' % (step + 1, num_batch, batch_loss))

    return total_loss / num_batch
def exe_test(sess, data, batch_size, v2i, i2v, hf1, hf2, feature_shape,
             predict_words, input_video, input_captions, y, finished_beam,
             logprobs_finished_beams, past_logprobs, beam_hidden_state,
             past_symbols_states, finished_beams_states, capl=16):
    """Decode test captions from concatenated 2048-d (hf1) and C3D 4096-d
    (hf2) features.

    The beam-search tensors are accepted for interface compatibility but
    only `predict_words` is fetched.  Uses ceiling batching so no trailing
    samples are dropped.
    Returns {'val_predictions': [{'image_id': ..., 'caption': ...}, ...]}.
    """
    caption_output = []
    total_data = len(data)

    num_batch = int(math.ceil((total_data * 1.0 / batch_size)))
    print(num_batch)

    for step in xrange(num_batch):
        lo = step * batch_size
        batch_caption = data[lo:min(lo + batch_size, total_data)]

        feat_a = MsrDataUtil.getBatchVideoFeature(
            batch_caption, hf1, (feature_shape[0], 2048))
        feat_b = MsrDataUtil.getBatchC3DVideoFeature(
            batch_caption, hf2, (feature_shape[0], 4096))
        # Fuse the two streams along the feature axis.
        data_v = np.concatenate((feat_a, feat_b), axis=-1)

        data_c, data_y = MsrDataUtil.getBatchTestCaptionWithSparseLabel(
            batch_caption, v2i, capl=capl)
        [tw] = sess.run([predict_words],
                        feed_dict={input_video: data_v,
                                   input_captions: data_c,
                                   y: data_y})

        sentences = MsrDataUtil.convertCaptionI2V(batch_caption, tw, i2v)
        for idx, sen in enumerate(sentences):
            vid = batch_caption[idx].keys()[0]  # Python-2 one-key dict
            print('%s : %s' % (vid, sen))
            caption_output.append({'image_id': vid, 'caption': sen})

    return {'val_predictions': caption_output}