Example #1
# All examples below assume Python 3 and these repository-local helpers:
import numpy as np
import MsrDataUtil       # batching helpers for MSR-VTT features/captions
import MsrFinalDataUtil  # variant used in Example #5


def beam_search_exe_test(sess,
                         data,
                         cate_info,
                         batch_size,
                         v2i,
                         i2v,
                         hf,
                         feature_shape,
                         predict_words,
                         input_video,
                         input_captions,
                         input_categories,
                         y,
                         finished_beam,
                         logprobs_finished_beams,
                         past_logprobs,
                         capl=16):
    """Run beam-search decoding over the test set and collect the captions."""
    caption_output = []
    total_data = len(data)
    num_batch = (total_data + batch_size - 1) // batch_size  # ceil, so no test item is dropped

    for batch_idx in range(num_batch):
        batch_caption = data[batch_idx *
                             batch_size:min((batch_idx + 1) *
                                            batch_size, total_data)]

        data_v = MsrDataUtil.getBatchVideoFeature(batch_caption, hf,
                                                  feature_shape)
        data_c, data_y = MsrDataUtil.getBatchTestCaptionWithSparseLabel(
            batch_caption, v2i, capl=capl)
        data_cate = MsrDataUtil.getBatchVideoCategoriesInfo(
            batch_caption, cate_info, feature_shape)
        # Only the finished beams (gw) are decoded below; the other
        # fetched values are not used here.
        [gw, tw, gp, gl] = sess.run(
            [
                finished_beam, predict_words, logprobs_finished_beams,
                past_logprobs
            ],
            feed_dict={
                input_video: data_v,
                input_captions: data_c,
                input_categories: data_cate,
                y: data_y
            })

        generated_captions = MsrDataUtil.convertCaptionI2V(
            batch_caption, gw, i2v)

        for idx, sen in enumerate(generated_captions):
            vid = next(iter(batch_caption[idx]))  # .keys()[0] is Python 2 only
            print('%s : %s' % (vid, sen))

            caption_output.append({
                'image_id': vid,
                'caption': sen
            })

    js = {}
    js['val_predictions'] = caption_output

    return js
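
All five examples slice data into batches the same way. Here is a minimal, self-contained sketch of that ceiling-division batching, using dummy integers in place of the real caption entries:

# Batching sketch on dummy data; stands in for the caption list above.
data = list(range(10))
batch_size = 4
num_batch = (len(data) + batch_size - 1) // batch_size  # ceil -> 3

for batch_idx in range(num_batch):
    batch = data[batch_idx * batch_size:min((batch_idx + 1) * batch_size,
                                            len(data))]
    print(batch_idx, batch)
# 0 [0, 1, 2, 3]
# 1 [4, 5, 6, 7]
# 2 [8, 9]   <- the final partial batch is kept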
Example #2
def exe_train(sess,
              data,
              cate_info,
              batch_size,
              v2i,
              hf1,
              hf2,
              feature_shape1,
              feature_shape2,
              train,
              loss,
              input_video1,
              input_video2,
              input_captions,
              input_categories,
              y,
              capl=16):
    """Train for one epoch over shuffled data; returns the mean batch loss."""
    np.random.shuffle(data)

    total_data = len(data)
    num_batch = (total_data + batch_size - 1) // batch_size  # ceil, so the final partial batch is trained on

    total_loss = 0.0
    for batch_idx in range(num_batch):
        batch_caption = data[batch_idx *
                             batch_size:min((batch_idx + 1) *
                                            batch_size, total_data)]

        data_v1 = MsrDataUtil.getBatchVideoFeature(batch_caption, hf1,
                                                   feature_shape1)
        data_v2 = MsrDataUtil.getBatchC3DVideoFeature(batch_caption, hf2,
                                                      feature_shape2)

        # Augmentation: with probability 0.5, reverse both feature
        # streams along the temporal (frame) axis.
        if np.random.randint(0, 2) == 1:
            data_v1 = data_v1[:, ::-1, :]
            data_v2 = data_v2[:, ::-1, :]

        data_c, data_y = MsrDataUtil.getBatchTrainCaptionWithSparseLabel(
            batch_caption, v2i, capl=capl)
        data_cate = MsrDataUtil.getBatchVideoCategoriesInfo(
            batch_caption, cate_info, feature_shape1)

        _, l = sess.run(
            [train, loss],
            feed_dict={
                input_video1: data_v1,
                input_video2: data_v2,
                input_captions: data_c,
                input_categories: data_cate,
                y: data_y
            })
        total_loss += l
        print('    batch_idx:%d/%d, loss:%.5f' % (batch_idx + 1, num_batch, l))
    total_loss = total_loss / num_batch
    return total_loss
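
The [:, ::-1, :] flip above reverses the frame axis of a (batch, time, feature) array. A quick numpy illustration:

import numpy as np

# One clip, 4 frames, 2 features per frame.
x = np.arange(8).reshape(1, 4, 2)
flipped = x[:, ::-1, :]   # reverse only the temporal axis
print(x[0, :, 0])         # [0 2 4 6]
print(flipped[0, :, 0])   # [6 4 2 0]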
Example #3
def exe_test(sess,
             data,
             cate_info,
             batch_size,
             v2i,
             i2v,
             hf1,
             hf2,
             feature_shape1,
             feature_shape2,
             predict_words,
             input_video1,
             input_video2,
             input_captions,
             input_categories,
             y,
             capl=16):
    caption_output = []
    total_data = len(data)
    num_batch = (total_data + batch_size - 1) // batch_size  # ceil; the original round(...)+1 could yield an empty final batch

    for batch_idx in range(num_batch):
        batch_caption = data[batch_idx *
                             batch_size:min((batch_idx + 1) *
                                            batch_size, total_data)]

        data_v1 = MsrDataUtil.getBatchVideoFeature(batch_caption, hf1,
                                                   feature_shape1)
        data_v2 = MsrDataUtil.getBatchC3DVideoFeature(batch_caption, hf2,
                                                      feature_shape2)

        data_c, data_y = MsrDataUtil.getBatchTestCaptionWithSparseLabel(
            batch_caption, v2i, capl=capl)
        data_cate = MsrDataUtil.getBatchVideoCategoriesInfo(
            batch_caption, cate_info, feature_shape1)

        [gw] = sess.run([predict_words],
                        feed_dict={
                            input_video1: data_v1,
                            input_video2: data_v2,
                            input_captions: data_c,
                            input_categories: data_cate,
                            y: data_y
                        })

        generated_captions = MsrDataUtil.convertCaptionI2V(
            batch_caption, gw, i2v)

        for idx, sen in enumerate(generated_captions):
            vid = next(iter(batch_caption[idx]))  # .keys()[0] is Python 2 only
            print('%s : %s' % (vid, sen))
            caption_output.append({'image_id': vid, 'caption': sen})

    js = {'val_predictions': caption_output}

    return js
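
MsrDataUtil.convertCaptionI2V is not shown on this page; the following is a hypothetical stand-in that sketches the index-to-word decoding it presumably performs. The function name, the i2v table, and the eos_id convention are assumptions, not the library's API:

# Hypothetical decoder: map index sequences back to words via i2v,
# stopping at an assumed end-of-sentence id.
def decode_captions(word_ids, i2v, eos_id=0):
    sentences = []
    for seq in word_ids:
        words = []
        for idx in seq:
            if idx == eos_id:  # assumed EOS convention
                break
            words.append(i2v[idx])
        sentences.append(' '.join(words))
    return sentences

i2v = {1: 'a', 2: 'man', 3: 'is', 4: 'cooking'}
print(decode_captions([[1, 2, 3, 4, 0]], i2v))  # ['a man is cooking']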
Example #4
def exe_train(sess,
              data,
              audio_info,
              cate_info,
              batch_size,
              v2i,
              hf,
              feature_shape,
              train,
              loss,
              input_video,
              input_captions,
              input_categories,
              input_audio,
              y,
              capl=16):
    """Train one epoch with video, category, and audio features."""
    np.random.shuffle(data)

    total_data = len(data)
    num_batch = (total_data + batch_size - 1) // batch_size  # ceil, so the final partial batch is trained on

    total_loss = 0.0
    for batch_idx in range(num_batch):
        batch_caption = data[batch_idx *
                             batch_size:min((batch_idx + 1) *
                                            batch_size, total_data)]

        data_v = MsrDataUtil.getBatchVideoFeature(batch_caption, hf,
                                                  feature_shape)
        data_c, data_y = MsrDataUtil.getBatchTrainCaptionWithSparseLabel(
            batch_caption, v2i, capl=capl)
        data_cate = MsrDataUtil.getBatchVideoCategoriesInfo(
            batch_caption, cate_info, feature_shape)
        data_audio = MsrDataUtil.getBatchVideoAudioInfo(
            batch_caption, audio_info, feature_shape)

        _, l = sess.run(
            [train, loss],
            feed_dict={
                input_video: data_v,
                input_captions: data_c,
                input_categories: data_cate,
                input_audio: data_audio,
                y: data_y
            })
        total_loss += l
        print('    batch_idx:%d/%d, loss:%.5f' % (batch_idx + 1, num_batch, l))
    total_loss = total_loss / num_batch
    return total_loss
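
Every example drives the graph through the TF1-style sess.run/feed_dict pattern. Below is a minimal, self-contained sketch of that pattern with purely illustrative placeholders and ops (none of these names come from the original graph); it is written against tf.compat.v1 so it also runs under TensorFlow 2:

import numpy as np
import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

# Illustrative graph: a placeholder, a trainable weight, and a loss.
x = tf.placeholder(tf.float32, shape=[None, 3], name='x')
w = tf.Variable(tf.ones([3, 1]))
loss = tf.reduce_mean(tf.square(tf.matmul(x, w)))
train = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Feed a numpy batch into the placeholder, exactly as exe_train does.
    _, l = sess.run([train, loss], feed_dict={x: np.random.rand(4, 3)})
    print('loss: %.5f' % l)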
Example #5
def beamsearch_exe_test(sess,
                        data,
                        cate_info,
                        batch_size,
                        v2i,
                        i2v,
                        hf1,
                        hf2,
                        feature_shape1,
                        feature_shape2,
                        predict_words,
                        input_video1,
                        input_video2,
                        input_captions,
                        input_categories,
                        y,
                        finished_beam,
                        logprobs_finished_beams,
                        capl=16):
    """Beam-search decoding with ResNet152 and C3D features."""
    caption_output = []
    total_data = len(data)
    num_batch = (total_data + batch_size - 1) // batch_size  # ceil, so no test item is dropped

    for batch_idx in range(num_batch):
        batch_caption = data[batch_idx *
                             batch_size:min((batch_idx + 1) *
                                            batch_size, total_data)]

        data_v1 = MsrFinalDataUtil.getBatchVideoFeature(
            batch_caption, hf1, feature_shape1)
        data_v2 = MsrFinalDataUtil.getBatchC3DVideoFeature(
            batch_caption, hf2, feature_shape2)

        data_c, data_y = MsrDataUtil.getBatchTestCaptionWithSparseLabel(
            batch_caption, v2i, capl=capl)
        data_cate = MsrDataUtil.getBatchVideoCategoriesInfo(
            batch_caption, cate_info, feature_shape1)

        # Only the finished beams (gw) are decoded below.
        [tw, gw, gp] = sess.run(
            [predict_words, finished_beam, logprobs_finished_beams],
            feed_dict={
                input_video1: data_v1,
                input_video2: data_v2,
                input_captions: data_c,
                input_categories: data_cate,
                y: data_y
            })

        generated_captions = MsrDataUtil.convertCaptionI2V(
            batch_caption, gw, i2v)

        for idx, sen in enumerate(generated_captions):
            vid = next(iter(batch_caption[idx]))  # .keys()[0] is Python 2 only
            print('%s : %s' % (vid, sen))
            caption_output.append({
                'video_id': vid,
                'caption': sen.strip()
            })

    js = {}
    js['result'] = caption_output
    js['version'] = '3'
    js['external_data'] = {
        'used': 'true',
        'details': 'the features of video frames are extracted by ResNet152 and C3D'
    }

    return js
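
The dict returned here follows a challenge-submission layout (result, version, external_data). Writing it out takes only the standard json module; the file name and the sample entry below are illustrative, not from the original code:

import json

js = {
    'version': '3',
    'result': [{'video_id': 'video123', 'caption': 'a man is cooking'}],  # dummy entry
    'external_data': {'used': 'true', 'details': '...'}
}
with open('submission.json', 'w') as f:  # illustrative output path
    json.dump(js, f, indent=2)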