def beam_search_exe_test(sess,
                         data,
                         audio_info,
                         batch_size,
                         v2i,
                         i2v,
                         hf,
                         feature_shape,
                         predict_words,
                         input_video,
                         input_captions,
                         input_audio,
                         y,
                         finished_beam,
                         logprobs_finished_beams,
                         past_logprobs,
                         capl=16):
    """Decode captions for every item in ``data`` and collect the results.

    ``data`` is processed in consecutive slices of at most ``batch_size``
    items. For each slice the video, caption and audio inputs are built via
    ``MsrDataUtil`` and fed to ``sess.run``; the value fetched for
    ``finished_beam`` is converted to text with
    ``MsrDataUtil.convertCaptionI2V`` and printed.

    Args:
        sess: Object exposing ``run(fetches, feed_dict=...)`` (presumably a
            TensorFlow session -- confirm with caller).
        data: Sliceable sequence of dict-like items; the first key of each
            item is used as its ``image_id``.
        audio_info: Passed through to ``MsrDataUtil.getBatchVideoAudioInfo``.
        batch_size: Maximum number of items per ``sess.run`` call.
        v2i: Passed to ``getBatchTestCaptionWithSparseLabel`` (presumably a
            vocabulary-to-index mapping -- confirm).
        i2v: Passed to ``convertCaptionI2V`` (presumably the inverse
            mapping -- confirm).
        hf: Passed to ``MsrDataUtil.getBatchVideoFeature``.
        feature_shape: Passed to ``MsrDataUtil.getBatchVideoFeature``.
        predict_words, logprobs_finished_beams, past_logprobs: Fetched
            alongside ``finished_beam``; their values are not used here.
        input_video, input_captions, input_audio, y: ``feed_dict`` keys.
        finished_beam: Fetch whose value is converted into captions.
        capl: Forwarded as ``capl`` to the caption batching helper.

    Returns:
        dict: ``{'val_predictions': [{'image_id': ..., 'caption': ...}, ...]}``
        with one entry per generated caption.
    """
    caption_output = []
    total_data = len(data)
    # Ceiling division: int(round(total / batch_size)) dropped the trailing
    # partial batch whenever it was smaller than half a batch, so some items
    # never received a caption. Partial batches were already fed whenever the
    # rounding went up, so this introduces no new batch shapes.
    num_batch = (total_data + batch_size - 1) // batch_size

    for batch_idx in range(num_batch):
        start = batch_idx * batch_size
        stop = min(start + batch_size, total_data)
        batch_caption = data[start:stop]

        data_v = MsrDataUtil.getBatchVideoFeature(batch_caption, hf,
                                                  feature_shape)
        data_c, data_y = MsrDataUtil.getBatchTestCaptionWithSparseLabel(
            batch_caption, v2i, capl=capl)
        # NOTE(review): exe_train in this file passes feature_shape as a third
        # argument to getBatchVideoAudioInfo -- confirm which call matches
        # the helper's current signature.
        data_audio = MsrDataUtil.getBatchVideoAudioInfo(
            batch_caption, audio_info)

        # The fetch list is kept as-is; only the finished beam is consumed.
        gw, _, _, _ = sess.run(
            [
                finished_beam, predict_words, logprobs_finished_beams,
                past_logprobs
            ],
            feed_dict={
                input_video: data_v,
                input_captions: data_c,
                input_audio: data_audio,
                y: data_y
            })

        generated_captions = MsrDataUtil.convertCaptionI2V(
            batch_caption, gw, i2v)

        for idx, sen in enumerate(generated_captions):
            # next(iter(d)) yields the same first key as the Python 2-only
            # d.keys()[0], and also works with Python 3 dict views.
            image_id = next(iter(batch_caption[idx]))
            print('%s : %s' % (image_id, sen))

            caption_output.append({
                'image_id': image_id,
                'caption': sen
            })

    return {'val_predictions': caption_output}
# Example #2
# 0
def exe_train(sess,
              data,
              audio_info,
              cate_info,
              batch_size,
              v2i,
              hf,
              feature_shape,
              train,
              loss,
              input_video,
              input_captions,
              input_categories,
              input_audio,
              y,
              capl=16):
    """Run one shuffled pass over ``data`` and return the mean batch loss.

    ``data`` is shuffled in place with ``np.random.shuffle`` and then
    processed in consecutive slices of at most ``batch_size`` items. For each
    slice the inputs are built via ``MsrDataUtil`` and ``sess.run`` is called
    with ``[train, loss]``; the fetched loss is printed and accumulated.

    Args:
        sess: Object exposing ``run(fetches, feed_dict=...)`` (presumably a
            TensorFlow session -- confirm with caller).
        data: Mutable, sliceable sequence of training items. NOTE: it is
            shuffled in place, so the caller's ordering is changed.
        audio_info: Passed to ``MsrDataUtil.getBatchVideoAudioInfo``.
        cate_info: Passed to ``MsrDataUtil.getBatchVideoCategoriesInfo``.
        batch_size: Maximum number of items per ``sess.run`` call.
        v2i: Passed to ``getBatchTrainCaptionWithSparseLabel`` (presumably a
            vocabulary-to-index mapping -- confirm).
        hf: Passed to ``MsrDataUtil.getBatchVideoFeature``.
        feature_shape: Passed to the video, category and audio helpers.
        train: Fetch run for its effect; its value is discarded.
        loss: Fetch whose value is accumulated per batch.
        input_video, input_captions, input_categories, input_audio, y:
            ``feed_dict`` keys.
        capl: Forwarded as ``capl`` to the caption batching helper.

    Returns:
        The unweighted mean of the per-batch loss values, or ``0.0`` when
        ``data`` is empty.
    """
    np.random.shuffle(data)

    total_data = len(data)
    # Ceiling division: int(round(total / batch_size)) skipped the trailing
    # partial batch whenever it was smaller than half a batch (and produced
    # zero batches for small datasets). Partial batches were already fed
    # whenever the rounding went up, so no new batch shapes are introduced.
    num_batch = (total_data + batch_size - 1) // batch_size
    if num_batch == 0:
        # Nothing to train on; avoid dividing by zero below.
        return 0.0

    total_loss = 0.0
    for batch_idx in range(num_batch):
        start = batch_idx * batch_size
        stop = min(start + batch_size, total_data)
        batch_caption = data[start:stop]

        data_v = MsrDataUtil.getBatchVideoFeature(batch_caption, hf,
                                                  feature_shape)
        data_c, data_y = MsrDataUtil.getBatchTrainCaptionWithSparseLabel(
            batch_caption, v2i, capl=capl)
        data_cate = MsrDataUtil.getBatchVideoCategoriesInfo(
            batch_caption, cate_info, feature_shape)
        data_audio = MsrDataUtil.getBatchVideoAudioInfo(
            batch_caption, audio_info, feature_shape)

        _, batch_loss = sess.run(
            [train, loss],
            feed_dict={
                input_video: data_v,
                input_captions: data_c,
                input_categories: data_cate,
                input_audio: data_audio,
                y: data_y
            })
        total_loss += batch_loss
        print('    batch_idx:%d/%d, loss:%.5f' % (batch_idx + 1, num_batch,
                                                  batch_loss))

    return total_loss / num_batch