def exe_test(sess, data, batch_size, v2i, i2v, hf1, hf2, hf3,
             feature_shape1, feature_shape2, predict_words,
             input_video1, input_video2, input_video3, input_captions, y,
             capl=16, feature_shape3=None):
    """Run greedy decoding over the test set using three video-feature streams.

    Args:
        sess: TF session used to evaluate the decode op.
        data: list of {video_id: caption} dicts to evaluate.
        batch_size: number of samples per sess.run call.
        v2i, i2v: word->index and index->word vocabulary maps.
        hf1, hf2, hf3: HDF5 handles for the three feature streams.
        feature_shape1, feature_shape2: feature shapes for streams 1 and 2.
        predict_words: fetch tensor yielding predicted word indices.
        input_video1..3, input_captions, y: feed placeholders.
        capl: maximum caption length.
        feature_shape3: feature shape for stream 3. Defaults to
            feature_shape2. NOTE(review): the original body referenced an
            undefined global `feature_shape3` (NameError at runtime);
            confirm the intended shape for hf3.

    Returns:
        dict with key 'val_predictions' mapping to a list of
        {'image_id': ..., 'caption': ...} entries.
    """
    # BUG FIX: `feature_shape3` was not a parameter in the original.
    if feature_shape3 is None:
        feature_shape3 = feature_shape2
    caption_output = []
    total_data = len(data)
    # Ceil division covers the final partial batch; the original
    # int(round(...)) + 1 could yield an empty trailing batch.
    num_batch = (total_data + batch_size - 1) // batch_size
    for batch_idx in xrange(num_batch):
        batch_caption = data[batch_idx * batch_size:
                             min((batch_idx + 1) * batch_size, total_data)]
        data_v1 = MsrDataUtil.getBatchVideoFeature(batch_caption, hf1,
                                                   feature_shape1)
        data_v2 = MsrDataUtil.getBatchVideoFeature(batch_caption, hf2,
                                                   feature_shape2)
        data_v3 = MsrDataUtil.getBatchVideoFeature(batch_caption, hf3,
                                                   feature_shape3)
        data_c, data_y = MsrDataUtil.getBatchTestCaptionWithSparseLabel(
            batch_caption, v2i, capl=capl)
        [gw] = sess.run(
            [predict_words],
            feed_dict={
                input_video1: data_v1,
                input_video2: data_v2,
                input_video3: data_v3,
                input_captions: data_c,
                y: data_y
            })
        generated_captions = MsrDataUtil.convertCaptionI2V(
            batch_caption, gw, i2v)
        for idx, sen in enumerate(generated_captions):
            print('%s : %s' % (batch_caption[idx].keys()[0], sen))
            caption_output.append({
                'image_id': batch_caption[idx].keys()[0],
                'caption': sen
            })
    js = {}
    js['val_predictions'] = caption_output
    return js
def exe_train(sess, data, batch_size, v2i, hf1, hf2, hf3,
              feature_shape1, feature_shape2, train, loss,
              input_video1, input_video2, input_video3, input_captions, y,
              capl=16, feature_shape3=None):
    """Run one training epoch over three video-feature streams.

    Shuffles `data` in place, iterates it in mini-batches, runs the train
    op, and returns the mean per-batch loss.

    Args:
        sess: TF session used to run the train/loss ops.
        data: list of {video_id: caption} dicts (shuffled in place).
        batch_size: number of samples per sess.run call.
        v2i: word->index vocabulary map.
        hf1, hf2, hf3: HDF5 handles for the three feature streams.
        feature_shape1, feature_shape2: feature shapes for streams 1 and 2.
        train, loss: train op and scalar loss fetch.
        input_video1..3, input_captions, y: feed placeholders.
        capl: maximum caption length.
        feature_shape3: feature shape for stream 3. Defaults to
            feature_shape2. NOTE(review): the original body referenced an
            undefined global `feature_shape3` (NameError at runtime);
            confirm the intended shape for hf3.

    Returns:
        Mean loss over all batches (float).
    """
    # BUG FIX: `feature_shape3` was not a parameter in the original.
    if feature_shape3 is None:
        feature_shape3 = feature_shape2
    np.random.shuffle(data)
    total_data = len(data)
    # Ceil division so tail samples are not dropped; the original
    # int(round(...)) could skip up to half a batch of data.
    num_batch = (total_data + batch_size - 1) // batch_size
    total_loss = 0.0
    for batch_idx in xrange(num_batch):
        batch_caption = data[batch_idx * batch_size:
                             min((batch_idx + 1) * batch_size, total_data)]
        data_v1 = MsrDataUtil.getBatchVideoFeature(batch_caption, hf1,
                                                   feature_shape1)
        data_v2 = MsrDataUtil.getBatchVideoFeature(batch_caption, hf2,
                                                   feature_shape2)
        data_v3 = MsrDataUtil.getBatchVideoFeature(batch_caption, hf3,
                                                   feature_shape3)
        data_c, data_y = MsrDataUtil.getBatchTrainCaptionWithSparseLabel(
            batch_caption, v2i, capl=capl)
        _, l = sess.run(
            [train, loss],
            feed_dict={
                input_video1: data_v1,
                input_video2: data_v2,
                input_video3: data_v3,
                input_captions: data_c,
                y: data_y
            })
        total_loss += l
        print('    batch_idx:%d/%d, loss:%.5f' % (batch_idx + 1, num_batch, l))
    total_loss = total_loss / num_batch
    return total_loss
def beam_search_exe_test(sess, data, cate_info, batch_size, v2i, i2v, hf,
                         feature_shape, predict_words, input_video,
                         input_captions, input_categories, y, finished_beam,
                         logprobs_finished_beams, past_logprobs, capl=16):
    """Evaluate the model with beam-search decoding.

    Feeds video features plus per-video category info, fetches the finished
    beams, converts them back to words, and collects one prediction per
    sample.

    Returns:
        dict with key 'val_predictions' mapping to a list of
        {'image_id': ..., 'caption': ...} entries.
    """
    results = []
    n_samples = len(data)
    n_batches = int(round(n_samples * 1.0 / batch_size))
    for bi in xrange(n_batches):
        lo = bi * batch_size
        chunk = data[lo:min(lo + batch_size, n_samples)]
        feats = MsrDataUtil.getBatchVideoFeature(chunk, hf, feature_shape)
        caps, labels = MsrDataUtil.getBatchTestCaptionWithSparseLabel(
            chunk, v2i, capl=capl)
        cates = MsrDataUtil.getBatchVideoCategoriesInfo(
            chunk, cate_info, feature_shape)
        fetches = [finished_beam, predict_words,
                   logprobs_finished_beams, past_logprobs]
        feed = {
            input_video: feats,
            input_captions: caps,
            input_categories: cates,
            y: labels
        }
        gw, tw, gp, gl = sess.run(fetches, feed_dict=feed)
        sentences = MsrDataUtil.convertCaptionI2V(chunk, gw, i2v)
        for pos, sentence in enumerate(sentences):
            vid = chunk[pos].keys()[0]
            print('%s : %s' % (vid, sentence))
            results.append({'image_id': vid, 'caption': sentence})
    return {'val_predictions': results}
def exe_test(sess, data, batch_size, v2i, i2v, hf, feature_shape,
             predict_words, input_video, input_captions, y, finished_beam,
             logprobs_finished_beams, past_logprobs, beam_hidden_state,
             past_symbols_states, finished_beams_states, capl=16):
    """Evaluate with beam-search decoding, also fetching intermediate
    beam state tensors (kept for debugging; only the finished beams are
    used to build captions).

    Returns:
        dict with key 'val_predictions' mapping to a list of
        {'image_id': ..., 'caption': ...} entries.
    """
    results = []
    n_samples = len(data)
    n_batches = int(round(n_samples * 1.0 / batch_size))
    for bi in xrange(n_batches):
        lo = bi * batch_size
        chunk = data[lo:min(lo + batch_size, n_samples)]
        feats = MsrDataUtil.getBatchVideoFeature(chunk, hf, feature_shape)
        caps, labels = MsrDataUtil.getBatchTestCaption(chunk, v2i, capl=capl)
        fetches = [finished_beam, predict_words, logprobs_finished_beams,
                   past_logprobs, beam_hidden_state, past_symbols_states,
                   finished_beams_states]
        feed = {input_video: feats, input_captions: caps, y: labels}
        gw, tw, gp, gl, pp, pss, fbs = sess.run(fetches, feed_dict=feed)
        sentences = MsrDataUtil.convertCaptionI2V(chunk, gw, i2v)
        for pos, sentence in enumerate(sentences):
            vid = chunk[pos].keys()[0]
            print('%s : %s' % (vid, sentence))
            results.append({'image_id': vid, 'caption': sentence})
    return {'val_predictions': results}
def exe_train(sess, data, batch_size, v2i, hf1, hf2, feature_shape,
              train, loss, input_video, input_captions, y, capl=16):
    """Run one training epoch on concatenated 2D-CNN (2048-d) and C3D
    (4096-d) features.

    Shuffles `data` in place, concatenates the two feature streams along
    the channel axis, runs the train op per batch, and returns the mean
    per-batch loss.
    """
    np.random.shuffle(data)
    n_samples = len(data)
    n_batches = int(round(n_samples * 1.0 / batch_size))
    running_loss = 0.0
    for bi in xrange(n_batches):
        lo = bi * batch_size
        chunk = data[lo:min(lo + batch_size, n_samples)]
        # Stream 1: 2048-d appearance features; stream 2: 4096-d C3D motion
        # features. Both use the same temporal length feature_shape[0].
        appearance = MsrDataUtil.getBatchVideoFeature(
            chunk, hf1, (feature_shape[0], 2048))
        motion = MsrDataUtil.getBatchC3DVideoFeature(
            chunk, hf2, (feature_shape[0], 4096))
        fused = np.concatenate((appearance, motion), axis=-1)
        caps, labels = MsrDataUtil.getBatchTrainCaptionWithSparseLabel(
            chunk, v2i, capl=capl)
        feed = {input_video: fused, input_captions: caps, y: labels}
        _, batch_loss = sess.run([train, loss], feed_dict=feed)
        running_loss += batch_loss
        print('    batch_idx:%d/%d, loss:%.5f' % (bi + 1, n_batches, batch_loss))
    return running_loss / n_batches
def exe_train(sess, data, audio_info, cate_info, batch_size, v2i, hf,
              feature_shape, train, loss, input_video, input_captions,
              input_categories, input_audio, y, capl=16):
    """Run one training epoch feeding video features plus per-video
    category and audio side information.

    Shuffles `data` in place and returns the mean per-batch loss.
    """
    np.random.shuffle(data)
    n_samples = len(data)
    n_batches = int(round(n_samples * 1.0 / batch_size))
    running_loss = 0.0
    for bi in xrange(n_batches):
        lo = bi * batch_size
        chunk = data[lo:min(lo + batch_size, n_samples)]
        feats = MsrDataUtil.getBatchVideoFeature(chunk, hf, feature_shape)
        caps, labels = MsrDataUtil.getBatchTrainCaptionWithSparseLabel(
            chunk, v2i, capl=capl)
        cates = MsrDataUtil.getBatchVideoCategoriesInfo(
            chunk, cate_info, feature_shape)
        audio = MsrDataUtil.getBatchVideoAudioInfo(
            chunk, audio_info, feature_shape)
        feed = {
            input_video: feats,
            input_captions: caps,
            input_categories: cates,
            input_audio: audio,
            y: labels
        }
        _, batch_loss = sess.run([train, loss], feed_dict=feed)
        running_loss += batch_loss
        print('    batch_idx:%d/%d, loss:%.5f' % (bi + 1, n_batches, batch_loss))
    return running_loss / n_batches
def exe_test(sess, data, batch_size, v2i, i2v, hf, feature_shape,
             predict_words, input_video, input_captions, y,
             predict_words2, capl=16):
    """Evaluate the model, fetching two prediction heads; captions are
    built from the first head (`predict_words`) only.

    Returns:
        dict with key 'val_predictions' mapping to a list of
        {'image_id': ..., 'caption': ...} entries.
    """
    results = []
    n_samples = len(data)
    n_batches = int(round(n_samples * 1.0 / batch_size))
    for bi in xrange(n_batches):
        lo = bi * batch_size
        chunk = data[lo:min(lo + batch_size, n_samples)]
        feats = MsrDataUtil.getBatchVideoFeature(chunk, hf, feature_shape)
        caps, labels = MsrDataUtil.getBatchTestCaption(chunk, v2i, capl=capl)
        feed = {input_video: feats, input_captions: caps, y: labels}
        gw, tw = sess.run([predict_words, predict_words2], feed_dict=feed)
        sentences = MsrDataUtil.convertCaptionI2V(chunk, gw, i2v)
        for pos, sentence in enumerate(sentences):
            vid = chunk[pos].keys()[0]
            print('%s : %s' % (vid, sentence))
            results.append({'image_id': vid, 'caption': sentence})
    return {'val_predictions': results}
def exe_train(sess, data, batch_size, v2i, hf, feature_shape, train, loss,
              input_video, input_captions, y, capl=16):
    """Run one training epoch with random temporal-flip augmentation.

    Each batch is reversed along the time axis with probability 0.5.
    Shuffles `data` in place and returns the mean per-batch loss.
    """
    np.random.shuffle(data)
    n_samples = len(data)
    n_batches = int(round(n_samples * 1.0 / batch_size))
    running_loss = 0.0
    for bi in xrange(n_batches):
        lo = bi * batch_size
        chunk = data[lo:min(lo + batch_size, n_samples)]
        feats = MsrDataUtil.getBatchVideoFeature(chunk, hf, feature_shape)
        # Coin flip: reverse the temporal axis for this batch.
        if np.random.randint(0, 2) == 1:
            feats = feats[:, ::-1, :]
        caps, labels = MsrDataUtil.getBatchTrainCaptionWithSparseLabel(
            chunk, v2i, capl=capl)
        feed = {input_video: feats, input_captions: caps, y: labels}
        _, batch_loss = sess.run([train, loss], feed_dict=feed)
        running_loss += batch_loss
        print('    batch_idx:%d/%d, loss:%.5f' % (bi + 1, n_batches, batch_loss))
    return running_loss / n_batches
def exe_unsup_test(sess, data, batch_size, v2i, hf,
                   unsup_training_feature_shape, loss, unsup_input_feature,
                   unsup_decoder_feature, true_video, flip=True, capl=16):
    """Evaluate the unsupervised future-prediction loss.

    Each 80-frame feature clip (assumed 2048-d per frame -- hard-coded
    below) is split in half: frames 0..39 encode, and the decoder is
    teacher-forced with frames 40..78 shifted right by one step to
    predict ground-truth frames 40..79. Returns the mean per-batch loss.
    (`v2i`, `flip`, and `capl` are accepted but unused, matching the
    original signature.)
    """
    n_samples = len(data)
    n_batches = int(round(n_samples * 1.0 / batch_size))
    running_loss = 0.0
    for bi in xrange(n_batches):
        lo = bi * batch_size
        chunk = data[lo:min(lo + batch_size, n_samples)]
        n_in_batch = len(chunk)
        clip = MsrDataUtil.getBatchVideoFeature(chunk, hf, (80, 2048))
        # Encoder sees the first half of the clip.
        encoder_in = clip[:, 0:40, :]
        # Decoder input: zeros at t=0, then frames 40..78 (shifted by one).
        decoder_in = np.zeros((n_in_batch, 40, 2048), dtype=np.float32)
        decoder_in[:, 1:] = clip[:, 40:79]
        # Target: the second half of the clip.
        target = clip[:, 40:]
        feed = {
            unsup_input_feature: encoder_in,
            unsup_decoder_feature: decoder_in,
            true_video: target
        }
        batch_loss = sess.run(loss, feed_dict=feed)
        running_loss += batch_loss
        print('    batch_idx:%d/%d, loss:%.5f' % (bi + 1, n_batches, batch_loss))
    return running_loss / n_batches
def exe_test(sess, data, batch_size, v2i, i2v, hf1, hf2, feature_shape,
             predict_words, input_video, input_captions, y, finished_beam,
             logprobs_finished_beams, past_logprobs, beam_hidden_state,
             past_symbols_states, finished_beams_states, capl=16):
    """Evaluate on concatenated 2D-CNN (2048-d) and C3D (4096-d) features.

    Captions are decoded greedily from `predict_words`; the beam-search
    tensors are accepted for interface compatibility but not fetched.

    Returns:
        dict with key 'val_predictions' mapping to a list of
        {'image_id': ..., 'caption': ...} entries.
    """
    results = []
    n_samples = len(data)
    n_batches = int(math.ceil(n_samples * 1.0 / batch_size))
    print(n_batches)
    for bi in xrange(n_batches):
        lo = bi * batch_size
        chunk = data[lo:min(lo + batch_size, n_samples)]
        appearance = MsrDataUtil.getBatchVideoFeature(
            chunk, hf1, (feature_shape[0], 2048))
        motion = MsrDataUtil.getBatchC3DVideoFeature(
            chunk, hf2, (feature_shape[0], 4096))
        fused = np.concatenate((appearance, motion), axis=-1)
        caps, labels = MsrDataUtil.getBatchTestCaptionWithSparseLabel(
            chunk, v2i, capl=capl)
        feed = {input_video: fused, input_captions: caps, y: labels}
        [tw] = sess.run([predict_words], feed_dict=feed)
        sentences = MsrDataUtil.convertCaptionI2V(chunk, tw, i2v)
        for pos, sentence in enumerate(sentences):
            vid = chunk[pos].keys()[0]
            print('%s : %s' % (vid, sentence))
            results.append({'image_id': vid, 'caption': sentence})
    return {'val_predictions': results}