Example #1
def exe_train(sess,
              data,
              epoch,
              batch_size,
              v2i,
              hf,
              obj_hfs,
              feature_shape,
              train,
              loss,
              input_video,
              input_object,
              input_captions,
              y,
              capl=16,
              obj_file=None,
              obj_num=5):
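    """Run one training epoch: shuffle `data`, fetch video/object features for
    each mini-batch, optionally reverse the temporal axis, run one optimizer
    step per batch, and return the mean training loss."""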

    #print '###############', len(data), data[0]
    np.random.shuffle(data)

    total_data = len(data)
    num_batch = int(round(total_data * 1.0 / batch_size))
    #num_batch = 5
    total_loss = 0.0
    #2018
    #print '###############', len(data), data[0]
    for batch_idx in xrange(num_batch):

        batch_caption = data[batch_idx *
                             batch_size:min((batch_idx + 1) *
                                            batch_size, total_data)]
        tic = time.time()

        if args.step:
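            # the frame-step sampling variant is not implemented in this example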
            assert (1 == 0)
        else:
            data_v, data_obj = SeqVladDataUtil.getBatchVideoObjectFeature2(
                batch_caption, hf, obj_hfs, obj_num, feature_shape, obj_file)
            assert (data_obj.shape[1] == obj_num and len(data_obj.shape) == 6)
            assert (data_v.shape[1] == data_obj.shape[2])
        if args.bidirectional:
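            # data augmentation: randomly reverse the temporal axis of the video and object features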
            flag = np.random.randint(0, 2)
            if flag == 1:
                data_v = data_v[:, ::-1]
                data_obj = data_obj[:, :, ::-1]
        data_c, data_y = SeqVladDataUtil.getBatchTrainCaptionWithSparseLabel(
            batch_caption, v2i, capl=capl)
        data_time = time.time() - tic
        tic = time.time()

        _, l = sess.run(
            [train, loss],
            feed_dict={
                input_video: data_v,
                input_object: data_obj,
                input_captions: data_c,
                y: data_y
            })

        run_time = time.time() - tic
        total_loss += l
        print('batch_idx:%d/%d, loss:%.5f, data_time:%.3f, run_time:%.3f' %
              (batch_idx + 1, num_batch, l, data_time, run_time))
    total_loss = total_loss / num_batch
    return total_loss
Example #2
def main(hf,
         obj_hfs,
         f_type,
         reduction_dim=512,
         centers_num=32,
         kernel_size=1,
         capl_l=16,
         capl_s=1,
         d_w2v=512,
         output_dim=512,
         batch_size=64,
         total_epoch=args.epoch,
         file=None,
         obj_file=None,
         outdir='youtube',
         obj_num=5):
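    """Build the vocabulary and the soft-attention captioning graph, train it
    with Adam and gradient clipping, save a checkpoint after every epoch, and
    run a beam-search evaluation on the test split."""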
    print('main: batch_size = %d, obj_num=%d' % (batch_size, obj_num))
    # Create vocabulary
    obj_num_0 = len(obj_hfs)

    capl = capl_l
    v2i, train_data, val_data, test_data = SeqVladDataUtil.create_vocabulary_word2vec_minmax(
        file,
        capl_max=capl_l,
        capl_min=capl_s,
        v2i={
            '': 0,
            'UNK': 1,
            'BOS': 2,
            'EOS': 3
        })

    i2v = {i: v for v, i in v2i.items()}

    print('building model ...')
    voc_size = len(v2i)

    input_video = tf.placeholder(tf.float32,
                                 shape=(None, ) + feature_shape,
                                 name='input_video')
    input_object = tf.placeholder(tf.float32,
                                  shape=(None, obj_num) + feature_shape,
                                  name='input_object')
    input_captions = tf.placeholder(tf.int32,
                                    shape=(None, capl),
                                    name='input_captions')
    y = tf.placeholder(tf.int32, shape=(None, capl))

    if args.soft:
        captionModel = SamModel.SoftModel(input_video,
                                          input_object,
                                          input_captions,
                                          voc_size,
                                          d_w2v,
                                          output_dim,
                                          reduction_dim=reduction_dim,
                                          centers_num=centers_num,
                                          done_token=v2i['EOS'],
                                          max_len=capl,
                                          beamsearch_batchsize=1,
                                          beam_size=5)
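    # Only the soft-attention variant is built here; captionModel is undefined unless args.soft is set.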

    predict_score, loss_mask, finished_beam, logprobs_finished_beams, past_symbols = captionModel.build_model(
    )

    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y,
                                                          logits=predict_score)

    loss = tf.reduce_sum(loss, reduction_indices=[-1]) / tf.reduce_sum(
        loss_mask, reduction_indices=[-1])

    loss = tf.reduce_mean(loss)

    optimizer = tf.train.AdamOptimizer(learning_rate=args.lr,
                                       beta1=0.9,
                                       beta2=0.999,
                                       epsilon=1e-08,
                                       use_locking=False,
                                       name='Adam')

    gvs = optimizer.compute_gradients(loss)
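    # Note: each gradient is clipped to norm 10 individually (a one-tensor
    # global-norm clip per gradient), not jointly across all gradients.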
    capped_gvs = [(tf.clip_by_global_norm([grad], 10)[0][0], var)
                  for grad, var in gvs]
    train = optimizer.apply_gradients(capped_gvs)

    ###configure && runtime environment

    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.9
    config.log_device_placement = False

    sess = tf.Session(config=config)

    init = tf.global_variables_initializer()
    sess.run(init)

    #export_path = 'saved_model/youtube/'+f_type+'/'+'lr'+str(args.lr)+'_B'+str(batch_size)
    export_path = outdir + '/' + f_type + '_lr' + str(
        args.lr) + '_B' + str(batch_size)

    with sess.as_default():
        saver = tf.train.Saver(sharded=True, max_to_keep=total_epoch)
        if args.pretrained_model is not None:
            saver.restore(sess, args.pretrained_model)
            print('restore pre trained file:' + args.pretrained_model)
        else:
            print('No pretrained model and training from scratch')

        for epoch in xrange(total_epoch):

            print('Epoch: %d/%d, Batch_size: %d' %
                  (epoch + 1, total_epoch, batch_size))
            # train phase
            tic = time.time()
            total_loss = exe_train(sess,
                                   train_data,
                                   epoch,
                                   batch_size,
                                   v2i,
                                   hf,
                                   obj_hfs,
                                   feature_shape,
                                   train,
                                   loss,
                                   input_video,
                                   input_object,
                                   input_captions,
                                   y,
                                   capl=capl,
                                   obj_file=obj_file,
                                   obj_num=obj_num)

            print('--Train--, Loss: %.5f, .......Time:%.3f' %
                  (total_loss, time.time() - tic))

            # tic = time.time()
            # js = exe_test(sess, test_data, batch_size, v2i, i2v, hf, feature_shape,
            # 							predict_words, input_video, input_captions, y, step=step, capl=capl)
            # print('    --Val--, .......Time:%.3f' %(time.time()-tic))

            if not os.path.exists(export_path + '/model'):
                os.makedirs(export_path + '/model')
                print('mkdir %s' % export_path + '/model')

            save_path = saver.save(
                sess, export_path + '/model/' + 'E' + str(epoch + 1) + '_L' +
                str(total_loss) + '.ckpt')
            print("Model saved in file: %s" % save_path)

            # #do beamsearch
            tic = time.time()
            js = beamsearch_exe_test(sess,
                                     test_data,
                                     1,
                                     v2i,
                                     i2v,
                                     hf,
                                     obj_hfs,
                                     feature_shape,
                                     predict_score,
                                     input_video,
                                     input_object,
                                     input_captions,
                                     y,
                                     finished_beam,
                                     logprobs_finished_beams,
                                     past_symbols,
                                     capl=capl,
                                     obj_file=obj_file,
                                     obj_num=obj_num)

            print('    --Val--, .......Time:%.3f' % (time.time() - tic))

            #save model
            if not os.path.exists(export_path + '/res'):
                os.makedirs(export_path + '/res')
                print('mkdir %s' % export_path + '/res')

            # eval
            res_path = export_path + '/res/E' + str(epoch + 1) + '.json'
            evaluate_mode_by_shell(res_path, js)
Example #3
def main(hf,
         obj_hfs,
         f_type,
         reduction_dim=512,
         centers_num=32,
         kernel_size=1,
         capl_l=16,
         capl_s=1,
         d_w2v=512,
         output_dim=512,
         batch_size=64,
         total_epoch=args.epoch,
         file=None,
         obj_file=None,
         obj_file_rev=None,
         rel_file=None,
         saveprefix='youtube',
         beam_size=5,
         obj_num=5):
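    """Restore three separately trained captioning models (object-aligned,
    reversed object-aligned, and object-relation) in separate graphs and fuse
    their decoder log-probabilities during a numpy beam search on the test split."""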

    print('main: batch_size = %d' % batch_size)
    # Create vocabulary
    timesteps_v = feature_shape[0]
    feature_dim = feature_shape[1]
    capl = capl_l
    v2i, train_data, val_data, test_data = SeqVladDataUtil.create_vocabulary_word2vec_minmax(
        file,
        capl_max=capl_l,
        capl_min=capl_s,
        v2i={
            '': 0,
            'UNK': 1,
            'BOS': 2,
            'EOS': 3
        })
    i2v = {i: v for v, i in v2i.items()}

    voc_size = len(v2i)

    #configure && runtime environment
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 1.0
    config.log_device_placement = False

    model_path1 = 'saved_model/msvd/youtube_sample40_objSim_NMS0.3_WV_att_NoShare_resnet200_res5c_relu_capl15s3/soft_capl15s3_dw2v512512_c64_redu512_lr0.0001_B16/model/E3_L2.218330876083131.ckpt'
    model_path2 = 'saved_model/msvd/youtube_sample40_objSim_NMS0.3_reverse_WV_att_NoShare_resnet200_res5c_relu_capl15s3/soft_capl15s3_dw2v512512_c64_redu512_lr0.0001_B16/model/E8_L1.6779830772685718.ckpt'
    model_path3 = 'saved_model/msvd/p2_r_g_RF2_Redu/youtube_sample40_objSim_NMS0.3_reverse_WV_att_NoShare_resnet200_res5c_relu_capl15s3/soft_capl15s3_dw2v512512_c64_redu512_lr0.0001_B16/model/E6_L1.823530844582274.ckpt'
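    # graph 1: object-aligned captioning model (forward object alignment)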
    g1 = tf.Graph()
    sess1 = tf.Session(config=config, graph=g1)
    with sess1.as_default():
        with g1.as_default():
            input_video1 = tf.placeholder(tf.float32,
                                          shape=(None, ) + feature_shape,
                                          name='input_video')
            input_object1 = tf.placeholder(tf.float32,
                                           shape=(None, obj_num) +
                                           feature_shape,
                                           name='input_object')
            input_captions1 = tf.placeholder(tf.int32,
                                             shape=(None, capl),
                                             name='input_captions')
            dec_x_t_1 = tf.placeholder(tf.int32,
                                       shape=(None, None),
                                       name='dec_x_t')
            dec_h_tm1_1 = tf.placeholder(tf.float32,
                                         shape=(None, output_dim),
                                         name='dec_h_tm1')
            dec_v_in_feature1 = tf.placeholder(
                tf.float32,
                shape=(None, timesteps_v, reduction_dim * centers_num),
                name='dec_v_in_feature')
            dec_o_in_feature1 = tf.placeholder(
                tf.float32,
                shape=(obj_num, None, timesteps_v,
                       reduction_dim * centers_num),
                name='dec_o_in_feature')
            captionModel1 = SamModel.SoftModel(input_video1,
                                               input_object1,
                                               input_captions1,
                                               dec_x_t_1,
                                               dec_h_tm1_1,
                                               dec_v_in_feature1,
                                               dec_o_in_feature1,
                                               voc_size,
                                               d_w2v,
                                               output_dim,
                                               reduction_dim=reduction_dim,
                                               centers_num=centers_num,
                                               done_token=v2i['EOS'],
                                               max_len=capl,
                                               beamsearch_batchsize=1,
                                               beam_size=5)
            out_enc1, out_dec1 = captionModel1.build_model_test()
            ## out_enc: (last_output1, f_vlad1, b_vlad1)
            ## out_dec: (dec_h1, dec_logprobs1)
            init = tf.global_variables_initializer()
            sess1.run(init)
            saver1 = tf.train.Saver(sharded=True, max_to_keep=total_epoch)
            saver1.restore(sess1, model_path1)

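    # graph 2: object-aligned captioning model fed with the reversed object alignment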
    g2 = tf.Graph()
    sess2 = tf.Session(config=config, graph=g2)
    with sess2.as_default():
        with g2.as_default():
            input_video2 = tf.placeholder(tf.float32,
                                          shape=(None, ) + feature_shape,
                                          name='input_video')
            input_object2 = tf.placeholder(tf.float32,
                                           shape=(None, obj_num) +
                                           feature_shape,
                                           name='input_object')
            input_captions2 = tf.placeholder(tf.int32,
                                             shape=(None, capl),
                                             name='input_captions')
            dec_x_t_2 = tf.placeholder(tf.int32,
                                       shape=(None, None),
                                       name='dec_x_t')
            dec_h_tm1_2 = tf.placeholder(tf.float32,
                                         shape=(None, output_dim),
                                         name='dec_h_tm1')
            dec_v_in_feature2 = tf.placeholder(
                tf.float32,
                shape=(None, timesteps_v, reduction_dim * centers_num),
                name='dec_v_in_feature')
            dec_o_in_feature2 = tf.placeholder(
                tf.float32,
                shape=(obj_num, None, timesteps_v,
                       reduction_dim * centers_num),
                name='dec_o_in_feature')
            captionModel2 = SamModel.SoftModel(input_video2,
                                               input_object2,
                                               input_captions2,
                                               dec_x_t_2,
                                               dec_h_tm1_2,
                                               dec_v_in_feature2,
                                               dec_o_in_feature2,
                                               voc_size,
                                               d_w2v,
                                               output_dim,
                                               reduction_dim=reduction_dim,
                                               centers_num=centers_num,
                                               done_token=v2i['EOS'],
                                               max_len=capl,
                                               beamsearch_batchsize=1,
                                               beam_size=5)
            out_enc2, out_dec2 = captionModel2.build_model_test()
            init = tf.global_variables_initializer()
            sess2.run(init)
            saver2 = tf.train.Saver(sharded=True, max_to_keep=total_epoch)
            saver2.restore(sess2, model_path2)

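    # graph 3: object-relation captioning model (SamModel_R)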
    g3 = tf.Graph()
    sess3 = tf.Session(config=config, graph=g3)
    with sess3.as_default():
        with g3.as_default():
            input_video3 = tf.placeholder(tf.float32,
                                          shape=(None, ) + feature_shape,
                                          name='input_video')
            #input_object3 = tf.placeholder(tf.float32, shape=(None, obj_num)+feature_shape,name='input_object')
            input_captions3 = tf.placeholder(tf.int32,
                                             shape=(None, capl),
                                             name='input_captions')
            input_relation3 = tf.placeholder(tf.float32,
                                             shape=(None, timesteps_v, obj_num,
                                                    obj_num),
                                             name='input_relation')
            input_object_no_align3 = tf.placeholder(
                tf.float32,
                shape=(None, obj_num) + feature_shape,
                name='input_video_no_align')
            actual_batch3 = tf.placeholder(tf.int32,
                                           shape=(None),
                                           name='actual_batch')

            dec_x_t_3 = tf.placeholder(tf.int32,
                                       shape=(None, None),
                                       name='dec_x_t')
            dec_h_tm1_3 = tf.placeholder(tf.float32,
                                         shape=(None, output_dim),
                                         name='dec_h_tm1')
            dec_v_in_feature3 = tf.placeholder(
                tf.float32,
                shape=(None, timesteps_v, reduction_dim * centers_num),
                name='dec_v_in_feature')
            dec_r_in_feature3 = tf.placeholder(tf.float32,
                                               shape=(None, timesteps_v,
                                                      reduction_dim),
                                               name='dec_r_in_feature')
            #dec_o_in_feature3 = tf.placeholder(tf.float32, shape=(obj_num, None, timesteps_v, reduction_dim*centers_num), name='dec_o_in_feature')
            captionModel3 = SamModel_R.SoftModel(input_object_no_align3,
                                                 actual_batch3,
                                                 input_relation3,
                                                 input_video3,
                                                 input_captions3,
                                                 dec_x_t_3,
                                                 dec_h_tm1_3,
                                                 dec_v_in_feature3,
                                                 dec_r_in_feature3,
                                                 voc_size,
                                                 d_w2v,
                                                 output_dim,
                                                 reduction_dim=reduction_dim,
                                                 centers_num=centers_num,
                                                 done_token=v2i['EOS'],
                                                 max_len=capl,
                                                 beamsearch_batchsize=1,
                                                 beam_size=5)
            out_enc3, out_dec3 = captionModel3.build_model_test()
            init = tf.global_variables_initializer()
            sess3.run(init)
            saver3 = tf.train.Saver(sharded=True, max_to_keep=total_epoch)
            saver3.restore(sess3, model_path3)
    '''
	sess3 = tf.Session(config=config)
	with sess3.as_default():
		init = tf.global_variables_initializer()
		sess3.run(init)
	'''
    def perform_fusion(w1, w2, w3):
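        """Fuse the three decoders: at every step the per-model log-probabilities
        are combined with weights w1, w2 and w3, a beam search is carried out in
        numpy, and the resulting captions are written to JSON and evaluated."""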
        batch_size = 1
        data = test_data

        caption_output = []
        total_data = len(data)
        num_batch = int(round(total_data * 1.0 / batch_size))

        for batch_idx in xrange(num_batch):
            batch_caption = data[batch_idx *
                                 batch_size:min((batch_idx + 1) *
                                                batch_size, total_data)]
            actual_batch_size = len(batch_caption)  #fyt

            data_v, data_obj = SeqVladDataUtil.getBatchVideoObjectFeature2(
                batch_caption, hf, obj_hfs, obj_num, feature_shape, obj_file)
            _, data_obj_rev = SeqVladDataUtil.getBatchVideoObjectFeature2(
                batch_caption, hf, obj_hfs, obj_num, feature_shape,
                obj_file_rev)

            _, data_r, data_obj_no_align = SeqVladDataUtil_R.getBatchVideoObjectFeature2(
                batch_caption, hf, obj_hfs, obj_num, feature_shape,
                rel_file)  #fyt

            data_c, data_y = SeqVladDataUtil.getBatchTestCaptionWithSparseLabel(
                batch_caption, v2i, capl=capl)

            #out_enc_g1 = sess1.run(out_enc1, feed_dict={input_video1:data_v, input_captions1:data_c})
            out_enc_g1 = sess1.run(out_enc1,
                                   feed_dict={
                                       input_video1: data_v,
                                       input_object1: data_obj
                                   })
            (last_output1, v_enc_out1, o_enc_out1) = out_enc_g1
            assert (last_output1.shape[0] == 1)

            #out_enc_g2 = sess2.run(out_enc2, feed_dict={input_video2:data_v, input_captions2:data_c})
            out_enc_g2 = sess2.run(out_enc2,
                                   feed_dict={
                                       input_video2: data_v,
                                       input_object2: data_obj_rev
                                   })
            (last_output2, v_enc_out2, o_enc_out2) = out_enc_g2
            assert (last_output2.shape[0] == 1)
            assert (last_output2.shape[1] == output_dim)

            out_enc_g3 = sess3.run(out_enc3,
                                   feed_dict={
                                       input_video3: data_v,
                                       input_relation3: data_r,
                                       input_object_no_align3:
                                       data_obj_no_align,
                                       actual_batch3: actual_batch_size
                                   })
            (last_output3, v_enc_out3, r_enc_out3) = out_enc_g3

            x_0 = data_c[:, 0]
            x_0 = np.expand_dims(x_0, axis=-1)
            x_0 = np.tile(x_0, [1, beam_size])
            h_0_1 = np.expand_dims(last_output1, axis=1)
            h_0_1 = np.reshape(np.tile(h_0_1, [1, beam_size, 1]),
                               (beam_size, output_dim))

            h_0_2 = np.expand_dims(last_output2, axis=1)
            h_0_2 = np.reshape(np.tile(h_0_2, [1, beam_size, 1]),
                               (beam_size, output_dim))

            h_0_3 = np.expand_dims(last_output3, axis=1)
            h_0_3 = np.reshape(np.tile(h_0_3, [1, beam_size, 1]),
                               (beam_size, output_dim))

            x_t = x_0
            h_tm1_1 = h_0_1
            h_tm1_2 = h_0_2
            h_tm1_3 = h_0_3

            finished_beams = np.zeros((batch_size, capl),
                                      dtype=np.int32)  # shape [1, capl]
            logprobs_finished_beams = np.ones(
                (batch_size, ), dtype=np.float32) * float('inf')  # shape [1,]

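            # decode step by step for at most capl tokens
            # (NB: the loop variable 'time' shadows the imported time module inside this loop)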
            for time in range(capl):
                ###
                out_dec_g1 = sess1.run(out_dec1,
                                       feed_dict={
                                           dec_x_t_1: x_t,
                                           dec_h_tm1_1: h_tm1_1,
                                           dec_v_in_feature1: v_enc_out1,
                                           dec_o_in_feature1: o_enc_out1
                                       })
                (h_t_1, logprobs1) = out_dec_g1

                out_dec_g2 = sess2.run(out_dec2,
                                       feed_dict={
                                           dec_x_t_2: x_t,
                                           dec_h_tm1_2: h_tm1_2,
                                           dec_v_in_feature2: v_enc_out2,
                                           dec_o_in_feature2: o_enc_out2
                                       })
                (h_t_2, logprobs2) = out_dec_g2

                #fyt
                out_dec_g3 = sess3.run(out_dec3,
                                       feed_dict={
                                           dec_x_t_3: x_t,
                                           dec_h_tm1_3: h_tm1_3,
                                           dec_v_in_feature3: v_enc_out3,
                                           dec_r_in_feature3: r_enc_out3
                                       })
                #dec_v_in_feature3:v_enc_out3, dec_o_in_feature3:o_enc_out3})
                (h_t_3, logprobs3) = out_dec_g3

                ###
                logprobs = w1 * logprobs1 + w2 * logprobs2 + w3 * logprobs3  # shape [beam_size, voc_size]

                if time == 0:
                    logprobs_batched = np.reshape(logprobs,
                                                  [-1, beam_size, voc_size])
                    t_logprobs = logprobs_batched[:,
                                                  0, :]  # shape [1, voc_size]
                    desc_ind = np.argsort(-t_logprobs, axis=1)
                    topk_indices = desc_ind[0, :
                                            beam_size]  # shape [beam_size, ]
                    past_logprobs = t_logprobs[:,
                                               topk_indices]  # shape [1, beam_size]
                    topk_indices = np.reshape(topk_indices, [1, beam_size])
                    #past_logprobs, topk_indices = tf.nn.top_k(logprobs_batched[:,0,:], beam_size)
                else:
                    logprobs = np.reshape(logprobs, [-1, beam_size, voc_size])
                    logprobs = logprobs + np.expand_dims(
                        past_logprobs,
                        axis=2)  # shape [1, beam_size, voc_size]
                    t_logprobs = np.reshape(logprobs,
                                            [1, beam_size * voc_size])
                    desc_ind = np.argsort(-t_logprobs, axis=1)
                    topk_indices = desc_ind[0, :
                                            beam_size]  # shape [beam_size, ]
                    past_logprobs = t_logprobs[:,
                                               topk_indices]  # shape [1, beam_size]
                    topk_indices = np.reshape(topk_indices, [1, beam_size])
                    #past_logprobs, topk_indices = tf.nn.top_k(
                    #	tf.reshape(logprobs, [1, beam_size * voc_size]),
                    #	beam_size,
                    #	sorted=False
                    #)
                symbols = topk_indices % voc_size
                symbols = np.reshape(symbols,
                                     [1, beam_size])  # shape [1, beam_size]
                parent_refs = topk_indices // voc_size
                parent_refs = np.reshape(parent_refs, [-1])
                #h_1 = tf.gather(h_t_1,  tf.reshape(parent_refs,[-1]))
                #h_2 = tf.gather(h_t_2,  tf.reshape(parent_refs,[-1]))
                h_1 = h_t_1[parent_refs]
                h_2 = h_t_2[parent_refs]
                h_3 = h_t_3[parent_refs]
                done_token = v2i['EOS']

                if time == 0:
                    past_symbols = np.concatenate([
                        np.expand_dims(symbols, axis=2),
                        np.zeros(
                            (batch_size, beam_size, capl - 1), dtype=np.int32)
                    ],
                                                  axis=-1)
                    # shape [1, beam_size, capl]
                else:
                    past_symbols_batch_major = np.reshape(
                        past_symbols[:, :, 0:time],
                        [-1, time])  # shape [beam_size, time]
                    #beam_past_symbols = tf.gather(past_symbols_batch_major,  parent_refs)
                    beam_past_symbols = np.reshape(
                        past_symbols_batch_major[parent_refs],
                        [-1, beam_size, time])  # shape [1, beam_size, time]
                    past_symbols = np.concatenate([
                        beam_past_symbols,
                        np.expand_dims(symbols, axis=2),
                        np.zeros(
                            (1, beam_size, capl - time - 1), dtype=np.int32)
                    ],
                                                  axis=2)
                    past_symbols = np.reshape(past_symbols,
                                              [1, beam_size, capl])

                    cond1 = np.equal(symbols,
                                     np.ones(symbols.shape, dtype=np.int32) *
                                     done_token)  # condition on done sentence
                    # cond1: shape [1, beam_size]

                    #for_finished_logprobs = tf.where(cond1,past_logprobs,tf.ones_like(past_logprobs,tf.float32)* -1e5)
                    for_finished_logprobs = np.where(
                        cond1, past_logprobs,
                        np.ones(past_logprobs.shape, np.float32) *
                        -1e5)  # shape [1, beam_size]

                    #done_indice_max = tf.cast(tf.argmax(for_finished_logprobs,axis=-1),tf.int32) # shape [1,]
                    #logprobs_done_max = tf.reduce_max(for_finished_logprobs,reduction_indices=-1) # shape [1,]
                    #done_past_symbols = tf.gather(tf.reshape(past_symbols,[beam_size,capl]),done_indice_max) # shape [1, capl]
                    #logprobs_done_max = tf.div(-logprobs_done_max,tf.cast(time,tf.float32))
                    #cond2 = tf.greater(logprobs_finished_beams,logprobs_done_max) # shape [1,]

                    #cond3 = tf.equal(done_past_symbols[:,time],done_token)
                    #cond4 = tf.equal(time,capl-1)
                    #finished_beams = tf.where(tf.logical_and(cond2,tf.logical_or(cond3,cond4)),
                    #															done_past_symbols,
                    #															finished_beams)
                    #logprobs_finished_beams = tf.where(tf.logical_and(cond2,tf.logical_or(cond3,cond4)),
                    #							logprobs_done_max,
                    #							logprobs_finished_beams)

                    done_indice_max = int(
                        np.argmax(
                            np.reshape(for_finished_logprobs,
                                       [beam_size])))  # int32
                    logprobs_done_max = for_finished_logprobs[:,
                                                              done_indice_max]  # shape [1,]
                    done_past_symbols = past_symbols[:,
                                                     done_indice_max, :]  # shape [1, capl]
                    logprobs_done_max = -logprobs_done_max / float(
                        time)  # shape [1, ]
                    cond2 = np.greater(logprobs_finished_beams,
                                       logprobs_done_max)  # shape [1,]
                    cond3 = np.equal(done_past_symbols[:, time],
                                     done_token)  # shape [1, ]
                    cond4 = np.equal(time, capl - 1)  # bool
                    finished_beams = np.where(
                        np.logical_and(cond2, np.logical_or(cond3, cond4)),
                        done_past_symbols, finished_beams)
                    logprobs_finished_beams = np.where(
                        np.logical_and(cond2, np.logical_or(cond3, cond4)),
                        logprobs_done_max, logprobs_finished_beams)

                x_t = symbols
                h_tm1_1 = h_1
                h_tm1_2 = h_2
                h_tm1_3 = h_3

            #fb = sess3.run(finished_beams)
            fb = finished_beams
            generated_captions = SeqVladDataUtil.convertCaptionI2V(
                batch_caption, fb, i2v)

            for idx, sen in enumerate(generated_captions):
                print('%s : %s' % (batch_caption[idx].keys()[0], sen))
                caption_output.append({
                    'image_id': batch_caption[idx].keys()[0],
                    'caption': sen
                })

        js = {}
        js['val_predictions'] = caption_output

        res_path = 'fusion/RF2_Redu_E6_res/%s_w1(%.2f)_w2(%.2f)_w3(%.2f).json' % (
            saveprefix, w1, w2, w3)
        evaluate_mode_by_shell(res_path, js)

    w1 = 1.0
    w2 = 1.0
    w3 = 0.5
    perform_fusion(w1, w2, w3)
    '''
	for w3_i in range(5):
		w3 = (w3_i+1) * 0.2
		print('now w1 = %f, w2 = %f, w3 = %f' % (w1,w2,w3))
		perform_fusion(w1, w2, w3)
	'''
    #perform_fusion(1.0, 1.0)
    #perform_fusion(1.0, 0)
    #perform_fusion(0, 1.0)
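The beam bookkeeping in perform_fusion relies on a small numpy idiom: the fused log-probabilities are flattened to a single [beam_size * voc_size] vector, argsort on the negated scores picks the beam_size best continuations, and modulo / integer division recover the emitted word and the parent beam. Below is a minimal, self-contained sketch of that idiom with made-up sizes and random scores (the names are illustrative only, not part of the example above).

import numpy as np

beam_size, voc_size = 3, 7                         # illustrative sizes only
rng = np.random.RandomState(0)

past_logprobs = rng.randn(1, beam_size)            # cumulative score per beam
step_logprobs = rng.randn(1, beam_size, voc_size)  # fused log-probs for this step

# total score of every (beam, word) continuation
total = step_logprobs + np.expand_dims(past_logprobs, axis=2)
flat = np.reshape(total, [1, beam_size * voc_size])

# top-k via argsort on the negated scores (descending order)
topk_indices = np.argsort(-flat, axis=1)[0, :beam_size]
past_logprobs = flat[:, topk_indices]              # new cumulative scores, shape [1, beam_size]

symbols = topk_indices % voc_size                  # word chosen by each new beam
parent_refs = topk_indices // voc_size             # beam that each word extends
print(symbols, parent_refs)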
Example #4
def beamsearch_exe_test(sess,
                        data,
                        batch_size,
                        v2i,
                        i2v,
                        hf,
                        obj_hfs,
                        feature_shape,
                        predict_words,
                        input_video,
                        input_object,
                        input_captions,
                        y,
                        finished_beam,
                        logprobs_finished_beams,
                        past_symbols,
                        capl=16,
                        obj_file=None,
                        obj_num=5):
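    """Run the in-graph beam search over `data`, convert the finished beams
    back to sentences with i2v, and return {'val_predictions': [...]} for
    evaluation."""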

    caption_output = []
    total_data = len(data)
    num_batch = int(round(total_data * 1.0 / batch_size))
    #num_batch = 5
    for batch_idx in xrange(num_batch):
        batch_caption = data[batch_idx *
                             batch_size:min((batch_idx + 1) *
                                            batch_size, total_data)]

        if args.step:
            #data_v = SeqVladDataUtil.getBatchVideoFeature(batch_caption,hf,(timesteps_v,feature_shape[1],7,7))
            #data_v = data_v[:,0::4]
            assert (1 == 0)
        else:
            data_v, data_obj = SeqVladDataUtil.getBatchVideoObjectFeature2(
                batch_caption, hf, obj_hfs, obj_num, feature_shape, obj_file)
            assert (data_obj.shape[1] == obj_num and len(data_obj.shape) == 6)
            assert (data_v.shape[1] == data_obj.shape[2])
        data_c, data_y = SeqVladDataUtil.getBatchTestCaptionWithSparseLabel(
            batch_caption, v2i, capl=capl)
        [fb, lfb, ps] = sess.run(
            [finished_beam, logprobs_finished_beams, past_symbols],
            feed_dict={
                input_video: data_v,
                input_object: data_obj,
                input_captions: data_c,
                y: data_y
            })

        generated_captions = SeqVladDataUtil.convertCaptionI2V(
            batch_caption, fb, i2v)

        for idx, sen in enumerate(generated_captions):
            print('%s : %s' % (batch_caption[idx].keys()[0], sen))
            caption_output.append({
                'image_id': batch_caption[idx].keys()[0],
                'caption': sen
            })

    js = {}
    js['val_predictions'] = caption_output

    return js